http - Erlang の HTTP クローラー

Question

私は単純な HTTP クローラーでコーディングしていますが、下部のコードの実行に問題があります。50 個の URL を要求していますが、20 個以上のコンテンツが返されます。クローラーをテストするために、それぞれ 150kB のサイズのいくつかのファイルを生成しました。それで、20以上の応答は帯域幅によって制限されていると思いますか? BUT: 最後のファイルがフェッチされるまで、Erlang スニペットが終了しないようにするにはどうすればよいでしょうか? テストデータサーバーはオンラインなので、コードを試してみてください。ヒントは大歓迎です:)

-module(crawler).
-define(BASE_URL, "http://46.4.117.69/").
-export([start/0, send_reqs/0, do_send_req/1]).

start() ->
  ibrowse:start(),
  proc_lib:spawn(?MODULE, send_reqs, []).

to_url(Id) ->
  ?BASE_URL ++ integer_to_list(Id).

fetch_ids() ->
  lists:seq(1, 50).

send_reqs() ->
  spawn_workers(fetch_ids()).

spawn_workers(Ids) ->
  lists:foreach(fun do_spawn/1, Ids).

do_spawn(Id) ->
  proc_lib:spawn_link(?MODULE, do_send_req, [Id]).

do_send_req(Id) ->
  io:format("Requesting ID ~p ... ~n", [Id]),
  Result = (catch ibrowse:send_req(to_url(Id), [], get, [], [], 10000)),
  case Result of
{ok, Status, _H, B} ->
  io:format("OK -- ID: ~2..0w -- Status: ~p -- Content length: ~p~n", [Id, Status, length(B)]);
Err ->
  io:format("ERROR -- ID: ~p -- Error: ~p~n", [Id, Err])
  end.

それが出力です：

Requesting ID 1 ... 
Requesting ID 2 ... 
Requesting ID 3 ... 
Requesting ID 4 ... 
Requesting ID 5 ... 
Requesting ID 6 ... 
Requesting ID 7 ... 
Requesting ID 8 ... 
Requesting ID 9 ... 
Requesting ID 10 ... 
Requesting ID 11 ... 
Requesting ID 12 ... 
Requesting ID 13 ... 
Requesting ID 14 ... 
Requesting ID 15 ... 
Requesting ID 16 ... 
Requesting ID 17 ... 
Requesting ID 18 ... 
Requesting ID 19 ... 
Requesting ID 20 ... 
Requesting ID 21 ... 
Requesting ID 22 ... 
Requesting ID 23 ... 
Requesting ID 24 ... 
Requesting ID 25 ... 
Requesting ID 26 ... 
Requesting ID 27 ... 
Requesting ID 28 ... 
Requesting ID 29 ... 
Requesting ID 30 ... 
Requesting ID 31 ... 
Requesting ID 32 ... 
Requesting ID 33 ... 
Requesting ID 34 ... 
Requesting ID 35 ... 
Requesting ID 36 ... 
Requesting ID 37 ... 
Requesting ID 38 ... 
Requesting ID 39 ... 
Requesting ID 40 ... 
Requesting ID 41 ... 
Requesting ID 42 ... 
Requesting ID 43 ... 
Requesting ID 44 ... 
Requesting ID 45 ... 
Requesting ID 46 ... 
Requesting ID 47 ... 
Requesting ID 48 ... 
Requesting ID 49 ... 
Requesting ID 50 ... 
OK -- ID: 49 -- Status: "200" -- Content length: 150000
OK -- ID: 47 -- Status: "200" -- Content length: 150000
OK -- ID: 50 -- Status: "200" -- Content length: 150000
OK -- ID: 17 -- Status: "200" -- Content length: 150000
OK -- ID: 48 -- Status: "200" -- Content length: 150000
OK -- ID: 45 -- Status: "200" -- Content length: 150000
OK -- ID: 46 -- Status: "200" -- Content length: 150000
OK -- ID: 10 -- Status: "200" -- Content length: 150000
OK -- ID: 09 -- Status: "200" -- Content length: 150000
OK -- ID: 19 -- Status: "200" -- Content length: 150000
OK -- ID: 13 -- Status: "200" -- Content length: 150000
OK -- ID: 21 -- Status: "200" -- Content length: 150000
OK -- ID: 16 -- Status: "200" -- Content length: 150000
OK -- ID: 27 -- Status: "200" -- Content length: 150000
OK -- ID: 03 -- Status: "200" -- Content length: 150000
OK -- ID: 23 -- Status: "200" -- Content length: 150000
OK -- ID: 29 -- Status: "200" -- Content length: 150000
OK -- ID: 14 -- Status: "200" -- Content length: 150000
OK -- ID: 18 -- Status: "200" -- Content length: 150000
OK -- ID: 01 -- Status: "200" -- Content length: 150000
OK -- ID: 30 -- Status: "200" -- Content length: 150000
OK -- ID: 40 -- Status: "200" -- Content length: 150000
OK -- ID: 05 -- Status: "200" -- Content length: 150000

アップデート：

wait_workers に関するヒントを提供してくれた Stemm に感謝します。私はあなたと私のコードを組み合わせましたが、同じ動作です:(

-module(crawler).
-define(BASE_URL, "http://46.4.117.69/").
-export([start/0, send_reqs/0, do_send_req/2]).

start() ->
  ibrowse:start(),
  proc_lib:spawn(?MODULE, send_reqs, []).

to_url(Id) ->
  ?BASE_URL ++ integer_to_list(Id).

fetch_ids() ->
  lists:seq(1, 50).

send_reqs() ->
  spawn_workers(fetch_ids()).

spawn_workers(Ids) ->
  %% collect reference to each worker
  Refs = [ do_spawn(Id) || Id <- Ids ],
  %% wait for response from each worker
  wait_workers(Refs).

wait_workers(Refs) ->
  lists:foreach(fun receive_by_ref/1, Refs).

receive_by_ref(Ref) ->
  %% receive message only from worker with specific reference
  receive
{Ref, done} ->
    done
  end.

do_spawn(Id) ->
  Ref = make_ref(),
  proc_lib:spawn_link(?MODULE, do_send_req, [Id, {self(), Ref}]),
  Ref.

do_send_req(Id, {Pid, Ref}) ->
  io:format("Requesting ID ~p ... ~n", [Id]),
  Result = (catch ibrowse:send_req(to_url(Id), [], get, [], [], 10000)),
  case Result of
  {ok, Status, _H, B} ->
    io:format("OK -- ID: ~2..0w -- Status: ~p -- Content length: ~p~n", [Id, Status, length(B)]),
    %% send message that work is done
    Pid ! {Ref, done};
  Err ->
    io:format("ERROR -- ID: ~p -- Error: ~p~n", [Id, Err]),
    %% repeat request if there was error while fetching a page, 
    do_send_req(Id, {Pid, Ref})
    %% or - if you don't want to repeat request, put there:
    %% Pid ! {Ref, done}
end.

クローラーフォークを実行すると、少数のファイルに対して正常に分岐しますが、コードはファイル全体をフェッチしません (ファイルサイズはそれぞれ 150000 バイトです)。クローラーはいくつかのファイルを部分的にフェッチします。次の Web サーバーログを参照してください:(

82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /10 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /1 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /3 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /8 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /39 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /7 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /6 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /2 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /5 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /50 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /9 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /44 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /38 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /47 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /49 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /43 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /37 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /46 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /48 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /36 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /42 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /41 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /45 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /17 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /35 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /16 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /15 HTTP/1.1" 200 17020 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /21 HTTP/1.1" 200 120360 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /40 HTTP/1.1" 200 117600 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /34 HTTP/1.1" 200 60660 "-" "-"

どんなヒントでも大歓迎です。何が問題なのかわかりません:(

score 1 · Accepted Answer

それで、私があなたを正しく理解していれば、spawn_workers各ワーカーが停止するまで（そしてページをフェッチするまで）関数から制御を返したくありませんか？その場合、次のようにコードを変更できます。

spawn_workers(Ids) ->
  %% collect reference to each worker
  Refs = [ do_spawn(Id) || Id <- Ids ],
  %% wait for response from each worker
  wait_workers(Refs).

wait_workers(Refs) ->
  lists:foreach(fun receive_by_ref/1, Refs).

receive_by_ref(Ref) ->
  %% receive message only from worker with specific reference
  receive
    {Ref, done} ->
        done
  end.

do_spawn(Id) ->
  Ref = make_ref(),
  proc_lib:spawn_link(?MODULE, do_send_req, [Id, {self(), Ref}]),
  Ref.

do_send_req(Id, {Pid, Ref}) ->
  io:format("Requesting ID ~p ... ~n", [Id]),
  Result = (catch ibrowse:send_req(to_url(Id), [], get, [], [], 10000)),
  case Result of
      {ok, Status, _H, B} ->
        io:format("OK -- ID: ~2..0w -- Status: ~p -- Content length: ~p~n", [Id, Status, length(B)]),
        %% send message that work is done
        Pid ! {Ref, done};
      Err ->
        io:format("ERROR -- ID: ~p -- Error: ~p~n", [Id, Err]),
        %% repeat request if there was error while fetching a page, 
        do_send_req(Id, {Pid, Ref})
        %% or - if you don't want to repeat request, put there:
        %% Pid ! {Ref, done}
  end.

編集：

startすべてのワーカーがタスクを終了するのを待たずに、エントリポイント (関数) が制御を返すことに気付きました(そこを呼び出すためspawn)。そこでも待ちたい場合は、同様のトリックを実行してください。

start() ->
  ibrowse:start(),
  Ref = make_ref(),
  proc_lib:spawn(?MODULE, send_reqs, [self(), Ref]),
  receive_by_ref(Ref).

send_reqs(Pid, Ref) ->
  spawn_workers(fetch_ids()),
  Pid ! {Ref, done}.

score 1 · Accepted Answer

スーパーバイザと queue モジュールを組み合わせて使用できます。N 個のフェッチする子を生成し、各子はキューの 1 つのアイテムをフェッチして処理します。完了したら、親プロセスに通知して、キュー内の次のアイテムを続行します。そうすれば、同時リクエストの数に上限を設けることができます。
一度に 500 件のリクエストを生成すると、ibrowse が混乱する可能性があります。コンソールにエラーが表示されますか?
見ibrowse:get_config_value/1て、ibrowse:set_config_value/2

http - Erlang の HTTP クローラー

アップデート：

2 に答える 2

Related

Reference