Ruby/EventMachineには2つのHTTPクライアントが付属しています.
どちらも Transfer-Encoding: chunked を正しく扱えないなどの問題を抱えています.
EventMachine::Protocols::HttpClient を手元で使うぶんには問題ない程度に修正したので, パッチを公開します. ライセンスは「LicenseIssues - Ruby / EventMachine - Trac」に記載されているもののままとします.
いまEventMachineとCoro+AnyEvent::HTTPのどちらでクローラを作るかテスト中です. いまのところEventMachineのほうがいい感じですが, 単に私がCoroなどに慣れていないせいかもしれません.
--- /var/lib/gems/1.8/gems/eventmachine-0.12.10/lib/em/protocols/httpclient.rb 2009-11-20 14:40:58.000000000 +0900 +++ lib/httpclient3.rb 2009-12-10 17:59:59.000000000 +0900 @@ -57,7 +57,7 @@ # Handle status codes like 304, 100, etc. # Refactor this code so that protocol errors all get handled one way (an exception?), # instead of sprinkling set_deferred_status :failed calls everywhere. - class HttpClient < Connection + class HttpClient3 < Connection include EventMachine::Deferrable MaxPostContentLength = 20 * 1024 * 1024 @@ -127,7 +127,8 @@ # TODO: We ASSUME the caller wants to send a 1.1 request. May not be a good assumption. req = [ "#{verb} #{request}#{qs} HTTP/#{version}", - "Host: #{host}:#{port}", + #"Host: #{host}:#{port}", + "Host: #{host}", "User-agent: Ruby EventMachine", ] @@ -159,6 +160,8 @@ def receive_data data + data = @data + data + @data = '' while data and data.length > 0 case @read_state when :base @@ -167,6 +170,7 @@ @headers = [] @content_length = nil # not zero @content = "" + @chunked=false @status = nil @read_state = :header @connection_close = nil @@ -175,7 +179,7 @@ if ary.length == 2 data = ary.last if ary.first == "" - if (@content_length and @content_length > 0) || @connection_close + if (@content_length and @content_length > 0) || @connection_close || @chunked @read_state = :content else dispatch_response @@ -195,6 +199,8 @@ @content_length ||= $'.to_i elsif ary.first =~ /\Aconnection:\s*close/i @connection_close = true + elsif ary.first =~ /\ATransfer-Encoding:\s*chunked/i + @chunked = true end end else @@ -216,12 +222,36 @@ @read_state = :base end else + if @chunked + index = data.index("\r\n") + unless index + @data = data + data = '' + break + end + chunk_length = data[0..index].hex + if chunk_length == 0 + data = data[(index+4)..-1] + dispatch_response + @read_state = :base + else + data_start = index + 2 + if data.length < (data_start + chunk_length + 2) + @data = data + data = '' + else + @content << data[data_start, 
chunk_length] + data = data[(data_start + chunk_length + 2)..-1] + end + end + else @content << data data = "" end end end end + end # We get called here when we have received an HTTP response line. @@ -253,7 +283,7 @@ def unbind if !@connected set_deferred_status :failed, {:status => 0} # YECCCCH. Find a better way to signal no-connect/network error. - elsif (@read_state == :content and @content_length == nil) + elsif (@read_state == :content and @content_length == nil) && !@chunked dispatch_response end end