Ruby/EventMachineには2つのHTTPクライアントが付属しています.

どちらも Transfer-Encoding: chunked を正しく扱えないなどの問題を抱えています.

EventMachine::Protocols::HttpClient を手元で使うぶんには問題ない感じにしたのでパッチを公開します. ライセンスは「LicenseIssues - Ruby / EventMachine - Trac」に記載されているEventMachine本体のライセンスのままとします.

いまEventMachineとCoro+AnyEvent::HTTPのどちらでクローラを作るかテスト中です. いまのところEventMachineのほうがいい感じですが, 単に私がCoroなどに慣れていないせいかもしれません.

--- /var/lib/gems/1.8/gems/eventmachine-0.12.10/lib/em/protocols/httpclient.rb	2009-11-20 14:40:58.000000000 +0900
+++ lib/httpclient3.rb	2009-12-10 17:59:59.000000000 +0900
@@ -57,7 +57,7 @@
     # Handle status codes like 304, 100, etc.
     # Refactor this code so that protocol errors all get handled one way (an exception?),
     # instead of sprinkling set_deferred_status :failed calls everywhere.
-    class HttpClient < Connection
+    class HttpClient3 < Connection
       include EventMachine::Deferrable
 
       MaxPostContentLength = 20 * 1024 * 1024
@@ -127,7 +127,8 @@
         # TODO: We ASSUME the caller wants to send a 1.1 request. May not be a good assumption.
         req = [
           "#{verb} #{request}#{qs} HTTP/#{version}",
-          "Host: #{host}:#{port}",
+          #"Host: #{host}:#{port}",
+          "Host: #{host}",
           "User-agent: Ruby EventMachine",
         ]
 
@@ -159,6 +160,8 @@
 
 
       def receive_data data
+        data = @data + data
+        @data = ''
         while data and data.length > 0
           case @read_state
           when :base
@@ -167,6 +170,7 @@
             @headers = []
             @content_length = nil # not zero
             @content = ""
+            @chunked=false
             @status = nil
             @read_state = :header
             @connection_close = nil
@@ -175,7 +179,7 @@
             if ary.length == 2
               data = ary.last
               if ary.first == ""
-                if (@content_length and @content_length > 0) || @connection_close
+                if (@content_length and @content_length > 0) || @connection_close || @chunked
                   @read_state = :content
                 else
                   dispatch_response
@@ -195,6 +199,8 @@
                   @content_length ||= $'.to_i
                 elsif ary.first =~ /\Aconnection:\s*close/i
                   @connection_close = true
+                elsif ary.first =~ /\ATransfer-Encoding:\s*chunked/i
+                  @chunked = true
                 end
               end
             else
@@ -216,12 +222,36 @@
                 @read_state = :base
               end
             else
+              if @chunked 
+                index = data.index("\r\n")
+                unless index
+                  @data = data
+                  data = ''
+                  break
+                end
+                chunk_length = data[0..index].hex
+                if chunk_length == 0
+                  data = data[(index+4)..-1]
+                  dispatch_response
+                  @read_state = :base
+                else
+                  data_start = index + 2
+                  if data.length < (data_start + chunk_length + 2)
+                    @data = data
+                    data = ''
+                  else
+                    @content << data[data_start, chunk_length]
+                    data = data[(data_start + chunk_length + 2)..-1]
+                  end
+                end
+              else
               @content << data
               data = ""
             end
           end
         end
       end
+      end
 
 
       # We get called here when we have received an HTTP response line.
@@ -253,7 +283,7 @@
       def unbind
         if !@connected
           set_deferred_status :failed, {:status => 0} # YECCCCH. Find a better way to signal no-connect/network error.
-        elsif (@read_state == :content and @content_length == nil)
+        elsif (@read_state == :content and @content_length == nil) && !@chunked
           dispatch_response
         end
       end