1
0

zurllib.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. #
  2. # zurllib.py
  3. #
  4. # This is (hopefully) a drop-in for urllib which will request gzip/deflate
  5. # compression and then decompress the output if a compressed response is
  6. # received while maintaining the API.
  7. #
  8. # by Robert Stone 2/22/2003
  9. # extended by Matt Chisholm
  10. # tracker announce --bind support added by Jeremy Evans 11/2005
import sys
import threading
import thread
from BitTorrent import PeerID
# build the User-Agent / peer-id string once, then drop the module reference
user_agent = PeerID.make_id()
del PeerID
import urllib2
# Monkey-patch urllib2 so every OpenerDirector created from here on
# announces our User-agent instead of Python's default.
OldOpenerDirector = urllib2.OpenerDirector
class MyOpenerDirector(OldOpenerDirector):
    def __init__(self):
        OldOpenerDirector.__init__(self)
        # replace the default header list entirely with our user agent
        self.addheaders = [('User-agent', user_agent)]
urllib2.OpenerDirector = MyOpenerDirector
del urllib2
from httplib import HTTPConnection, HTTP
from urllib import *
from urllib2 import *
from gzip import GzipFile
from StringIO import StringIO
import pprint

DEBUG = False

# timeout in seconds applied to every tracker HTTP socket (see
# BindingHTTPConnection.connect)
url_socket_timeout = 30

# local address to bind() outbound tracker connections to; set via
# bind_tracker_connection(), None means let the OS choose
http_bindaddr = None
  34. # ow ow ow.
  35. # this is here so we can track open http connections in our pending
  36. # connection count. we have to buffer because maybe urllib connections
  37. # start before rawserver does - hopefully not more than 10 of them!
  38. #
  39. # this can all go away when we use a reasonable http client library
  40. # and the connections are managed inside rawserver
  41. class PreRawServerBuffer(object):
  42. def __init__(self):
  43. self.pending_sockets = {}
  44. self.pending_sockets_lock = threading.RLock()
  45. def add_pending_connection(self, addr):
  46. # the XP connection rate limiting is unique at the IP level
  47. assert isinstance(addr, str)
  48. self.pending_sockets_lock.acquire()
  49. self.pending_sockets.setdefault(addr, 0)
  50. self.pending_sockets[addr] += 1
  51. self.pending_sockets_lock.release()
  52. def remove_pending_connection(self, addr):
  53. self.pending_sockets_lock.acquire()
  54. self.pending_sockets[addr] -= 1
  55. if self.pending_sockets[addr] <= 0:
  56. del self.pending_sockets[addr]
  57. self.pending_sockets_lock.release()
  58. rawserver = PreRawServerBuffer()
  59. def bind_tracker_connection(bindaddr):
  60. global http_bindaddr
  61. http_bindaddr = bindaddr
  62. def set_zurllib_rawserver(new_rawserver):
  63. global rawserver
  64. old_rawserver = rawserver
  65. rawserver = new_rawserver
  66. while old_rawserver.pending_sockets:
  67. addr = old_rawserver.pending_sockets.keys()[0]
  68. new_rawserver.add_pending_connection(addr)
  69. old_rawserver.remove_pending_connection(addr)
  70. assert len(old_rawserver.pending_sockets) == 0
  71. unsafe_threads = []
  72. def add_unsafe_thread():
  73. global unsafe_threads
  74. unsafe_threads.append(thread.get_ident())
class BindingHTTPConnection(HTTPConnection):
    # HTTPConnection that honors http_bindaddr, applies url_socket_timeout,
    # and keeps rawserver's pending-connection accounting balanced.
    def connect(self):
        """Connect to the host and port specified in __init__."""
        ident = thread.get_ident()
        # never, ever, ever call urlopen from any of these threads
        assert ident not in unsafe_threads, "You may not use urllib from this thread!"
        # fallback error text in case getaddrinfo yields no usable address
        msg = "getaddrinfo returns an empty list"
        for res in socket.getaddrinfo(self.host, self.port, 0,
                                      socket.SOCK_STREAM):
            af, socktype, proto, canonname, sa = res
            addr = sa[0]
            # the obvious multithreading problem is avoided by using locks.
            # the lock is only acquired during the function call, so there's
            # no danger of urllib blocking rawserver.
            rawserver.add_pending_connection(addr)
            try:
                self.sock = socket.socket(af, socktype, proto)
                self.sock.settimeout(url_socket_timeout)
                if http_bindaddr:
                    # bind to the configured local address; port 0 = any
                    self.sock.bind((http_bindaddr, 0))
                if self.debuglevel > 0:
                    print "connect: (%s, %s)" % (self.host, self.port)
                self.sock.connect(sa)
            except socket.error, msg:
                # connection to this address failed; try the next one
                if self.debuglevel > 0:
                    print 'connect fail:', (self.host, self.port)
                if self.sock:
                    self.sock.close()
                self.sock = None
            # always balance the add_pending_connection above,
            # whether the connect succeeded or not
            rawserver.remove_pending_connection(addr)
            if self.sock:
                break
        if not self.sock:
            # every candidate address failed: re-raise the last error
            raise socket.error, msg
class BindingHTTP(HTTP):
    # legacy httplib.HTTP shim that hands urllib2 our binding connection
    _connection_class = BindingHTTPConnection

# On Python 2.4+, use the HTTPConnection subclass directly instead of
# the old HTTP shim (the class handed to do_open below).
if sys.version_info >= (2,4):
    BindingHTTP = BindingHTTPConnection
class HTTPContentEncodingHandler(HTTPHandler):
    """Inherit and add gzip/deflate/etc support to HTTP gets."""
    def http_open(self, req):
        # add the Accept-Encoding header to the request
        # support gzip encoding (identity is assumed)
        req.add_header("Accept-Encoding","gzip")
        if DEBUG:
            print "Sending:"
            print req.headers
            print "\n"
        # use our binding connection class so http_bindaddr and the
        # pending-connection accounting are honored
        fp = self.do_open(BindingHTTP, req)
        headers = fp.headers
        if DEBUG:
            pprint.pprint(headers.dict)
        url = fp.url
        # wrap the response so a gzip body is transparently decompressed
        resp = addinfourldecompress(fp, headers, url)
        # propagate HTTP status/msg when the underlying response has them
        if hasattr(fp, 'code'):
            resp.code = fp.code
        if hasattr(fp, 'msg'):
            resp.msg = fp.msg
        return resp
  134. class addinfourldecompress(addinfourl):
  135. """Do gzip decompression if necessary. Do addinfourl stuff too."""
  136. def __init__(self, fp, headers, url):
  137. # we need to do something more sophisticated here to deal with
  138. # multiple values? What about other weird crap like q-values?
  139. # basically this only works for the most simplistic case and will
  140. # break in some other cases, but for now we only care about making
  141. # this work with the BT tracker so....
  142. if headers.has_key('content-encoding') and headers['content-encoding'] == 'gzip':
  143. if DEBUG:
  144. print "Contents of Content-encoding: " + headers['Content-encoding'] + "\n"
  145. self.gzip = 1
  146. self.rawfp = fp
  147. fp = GzipStream(fp)
  148. else:
  149. self.gzip = 0
  150. return addinfourl.__init__(self, fp, headers, url)
  151. def close(self):
  152. self.fp.close()
  153. if self.gzip:
  154. self.rawfp.close()
  155. def iscompressed(self):
  156. return self.gzip
  157. class GzipStream(StringIO):
  158. """Magically decompress a file object.
  159. This is not the most efficient way to do this but GzipFile() wants
  160. to seek, etc, which won't work for a stream such as that from a socket.
  161. So we copy the whole shebang info a StringIO object, decompress that
  162. then let people access the decompressed output as a StringIO object.
  163. The disadvantage is memory use and the advantage is random access.
  164. Will mess with fixing this later.
  165. """
  166. def __init__(self,fp):
  167. self.fp = fp
  168. # this is nasty and needs to be fixed at some point
  169. # copy everything into a StringIO (compressed)
  170. compressed = StringIO()
  171. r = fp.read()
  172. while r:
  173. compressed.write(r)
  174. r = fp.read()
  175. # now, unzip (gz) the StringIO to a string
  176. compressed.seek(0,0)
  177. gz = GzipFile(fileobj = compressed)
  178. str = ''
  179. r = gz.read()
  180. while r:
  181. str += r
  182. r = gz.read()
  183. # close our utility files
  184. compressed.close()
  185. gz.close()
  186. # init our stringio selves with the string
  187. StringIO.__init__(self, str)
  188. del str
  189. def close(self):
  190. self.fp.close()
  191. return StringIO.close(self)
  192. def test():
  193. """Test this module.
  194. At the moment this is lame.
  195. """
  196. print "Running unit tests.\n"
  197. def printcomp(fp):
  198. try:
  199. if fp.iscompressed():
  200. print "GET was compressed.\n"
  201. else:
  202. print "GET was uncompressed.\n"
  203. except:
  204. print "no iscompressed function! this shouldn't happen"
  205. print "Trying to GET a compressed document...\n"
  206. #fp = urlopen('http://a.scarywater.net/hng/index.shtml')
  207. fp = urlopen('http://hotornot.com')
  208. print len(fp.read())
  209. printcomp(fp)
  210. fp.close()
  211. print "Trying to GET a compressed document...\n"
  212. fp = urlopen('http://bittorrent.com')
  213. print len(fp.read())
  214. printcomp(fp)
  215. fp.close()
  216. print "Trying to GET an unknown document...\n"
  217. fp = urlopen('http://www.otaku.org/')
  218. print len(fp.read())
  219. printcomp(fp)
  220. fp.close()
  221. #
  222. # Install the HTTPContentEncodingHandler that we've defined above.
  223. #
  224. install_opener(build_opener(HTTPContentEncodingHandler, ProxyHandler({})))
  225. if __name__ == '__main__':
  226. test()