# HTTPDownloader.py (6.0 KB)
  1. from cStringIO import StringIO
  2. from gzip import GzipFile
  3. from twisted.web.http import parseContentRange # package management sucks! if you have trouble with this line, stop using it!
  4. from twisted.web import error
  5. from twisted.web import client
  6. from twisted.python import failure
  7. from BTL.reactor_magic import reactor
  8. class ProgressHTTPDownloader(client.HTTPDownloader):
  9. def __init__(self, url, file, progressCallback, *a, **kw):
  10. client.HTTPDownloader.__init__(self, url, file, *a, **kw)
  11. self.progressCallback = progressCallback
  12. self.written = 0
  13. def gotHeaders(self, headers):
  14. self.response_headers = headers
  15. client.HTTPDownloader.gotHeaders(self, headers)
  16. self.contentLength = headers.get("content-length", None)
  17. if self.contentLength is not None:
  18. self.contentLength = int(self.contentLength[0])
  19. def pagePart(self, data):
  20. if not self.file:
  21. return
  22. try:
  23. self.file.write(data)
  24. self.written += len(data)
  25. if self.progressCallback:
  26. self.progressCallback(self.written, self.contentLength)
  27. except IOError:
  28. #raise
  29. self.file = None
  30. self.deferred.errback(failure.Failure())
  31. class HTTPPageUnGzip(client.HTTPPageGetter):
  32. decode = False
  33. # there are a lot of broken trackers out there...
  34. delimiter = '\n'
  35. def handleHeader(self, key, value):
  36. if not self.decode:
  37. if key.lower() == 'content-encoding' and value.lower() == 'gzip':
  38. self.decode = True
  39. return client.HTTPPageGetter.handleHeader(self, key, value)
  40. def handleResponse(self, response):
  41. if self.quietLoss:
  42. return
  43. if self.failed:
  44. self.factory.noPage(
  45. failure.Failure(
  46. error.Error(
  47. self.status, self.message, response)))
  48. elif self.length != None and self.length != 0:
  49. self.factory.noPage(failure.Failure(
  50. client.PartialDownloadError(self.status, self.message, response)))
  51. else:
  52. if self.decode:
  53. s = StringIO()
  54. s.write(response)
  55. s.seek(-1)
  56. g = GzipFile(fileobj=s, mode='rb')
  57. try:
  58. response = g.read()
  59. except IOError:
  60. self.factory.noPage(failure.Failure(
  61. client.PartialDownloadError(self.status, self.message, response)))
  62. self.transport.loseConnection()
  63. return
  64. g.close()
  65. self.factory.page(response)
  66. # server might be stupid and not close connection.
  67. self.transport.loseConnection()
  68. def lineReceived(self, line):
  69. return client.HTTPPageGetter.lineReceived(self, line.rstrip('\r'))
  70. class HTTPProxyUnGzipClientFactory(client.HTTPClientFactory):
  71. protocol = HTTPPageUnGzip
  72. def __init__(self, url, method='GET', postdata=None, headers=None,
  73. agent="Twisted PageGetter", timeout=0, cookies=None,
  74. followRedirect=1, proxy=None):
  75. if headers is None:
  76. headers = {}
  77. headers['Accept-encoding'] = 'gzip'
  78. self.proxy = proxy
  79. client.HTTPClientFactory.__init__(self, url, method=method,
  80. postdata=postdata, headers=headers,
  81. agent=agent, timeout=timeout,
  82. cookies=cookies,
  83. followRedirect=followRedirect)
  84. def setURL(self, url):
  85. client.HTTPClientFactory.setURL(self, url)
  86. if self.proxy:
  87. self.path = "%s://%s:%s%s" % (self.scheme,
  88. self.host,
  89. self.port,
  90. self.path)
  91. def getPageFactory(url,
  92. agent="BitTorrent client",
  93. bindAddress=None,
  94. contextFactory=None,
  95. proxy=None,
  96. timeout=120):
  97. """Download a web page as a string.
  98. Download a page. Return a deferred, which will callback with a
  99. page (as a string) or errback with a description of the error.
  100. See HTTPClientFactory to see what extra args can be passed.
  101. """
  102. scheme, host, port, path = client._parse(url)
  103. if proxy:
  104. host, port = proxy.split(':')
  105. port = int(port)
  106. factory = HTTPProxyUnGzipClientFactory(url, agent=agent, proxy=proxy)
  107. if scheme == 'https':
  108. from twisted.internet import ssl
  109. if contextFactory is None:
  110. contextFactory = ssl.ClientContextFactory()
  111. reactor.connectSSL(host, port, factory, contextFactory,
  112. bindAddress=bindAddress,
  113. timeout=timeout)
  114. else:
  115. reactor.connectTCP(host, port, factory,
  116. bindAddress=bindAddress,
  117. timeout=timeout)
  118. return factory
  119. def downloadPageFactory(url, file, progressCallback=None,
  120. agent="BitTorrent client",
  121. bindAddress=None,
  122. contextFactory=None):
  123. """Download a web page to a file.
  124. @param file: path to file on filesystem, or file-like object.
  125. """
  126. scheme, host, port, path = client._parse(url)
  127. factory = ProgressHTTPDownloader(url, file,
  128. progressCallback=progressCallback,
  129. agent=agent,
  130. supportPartial=0)
  131. if scheme == 'https':
  132. from twisted.internet import ssl
  133. if contextFactory is None:
  134. contextFactory = ssl.ClientContextFactory()
  135. reactor.connectSSL(host, port, factory, contextFactory,
  136. bindAddress=bindAddress)
  137. else:
  138. reactor.connectTCP(host, port, factory,
  139. bindAddress=bindAddress)
  140. return factory