Speed up news download by implementing multi-threaded downloading

Kovid Goyal 2009-12-10 19:42:47 -07:00
parent ea2dfd7ce2
commit 802e8caef6
2 changed files with 23 additions and 20 deletions


@@ -24,6 +24,12 @@
           news sources.
       type: major
 
+    - title: "Speed up download of news"
+      description: >
+          Speed up download of news by allowing the download to happen in multiple threads
+          (default 5). This may break some recipes, so please report any breakage you notice.
+      type: major
+
     - title: "FB2 Output: Support creation of TOC from <h1> tags"
 
     - title: "E-book viewer: Make keyboard shortcuts customizable"
@@ -95,6 +101,7 @@
     - Soldiers Magazine
     - The Economist
     - Arizona Daily Star
+    - ESPN
 
 - version: 0.6.26
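The threading itself is not visible in the hunks of this commit; the diff below only removes the locking that used to serialize fetches, while the fan-out across worker threads lives elsewhere in the codebase. As a rough illustration of the behaviour the changelog entry describes, a fixed pool of threads draining a shared URL queue might look like the following minimal sketch (Python 2, to match the codebase; `urls` and `fetch_url` are placeholders, not calibre APIs):

    import threading, Queue

    def download_all(urls, fetch_url, nthreads=5):
        # Fill a shared queue with the work items.
        q = Queue.Queue()
        for url in urls:
            q.put(url)

        results, results_lock = {}, threading.Lock()

        def worker():
            while True:
                try:
                    url = q.get_nowait()
                except Queue.Empty:
                    return  # queue drained, thread exits
                try:
                    data = fetch_url(url)
                except Exception:
                    data = None  # a real implementation would log the failure
                with results_lock:
                    results[url] = data

        # Default of 5 threads, matching the changelog entry.
        threads = [threading.Thread(target=worker) for _ in range(nthreads)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        return results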


@@ -9,7 +9,6 @@ UTF-8 encoding with any charset declarations removed.
 '''
 import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
 from urllib import url2pathname, quote
-from threading import RLock
 from httplib import responses
 from PIL import Image
 from cStringIO import StringIO
@@ -39,7 +38,6 @@ class closing(object):
         except Exception:
             pass
 
-_browser_lock = RLock()
 bad_url_counter = 0
 
 def basename(url):
@@ -125,7 +123,6 @@ class RecursiveFetcher(object):
         self.imagemap_lock = threading.RLock()
         self.stylemap = css_map
         self.image_url_processor = None
-        self.browser_lock = _browser_lock
         self.stylemap_lock = threading.RLock()
         self.downloaded_paths = []
         self.current_dir = self.base_dir
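The three deletions above remove the module-level RLock that serialized every fetch through the single shared browser, which would have defeated the new multi-threaded download. One common way to make concurrent fetches safe without such a lock is to give each worker thread its own browser instance. The sketch below shows that pattern under the assumption that each thread can afford its own browser; `make_browser` is a stand-in, not a calibre API:

    import threading
    import urllib2

    _tls = threading.local()

    def make_browser():
        # Stand-in for a configured browser factory; calibre uses a
        # mechanize-based browser with cookie and user-agent handling.
        return urllib2.build_opener()

    def get_browser():
        # Lazily create one browser per thread; with no cross-thread
        # sharing, no lock is needed around fetches.
        browser = getattr(_tls, 'browser', None)
        if browser is None:
            browser = _tls.browser = make_browser()
        return browser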
@@ -196,26 +193,25 @@ class RecursiveFetcher(object):
         for i in range(2, 6):
             purl[i] = quote(purl[i])
         url = urlparse.urlunparse(purl)
-        with self.browser_lock:
-            try:
+        try:
+            with closing(self.browser.open_novisit(url, timeout=self.timeout)) as f:
+                data = response(f.read()+f.read())
+                data.newurl = f.geturl()
+        except urllib2.URLError, err:
+            if hasattr(err, 'code') and responses.has_key(err.code):
+                raise FetchError, responses[err.code]
+            if getattr(err, 'reason', [0])[0] == 104 or \
+                getattr(getattr(err, 'args', [None])[0], 'errno', None) == -2: # Connection reset by peer or Name or service not know
+                self.log.debug('Temporary error, retrying in 1 second')
+                time.sleep(1)
                 with closing(self.browser.open(url, timeout=self.timeout)) as f:
                     data = response(f.read()+f.read())
                     data.newurl = f.geturl()
-            except urllib2.URLError, err:
-                if hasattr(err, 'code') and responses.has_key(err.code):
-                    raise FetchError, responses[err.code]
-                if getattr(err, 'reason', [0])[0] == 104 or \
-                    getattr(getattr(err, 'args', [None])[0], 'errno', None) == -2: # Connection reset by peer or Name or service not know
-                    self.log.debug('Temporary error, retrying in 1 second')
-                    time.sleep(1)
-                    with closing(self.browser.open(url, timeout=self.timeout)) as f:
-                        data = response(f.read()+f.read())
-                        data.newurl = f.geturl()
-                else:
-                    raise err
-            finally:
-                self.last_fetch_at = time.time()
+            else:
+                raise err
+        finally:
+            self.last_fetch_at = time.time()
         return data
 
     def start_fetch(self, url):
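Beyond dropping the lock, the rewritten fetch method swaps browser.open for browser.open_novisit; in mechanize, open_novisit fetches a page without recording it in the browser's navigation history, which is presumably the point here: one less piece of shared state for concurrent threads to race on. The hunk also keeps the single retry for transient failures, errno 104 (connection reset by peer) and -2 (name or service not known). The same idea, pulled out as a standalone helper, might look like this hedged sketch (`fetch` is a placeholder callable, and the errno extraction is simplified relative to the original's reason/args juggling):

    import time
    import urllib2

    TRANSIENT_ERRNOS = (104, -2)  # ECONNRESET / name-or-service-not-known

    def fetch_with_retry(fetch, url, pause=1.0):
        # Try once; if the failure looks transient, wait briefly and try
        # exactly one more time, mirroring the retry logic above.
        try:
            return fetch(url)
        except urllib2.URLError, err:
            reason = getattr(err, 'reason', None)
            errno = getattr(reason, 'errno', None)
            if errno in TRANSIENT_ERRNOS:
                time.sleep(pause)
                return fetch(url)
            raise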