Speed up news download by implementing multi-threaded downloading

Kovid Goyal 2009-12-10 19:42:47 -07:00
parent ea2dfd7ce2
commit 802e8caef6
2 changed files with 23 additions and 20 deletions


@@ -24,6 +24,12 @@
          news sources.
       type: major
+    - title: "Speed up download of news"
+      description: >
+          Speed up download of news by allowing the download to happen in multiple threads
+          (default 5). This may break some recipes, so please report any breakage you notice.
+      type: major
+
     - title: "FB2 Output: Support creation of TOC from <h1> tags"
     - title: "E-book viewer: Make keyboard shortcuts customizable"
@@ -95,6 +101,7 @@
     - Soldiers Magazine
     - The Economist
     - Arizona Daily Star
+    - ESPN
 
 - version: 0.6.26
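The changelog entry above describes the user-visible change; the scheduler that actually spawns the download threads lives in code not shown in this diff. A minimal sketch of the idea in the Python 2 idiom of the codebase, assuming a plain worker pool (download_all, worker and NUM_THREADS are illustrative names, not calibre's API):

import threading, urllib2
from Queue import Queue

NUM_THREADS = 5  # the default thread count mentioned in the entry above

def worker(jobs, results):
    # Pull URLs off the shared queue until the None sentinel arrives.
    while True:
        url = jobs.get()
        if url is None:
            break
        try:
            results.put((url, urllib2.urlopen(url, timeout=30).read()))
        except urllib2.URLError, err:
            results.put((url, err))

def download_all(urls):
    jobs, results = Queue(), Queue()
    threads = [threading.Thread(target=worker, args=(jobs, results))
               for _ in xrange(NUM_THREADS)]
    for t in threads:
        t.start()
    for url in urls:
        jobs.put(url)
    for _ in threads:
        jobs.put(None)  # one sentinel per worker so every thread exits
    for t in threads:
        t.join()
    return [results.get() for _ in urls]

Putting one None sentinel per worker on the queue lets every thread exit cleanly once the URL list is drained.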


@@ -9,7 +9,6 @@ UTF-8 encoding with any charset declarations removed.
 '''
 import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
 from urllib import url2pathname, quote
-from threading import RLock
 from httplib import responses
 from PIL import Image
 from cStringIO import StringIO
@@ -39,7 +38,6 @@ class closing(object):
         except Exception:
             pass
 
-_browser_lock = RLock()
 bad_url_counter = 0
 
 def basename(url):
@@ -125,7 +123,6 @@ class RecursiveFetcher(object):
         self.imagemap_lock = threading.RLock()
         self.stylemap = css_map
         self.image_url_processor = None
-        self.browser_lock = _browser_lock
         self.stylemap_lock = threading.RLock()
         self.downloaded_paths = []
         self.current_dir = self.base_dir
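The two hunks above remove the module-level _browser_lock and the browser_lock attribute that previously serialized every fetch through a single shared browser. For that to be thread-safe, each download thread presumably works with its own browser object; a sketch of that pattern using the mechanize library that calibre's browser is based on (thread_browser is a hypothetical helper, not part of this commit):

import threading
import mechanize

_local = threading.local()

def thread_browser():
    # One mechanize.Browser per thread: cookies and connection state are
    # then private to each worker, so no cross-thread lock is required.
    if getattr(_local, 'browser', None) is None:
        br = mechanize.Browser()
        br.set_handle_robots(False)
        _local.browser = br
    return _local.browser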
@@ -196,26 +193,25 @@ class RecursiveFetcher(object):
         for i in range(2, 6):
             purl[i] = quote(purl[i])
         url = urlparse.urlunparse(purl)
-        with self.browser_lock:
-            try:
-                try:
-                    with closing(self.browser.open_novisit(url, timeout=self.timeout)) as f:
-                        data = response(f.read()+f.read())
-                        data.newurl = f.geturl()
-                except urllib2.URLError, err:
-                    if hasattr(err, 'code') and responses.has_key(err.code):
-                        raise FetchError, responses[err.code]
-                    if getattr(err, 'reason', [0])[0] == 104 or \
-                        getattr(getattr(err, 'args', [None])[0], 'errno', None) == -2: # Connection reset by peer or Name or service not known
-                        self.log.debug('Temporary error, retrying in 1 second')
-                        time.sleep(1)
-                        with closing(self.browser.open(url, timeout=self.timeout)) as f:
-                            data = response(f.read()+f.read())
-                            data.newurl = f.geturl()
-                    else:
-                        raise err
-            finally:
-                self.last_fetch_at = time.time()
-            return data
+        try:
+            try:
+                with closing(self.browser.open_novisit(url, timeout=self.timeout)) as f:
+                    data = response(f.read()+f.read())
+                    data.newurl = f.geturl()
+            except urllib2.URLError, err:
+                if hasattr(err, 'code') and responses.has_key(err.code):
+                    raise FetchError, responses[err.code]
+                if getattr(err, 'reason', [0])[0] == 104 or \
+                    getattr(getattr(err, 'args', [None])[0], 'errno', None) == -2: # Connection reset by peer or Name or service not known
+                    self.log.debug('Temporary error, retrying in 1 second')
+                    time.sleep(1)
+                    with closing(self.browser.open(url, timeout=self.timeout)) as f:
+                        data = response(f.read()+f.read())
+                        data.newurl = f.geturl()
+                else:
+                    raise err
+        finally:
+            self.last_fetch_at = time.time()
+        return data
 
     def start_fetch(self, url):
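The retry behaviour itself is unchanged by this commit: one retry after a one-second pause, and only for errors that are usually transient. Restated outside the diff as a self-contained Python 2 sketch (open_url stands in for self.browser.open_novisit; 104 is ECONNRESET, -2 a temporary name-resolution failure):

import time, urllib2

def fetch_with_retry(open_url, url, timeout, log):
    # Retry once, but only when the failure looks transient rather than
    # a permanent HTTP error (those are re-raised for the caller).
    try:
        return open_url(url, timeout=timeout).read()
    except urllib2.URLError, err:
        errno = getattr(getattr(err, 'reason', None), 'errno', None)
        if errno in (104, -2):
            log('Temporary error, retrying in 1 second')
            time.sleep(1)
            return open_url(url, timeout=timeout).read()
        raise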