From 802e8caef6aa2bce8bbc063343e346776e91f171 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 10 Dec 2009 19:42:47 -0700 Subject: [PATCH] Speed up news download by implementing multi-threaded downloading --- Changelog.yaml | 7 +++++++ src/calibre/web/fetch/simple.py | 36 +++++++++++++++------------------ 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/Changelog.yaml b/Changelog.yaml index 62bbc38dce..1b5dcb7b69 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -24,6 +24,12 @@ news sources. type: major + - title: "Speed up download of news" + description: > + Speed up download of news by allowing the download to happen in multiple threads + (default 5). This may break some recipes, so please report any breakage you notice. + type: major + - title: "FB2 Output: Support creation of TOC from

tags" - title: "E-book viewer: Make keyboard shortcuts customizable" @@ -95,6 +101,7 @@ - Soldiers Magazine - The Economist - Arizona Daily Star + - ESPN - version: 0.6.26 diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index 95a0e78afe..4957e81b14 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -9,7 +9,6 @@ UTF-8 encoding with any charset declarations removed. ''' import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback from urllib import url2pathname, quote -from threading import RLock from httplib import responses from PIL import Image from cStringIO import StringIO @@ -39,7 +38,6 @@ class closing(object): except Exception: pass -_browser_lock = RLock() bad_url_counter = 0 def basename(url): @@ -125,7 +123,6 @@ class RecursiveFetcher(object): self.imagemap_lock = threading.RLock() self.stylemap = css_map self.image_url_processor = None - self.browser_lock = _browser_lock self.stylemap_lock = threading.RLock() self.downloaded_paths = [] self.current_dir = self.base_dir @@ -196,26 +193,25 @@ class RecursiveFetcher(object): for i in range(2, 6): purl[i] = quote(purl[i]) url = urlparse.urlunparse(purl) - with self.browser_lock: - try: + try: + with closing(self.browser.open_novisit(url, timeout=self.timeout)) as f: + data = response(f.read()+f.read()) + data.newurl = f.geturl() + except urllib2.URLError, err: + if hasattr(err, 'code') and responses.has_key(err.code): + raise FetchError, responses[err.code] + if getattr(err, 'reason', [0])[0] == 104 or \ + getattr(getattr(err, 'args', [None])[0], 'errno', None) == -2: # Connection reset by peer or Name or service not know + self.log.debug('Temporary error, retrying in 1 second') + time.sleep(1) with closing(self.browser.open(url, timeout=self.timeout)) as f: data = response(f.read()+f.read()) data.newurl = f.geturl() - except urllib2.URLError, err: - if hasattr(err, 'code') and responses.has_key(err.code): - raise FetchError, 
responses[err.code] - if getattr(err, 'reason', [0])[0] == 104 or \ - getattr(getattr(err, 'args', [None])[0], 'errno', None) == -2: # Connection reset by peer or Name or service not know - self.log.debug('Temporary error, retrying in 1 second') - time.sleep(1) - with closing(self.browser.open(url, timeout=self.timeout)) as f: - data = response(f.read()+f.read()) - data.newurl = f.geturl() - else: - raise err - finally: - self.last_fetch_at = time.time() - return data + else: + raise err + finally: + self.last_fetch_at = time.time() + return data def start_fetch(self, url):