diff --git a/src/calibre/utils/logging.py b/src/calibre/utils/logging.py index ac6e9bfcbd..c557ee6790 100644 --- a/src/calibre/utils/logging.py +++ b/src/calibre/utils/logging.py @@ -177,6 +177,17 @@ class ThreadSafeLog(Log): with self._lock: Log.prints(self, *args, **kwargs) +class ThreadSafeWrapper(Log): + + def __init__(self, other_log): + Log.__init__(self, level=other_log.filter_level) + self.outputs = list(other_log.outputs) + self._lock = RLock() + + def prints(self, *args, **kwargs): + with self._lock: + Log.prints(self, *args, **kwargs) + class GUILog(ThreadSafeLog): ''' diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py index d07930bb02..47d9bf95a7 100644 --- a/src/calibre/web/feeds/news.py +++ b/src/calibre/web/feeds/news.py @@ -29,6 +29,7 @@ from calibre.ptempfile import PersistentTemporaryFile from calibre.utils.date import now as nowf from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image from calibre.utils.localization import canonicalize_lang +from calibre.utils.logging import ThreadSafeWrapper class LoginFailed(ValueError): pass @@ -841,7 +842,7 @@ class BasicNewsRecipe(Recipe): :param parser: Command line option parser. Used to intelligently merge options. :param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional. 
''' - self.log = log + self.log = ThreadSafeWrapper(log) if not isinstance(self.title, unicode): self.title = unicode(self.title, 'utf-8', 'replace') diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index f38ae73dab..8b8d18908a 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -156,7 +156,8 @@ class RecursiveFetcher(object): def get_soup(self, src, url=None): nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) nmassage.extend(self.preprocess_regexps) - nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')] # Some websites have buggy doctype declarations that mess up beautifulsoup + # Some websites have buggy doctype declarations that mess up beautifulsoup + nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')] # Remove comments as they can leave detritus when extracting tags leaves # multiple nested comments nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')) @@ -207,6 +208,7 @@ class RecursiveFetcher(object): def fetch_url(self, url): data = None self.log.debug('Fetching', url) + st = time.time() # Check for a URL pointing to the local filesystem and special case it # for efficiency and robustness. 
Bypasses delay checking as it does not @@ -225,6 +227,7 @@ class RecursiveFetcher(object): data = response(f.read()) data.newurl = 'file:'+url # This is what mechanize does for # local URLs + self.log.debug('Fetched %s in %f seconds' % (url, time.time() - st)) return data delta = time.time() - self.last_fetch_at @@ -260,11 +263,11 @@ class RecursiveFetcher(object): raise err finally: self.last_fetch_at = time.time() + self.log.debug('Fetched %s in %f seconds' % (url, time.time() - st)) return data def start_fetch(self, url): soup = BeautifulSoup(u'<a href="'+url+'" />') - self.log.debug('Downloading') res = self.process_links(soup, url, 0, into_dir='') self.log.debug(url, 'saved to', res) return res @@ -526,7 +529,9 @@ class RecursiveFetcher(object): else: dsrc = xml_to_unicode(dsrc, self.verbose)[0] + st = time.time() soup = self.get_soup(dsrc, url=iurl) + self.log.debug('Parsed %s in %f seconds' % (iurl, time.time() - st)) base = soup.find('base', href=True) if base is not None: