Improved logging for news downloads, log performance numbers

Kovid Goyal 2014-08-17 09:43:39 +05:30
parent 1027cf8a1e
commit cf73771c87
3 changed files with 20 additions and 3 deletions

src/calibre/utils/logging.py

@@ -177,6 +177,17 @@ class ThreadSafeLog(Log):
         with self._lock:
             Log.prints(self, *args, **kwargs)
 
+class ThreadSafeWrapper(Log):
+
+    def __init__(self, other_log):
+        Log.__init__(self, level=other_log.filter_level)
+        self.outputs = list(other_log.outputs)
+        self._lock = RLock()
+
+    def prints(self, *args, **kwargs):
+        with self._lock:
+            Log.prints(self, *args, **kwargs)
+
 class GUILog(ThreadSafeLog):
 
     '''

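The new ThreadSafeWrapper adopts the wrapped log's filter level, shares its output destinations, and serializes every write through a reentrant lock. A minimal standalone sketch of the same pattern; the bare-bones Log below is a stand-in for illustration, not calibre's actual Log class:

from threading import RLock, Thread

class Log(object):
    # Stand-in with just enough surface (filter_level, outputs, prints)
    # to demonstrate the wrapper pattern.
    DEBUG = 1

    def __init__(self, level=DEBUG):
        self.filter_level = level
        self.outputs = []

    def prints(self, *args, **kwargs):
        print(' '.join(str(a) for a in args))

class ThreadSafeWrapper(Log):

    def __init__(self, other_log):
        Log.__init__(self, level=other_log.filter_level)
        # Share the wrapped log's output destinations rather than copying them
        self.outputs = list(other_log.outputs)
        self._lock = RLock()

    def prints(self, *args, **kwargs):
        # The lock serializes concurrent writers so lines never interleave
        with self._lock:
            Log.prints(self, *args, **kwargs)

log = ThreadSafeWrapper(Log())
workers = [Thread(target=log.prints, args=('worker', i)) for i in range(4)]
for t in workers:
    t.start()
for t in workers:
    t.join()
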
src/calibre/web/feeds/news.py

@@ -29,6 +29,7 @@ from calibre.ptempfile import PersistentTemporaryFile
 from calibre.utils.date import now as nowf
 from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image
 from calibre.utils.localization import canonicalize_lang
+from calibre.utils.logging import ThreadSafeWrapper
 
 class LoginFailed(ValueError):
     pass
@@ -841,7 +842,7 @@ class BasicNewsRecipe(Recipe):
         :param parser: Command line option parser. Used to intelligently merge options.
         :param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
         '''
-        self.log = log
+        self.log = ThreadSafeWrapper(log)
         if not isinstance(self.title, unicode):
             self.title = unicode(self.title, 'utf-8', 'replace')

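A note on the change above: a news download fans article fetches out to several worker threads that all write to the recipe's log (that the downloads are threaded is calibre behavior not visible in this diff), so wrapping the incoming log in ThreadSafeWrapper keeps messages from concurrent fetches from interleaving.
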
src/calibre/web/fetch/simple.py

@@ -156,7 +156,8 @@ class RecursiveFetcher(object):
     def get_soup(self, src, url=None):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(self.preprocess_regexps)
-        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')]
+        # Some websites have buggy doctype declarations that mess up beautifulsoup
+        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')]
         # Remove comments as they can leave detritus when extracting tags leaves
         # multiple nested comments
         nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
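
A quick standalone check of the regex change in the hunk above, using plain re and independent of calibre or BeautifulSoup: adding re.IGNORECASE lets the massage rule strip lowercase declarations such as HTML5's <!doctype html>, which the DOTALL-only pattern left in place:

import re

old = re.compile(r'<!DOCTYPE .+?>', re.DOTALL)
new = re.compile(r'<!DOCTYPE .+?>', re.DOTALL | re.IGNORECASE)

src = '<!doctype html><html><body>x</body></html>'
print(old.sub('', src))  # doctype survives: <!doctype html><html>...
print(new.sub('', src))  # doctype stripped: <html><body>x</body></html>
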
@@ -207,6 +208,7 @@ class RecursiveFetcher(object):
     def fetch_url(self, url):
         data = None
         self.log.debug('Fetching', url)
+        st = time.time()
 
         # Check for a URL pointing to the local filesystem and special case it
         # for efficiency and robustness. Bypasses delay checking as it does not
@@ -225,6 +227,7 @@ class RecursiveFetcher(object):
                 data = response(f.read())
                 data.newurl = 'file:'+url # This is what mechanize does for
                                           # local URLs
+            self.log.debug('Fetched %s in %f seconds' % (url, time.time() - st))
             return data
 
         delta = time.time() - self.last_fetch_at
@@ -260,11 +263,11 @@ class RecursiveFetcher(object):
                 raise err
         finally:
             self.last_fetch_at = time.time()
+        self.log.debug('Fetched %s in %f seconds' % (url, time.time() - st))
         return data
 
     def start_fetch(self, url):
         soup = BeautifulSoup(u'<a href="'+url+'" />')
-        self.log.debug('Downloading')
         res = self.process_links(soup, url, 0, into_dir='')
         self.log.debug(url, 'saved to', res)
         return res
@@ -526,7 +529,9 @@ class RecursiveFetcher(object):
             else:
                 dsrc = xml_to_unicode(dsrc, self.verbose)[0]
 
+            st = time.time()
             soup = self.get_soup(dsrc, url=iurl)
+            self.log.debug('Parsed %s in %f seconds' % (iurl, time.time() - st))
 
             base = soup.find('base', href=True)
             if base is not None:
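
All of the timing added in this commit follows one inline pattern: record st = time.time() before the work, then log time.time() - st once it finishes, formatted as 'Fetched %s in %f seconds' or 'Parsed %s in %f seconds'. A hedged sketch of that pattern factored into a reusable context manager; this helper is an illustration, not part of the commit, and it logs from a finally block so the elapsed time is reported even if the timed block raises:

import time
from contextlib import contextmanager

@contextmanager
def timed(debug, fmt, *args):
    # Same bookkeeping the diff inlines: capture a start timestamp, run
    # the block, then log the elapsed wall-clock time.
    st = time.time()
    try:
        yield
    finally:
        debug(fmt % (args + (time.time() - st,)))

with timed(print, 'Fetched %s in %f seconds', 'http://example.com'):
    time.sleep(0.05)  # stand-in for the actual network fetch or parse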