Improved logging for news downloads, log performance numbers

Kovid Goyal 2014-08-17 09:43:39 +05:30
parent 1027cf8a1e
commit cf73771c87
3 changed files with 20 additions and 3 deletions

src/calibre/utils/logging.py

@@ -177,6 +177,17 @@ class ThreadSafeLog(Log):
         with self._lock:
             Log.prints(self, *args, **kwargs)
 
+class ThreadSafeWrapper(Log):
+
+    def __init__(self, other_log):
+        Log.__init__(self, level=other_log.filter_level)
+        self.outputs = list(other_log.outputs)
+        self._lock = RLock()
+
+    def prints(self, *args, **kwargs):
+        with self._lock:
+            Log.prints(self, *args, **kwargs)
+
 class GUILog(ThreadSafeLog):
 
     '''

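The new ThreadSafeWrapper adopts the wrapped log's filter level, shares its output destinations, and serializes every write through a reentrant lock. A minimal standalone sketch of the same pattern; the bare-bones Log below is a stand-in for illustration, not calibre's actual Log class:

from threading import RLock, Thread

class Log(object):
    # Stand-in with just enough surface (filter_level, outputs, prints)
    # to demonstrate the wrapper pattern.
    DEBUG = 1

    def __init__(self, level=DEBUG):
        self.filter_level = level
        self.outputs = []

    def prints(self, *args, **kwargs):
        print(' '.join(str(a) for a in args))

class ThreadSafeWrapper(Log):

    def __init__(self, other_log):
        Log.__init__(self, level=other_log.filter_level)
        # Share the wrapped log's output destinations rather than copying them
        self.outputs = list(other_log.outputs)
        self._lock = RLock()

    def prints(self, *args, **kwargs):
        # The lock serializes concurrent writers so lines never interleave
        with self._lock:
            Log.prints(self, *args, **kwargs)

log = ThreadSafeWrapper(Log())
workers = [Thread(target=log.prints, args=('worker', i)) for i in range(4)]
for t in workers:
    t.start()
for t in workers:
    t.join()
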
src/calibre/web/feeds/news.py

@@ -29,6 +29,7 @@ from calibre.ptempfile import PersistentTemporaryFile
 from calibre.utils.date import now as nowf
 from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image
 from calibre.utils.localization import canonicalize_lang
+from calibre.utils.logging import ThreadSafeWrapper
 
 class LoginFailed(ValueError):
     pass
@@ -841,7 +842,7 @@ class BasicNewsRecipe(Recipe):
         :param parser: Command line option parser. Used to intelligently merge options.
         :param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
         '''
-        self.log = log
+        self.log = ThreadSafeWrapper(log)
         if not isinstance(self.title, unicode):
             self.title = unicode(self.title, 'utf-8', 'replace')

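A note on the change above: a news download fans article fetches out to several worker threads that all write to the recipe's log (that the downloads are threaded is calibre behavior not visible in this diff), so wrapping the incoming log in ThreadSafeWrapper keeps messages from concurrent fetches from interleaving.
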
src/calibre/web/fetch/simple.py

@@ -156,7 +156,8 @@ class RecursiveFetcher(object):
     def get_soup(self, src, url=None):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(self.preprocess_regexps)
-        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')]
+        # Some websites have buggy doctype declarations that mess up beautifulsoup
+        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')]
         # Remove comments as they can leave detritus when extracting tags leaves
         # multiple nested comments
         nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
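
A quick standalone check of the regex change in the hunk above, using plain re and independent of calibre or BeautifulSoup: adding re.IGNORECASE lets the massage rule strip lowercase declarations such as HTML5's <!doctype html>, which the DOTALL-only pattern left in place:

import re

old = re.compile(r'<!DOCTYPE .+?>', re.DOTALL)
new = re.compile(r'<!DOCTYPE .+?>', re.DOTALL | re.IGNORECASE)

src = '<!doctype html><html><body>x</body></html>'
print(old.sub('', src))  # doctype survives: <!doctype html><html>...
print(new.sub('', src))  # doctype stripped: <html><body>x</body></html>
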
@@ -207,6 +208,7 @@ class RecursiveFetcher(object):
     def fetch_url(self, url):
         data = None
         self.log.debug('Fetching', url)
+        st = time.time()
 
         # Check for a URL pointing to the local filesystem and special case it
         # for efficiency and robustness. Bypasses delay checking as it does not
@@ -225,6 +227,7 @@ class RecursiveFetcher(object):
                 data = response(f.read())
                 data.newurl = 'file:'+url # This is what mechanize does for
                                           # local URLs
+            self.log.debug('Fetched %s in %f seconds' % (url, time.time() - st))
             return data
 
         delta = time.time() - self.last_fetch_at
@@ -260,11 +263,11 @@ class RecursiveFetcher(object):
                 raise err
         finally:
             self.last_fetch_at = time.time()
+        self.log.debug('Fetched %s in %f seconds' % (url, time.time() - st))
         return data
 
     def start_fetch(self, url):
         soup = BeautifulSoup(u'<a href="'+url+'" />')
-        self.log.debug('Downloading')
         res = self.process_links(soup, url, 0, into_dir='')
         self.log.debug(url, 'saved to', res)
         return res
@@ -526,7 +529,9 @@ class RecursiveFetcher(object):
             else:
                 dsrc = xml_to_unicode(dsrc, self.verbose)[0]
 
+            st = time.time()
             soup = self.get_soup(dsrc, url=iurl)
+            self.log.debug('Parsed %s in %f seconds' % (iurl, time.time() - st))
 
             base = soup.find('base', href=True)
             if base is not None:
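
All of the timing added in this commit follows one inline pattern: record st = time.time() before the work, then log time.time() - st once it finishes, formatted as 'Fetched %s in %f seconds' or 'Parsed %s in %f seconds'. A hedged sketch of that pattern factored into a reusable context manager; this helper is an illustration, not part of the commit, and it logs from a finally block so the elapsed time is reported even if the timed block raises:

import time
from contextlib import contextmanager

@contextmanager
def timed(debug, fmt, *args):
    # Same bookkeeping the diff inlines: capture a start timestamp, run
    # the block, then log the elapsed wall-clock time.
    st = time.time()
    try:
        yield
    finally:
        debug(fmt % (args + (time.time() - st,)))

with timed(print, 'Fetched %s in %f seconds', 'http://example.com'):
    time.sleep(0.05)  # stand-in for the actual network fetch or parse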