From 1d84c0d6ac90f04bcadbea0dffec75f1d38677db Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 3 Apr 2011 13:52:59 -0600 Subject: [PATCH 01/50] developpez.com by louhike --- recipes/developpez.recipe | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 recipes/developpez.recipe diff --git a/recipes/developpez.recipe b/recipes/developpez.recipe new file mode 100644 index 0000000000..707e702c0a --- /dev/null +++ b/recipes/developpez.recipe @@ -0,0 +1,21 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1301849956(BasicNewsRecipe): + title = u'Developpez.com' + description = u'Toutes les news du site Developpez.com' + publisher = u'Developpez.com' + timefmt = ' [%a, %d %b, %Y]' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + encoding = 'ISO-8859-1' + language = 'fr' + __author__ = 'louhike' + remove_javascript = True + keep_only_tags = [dict(name='div', attrs={'class':'content'})] + + feeds = [(u'Tous les articles', u'http://www.developpez.com/index/rss')] + + def get_cover_url(self): + return 'http://javascript.developpez.com/template/images/logo.gif' + From 492d16e5c996418ec311cfc5a1a2462b1889eaea Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 3 Apr 2011 14:31:14 -0600 Subject: [PATCH 02/50] ODT Input: Fix handling of the element. Fixes #749655 (Private bug) --- src/odf/odf2xhtml.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/odf/odf2xhtml.py b/src/odf/odf2xhtml.py index 390d407d16..26da9d9905 100644 --- a/src/odf/odf2xhtml.py +++ b/src/odf/odf2xhtml.py @@ -1386,12 +1386,19 @@ ol, ul { padding-left: 2em; } self.purgedata() def s_text_s(self, tag, attrs): - """ Generate a number of spaces. ODF has an element; HTML uses   - We use   so we can send the output through an XML parser if we desire to + # Changed by Kovid to fix non breaking spaces being prepended to + # element instead of being part of the text flow. + # We don't use an entity for the nbsp as the contents of self.data will + # be escaped on writeout. + """ Generate a number of spaces. We use the non breaking space for + the text:s ODF element. """ - c = attrs.get( (TEXTNS,'c'),"1") - for x in xrange(int(c)): - self.writeout(' ') + try: + c = int(attrs.get((TEXTNS, 'c'), 1)) + except: + c = 0 + if c > 0: + self.data.append(u'\u00a0'*c) def s_text_span(self, tag, attrs): """ The element matches the element in HTML. 
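# A standalone restatement of the s_text_s change above: turn the
# element's optional text:c count into non-breaking spaces appended to
# the text flow, instead of writing raw spaces out of band. TEXTNS is
# the standard ODF text namespace; expand_text_s is a made-up helper
# name for illustration, not part of odf2xhtml.
TEXTNS = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0'

def expand_text_s(attrs):
    # text:s means a single space when the text:c count is absent
    try:
        c = int(attrs.get((TEXTNS, 'c'), 1))
    except (TypeError, ValueError):
        c = 0
    # U+00A0 survives HTML whitespace collapsing and is escaped
    # correctly when the accumulated text is written out
    return u'\u00a0' * c if c > 0 else u''

# expand_text_s({(TEXTNS, 'c'): '3'}) -> u'\xa0\xa0\xa0'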
It is From 6be7471d2e7d93793de6e25e7e9222cb82b49cc4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 4 Apr 2011 08:02:28 -0600 Subject: [PATCH 03/50] F-Secure by louhike and more work on the new metadata download system --- recipes/f_secure.recipe | 22 +++ src/calibre/ebooks/metadata/sources/amazon.py | 1 + src/calibre/ebooks/metadata/sources/base.py | 6 + .../ebooks/metadata/sources/identify.py | 172 ++++++++++++++++-- 4 files changed, 186 insertions(+), 15 deletions(-) create mode 100644 recipes/f_secure.recipe diff --git a/recipes/f_secure.recipe b/recipes/f_secure.recipe new file mode 100644 index 0000000000..f276a4961a --- /dev/null +++ b/recipes/f_secure.recipe @@ -0,0 +1,22 @@ +from calibre.web.feeds.news import BasicNewsRecipe + +class AdvancedUserRecipe1301860159(BasicNewsRecipe): + title = u'F-Secure Weblog' + language = 'en' + __author__ = 'louhike' + description = u'All the news from the weblog of F-Secure' + publisher = u'F-Secure' + timefmt = ' [%a, %d %b, %Y]' + encoding = 'ISO-8859-1' + oldest_article = 7 + max_articles_per_feed = 100 + no_stylesheets = True + use_embedded_content = False + language = 'en_EN' + remove_javascript = True + keep_only_tags = [dict(name='div', attrs={'class':'modSectionTd2'})] + remove_tags = [dict(name='a'),dict(name='hr')] + + feeds = [(u'Weblog', u'http://www.f-secure.com/weblog/weblog.rss')] + def get_cover_url(self): + return 'http://www.f-secure.com/weblog/archives/images/company_logo.png' diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index cfa2b09ea8..9334d818ec 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -282,6 +282,7 @@ class Amazon(Source): capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset(['title', 'authors', 'identifier:amazon', 'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate']) + has_html_comments = True AMAZON_DOMAINS = { 'com': _('US'), diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index 7cc4ed3518..08012c3ee8 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -18,6 +18,9 @@ from calibre.utils.titlecase import titlecase from calibre.ebooks.metadata import check_isbn msprefs = JSONConfig('metadata_sources.json') +msprefs.defaults['txt_comments'] = False +msprefs.defaults['ignore_fields'] = [] +msprefs.defaults['max_tags'] = 10 def create_log(ostream=None): log = ThreadSafeLog(level=ThreadSafeLog.DEBUG) @@ -104,6 +107,9 @@ class Source(Plugin): #: during the identify phase touched_fields = frozenset() + #: Set this to True if your plugin return HTML formatted comments + has_html_comments = False + def __init__(self, *args, **kwargs): Plugin.__init__(self, *args, **kwargs) self._isbn_to_identifier_cache = {} diff --git a/src/calibre/ebooks/metadata/sources/identify.py b/src/calibre/ebooks/metadata/sources/identify.py index 1d4d8840e8..ab86e8ffa2 100644 --- a/src/calibre/ebooks/metadata/sources/identify.py +++ b/src/calibre/ebooks/metadata/sources/identify.py @@ -8,13 +8,18 @@ __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' import time +from datetime import datetime from Queue import Queue, Empty from threading import Thread from io import BytesIO +from operator import attrgetter from calibre.customize.ui import metadata_plugins -from calibre.ebooks.metadata.sources.base import create_log +from 
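# The msprefs defaults added above live in a calibre JSONConfig: a
# dict persisted to a JSON file whose lookups fall back to a .defaults
# mapping, so new settings get sane values without migrating existing
# user config files. A rough in-memory stand-in for that behaviour
# (not the real JSONConfig implementation):
class PrefsSketch(dict):

    def __init__(self):
        dict.__init__(self)
        self.defaults = {}

    def __getitem__(self, key):
        try:
            return dict.__getitem__(self, key)
        except KeyError:
            return self.defaults[key]

msprefs = PrefsSketch()
msprefs.defaults['txt_comments'] = False  # convert HTML comments to text
msprefs.defaults['ignore_fields'] = []    # fields to blank after download
msprefs.defaults['max_tags'] = 10         # cap on merged tags per book
print msprefs['max_tags']                 # -> 10, served from defaults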
calibre.ebooks.metadata.sources.base import create_log, msprefs from calibre.ebooks.metadata.xisbn import xisbn +from calibre.ebooks.metadata.book.base import Metadata +from calibre.utils.date import utc_tz +from calibre.utils.html2text import html2text # How long to wait for more results after first result is found WAIT_AFTER_FIRST_RESULT = 30 # seconds @@ -117,14 +122,30 @@ def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30): log('Merging results from different sources and finding earliest', 'publication dates') start_time = time.time() - merged_results = merge_identify_results(results, log) + results = merge_identify_results(results, log) log('We have %d merged results, merging took: %.2f seconds' % - (len(merged_results), time.time() - start_time)) + (len(results), time.time() - start_time)) + + if msprefs['txt_comments']: + for r in results: + if r.plugin.has_html_comments and r.comments: + r.comments = html2text(r.comments) + + dummy = Metadata(_('Unknown')) + max_tags = msprefs['max_tags'] + for f in msprefs['ignore_fields']: + for r in results: + setattr(r, f, getattr(dummy, f)) + r.tags = r.tags[:max_tags] + + return results + class ISBNMerge(object): def __init__(self): self.pools = {} + self.isbnless_results = [] def isbn_in_pool(self, isbn): if isbn: @@ -140,22 +161,143 @@ class ISBNMerge(object): return True return False - def add_result(self, result, isbn): - pool = self.isbn_in_pool(isbn) - if pool is None: - isbns, min_year = xisbn.get_isbn_pool(isbn) - if not isbns: - isbns = frozenset([isbn]) - self.pool[isbns] = pool = (min_year, []) + def add_result(self, result): + isbn = result.isbn + if isbn: + pool = self.isbn_in_pool(isbn) + if pool is None: + isbns, min_year = xisbn.get_isbn_pool(isbn) + if not isbns: + isbns = frozenset([isbn]) + self.pool[isbns] = pool = (min_year, []) + + if not self.pool_has_result_from_same_source(pool, result): + pool[1].append(result) + else: + self.isbnless_results.append(result) + + def finalize(self): + has_isbn_result = False + for results in self.pools.itervalues(): + if results: + has_isbn_result = True + break + self.has_isbn_result = has_isbn_result + + if has_isbn_result: + self.merge_isbn_results() + else: + self.results = sorted(self.isbnless_results, + key=attrgetter('relevance_in_source')) + + return self.results + + def merge_isbn_results(self): + self.results = [] + for min_year, results in self.pool.itervalues(): + if results: + self.results.append(self.merge(results, min_year)) + + self.results.sort(key=attrgetter('average_source_relevance')) + + def length_merge(self, attr, results, null_value=None, shortest=True): + values = [getattr(x, attr) for x in results if not x.is_null(attr)] + values = [x for x in values if len(x) > 0] + if not values: + return null_value + values.sort(key=len, reverse=not shortest) + return values[0] + + def random_merge(self, attr, results, null_value=None): + values = [getattr(x, attr) for x in results if not x.is_null(attr)] + return values[0] if values else null_value + + def merge(self, results, min_year): + ans = Metadata(_('Unknown')) + + # We assume the shortest title has the least cruft in it + ans.title = self.length_merge('title', results, null_value=ans.title) + + # No harm in having extra authors, maybe something useful like an + # editor or translator + ans.authors = self.length_merge('authors', results, + null_value=ans.authors, shortest=False) + + # We assume the shortest publisher has the least cruft in it + ans.publisher = 
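# The run of length_merge calls in this merge() encodes one heuristic
# per field: shortest title/publisher/tags (least cruft), longest
# authors/series/comments (most information). Stripped of the Metadata
# plumbing it is just a length sort; the sample data below is made up:
def pick_by_length(values, shortest=True):
    values = [v for v in values if v]  # drop null and empty candidates
    if not values:
        return None
    return min(values, key=len) if shortest else max(values, key=len)

titles = ['The Hobbit', 'The Hobbit: 75th Anniversary Edition']
print pick_by_length(titles)                   # -> 'The Hobbit'
authors = [['J. R. R. Tolkien'],
           ['J. R. R. Tolkien', 'Douglas A. Anderson']]
print pick_by_length(authors, shortest=False)  # keeps editor/annotator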
self.length_merge('publisher', results, + null_value=ans.publisher) + + # We assume the smallest set of tags has the least cruft in it + ans.tags = self.length_merge('tags', results, + null_value=ans.tags) + + # We assume the longest series has the most info in it + ans.series = self.length_merge('series', results, + null_value=ans.series, shortest=False) + for r in results: + if r.series and r.series == ans.series: + ans.series_index = r.series_index + break + + # Average the rating over all sources + ratings = [] + for r in results: + rating = r.rating + if rating and rating > 0 and rating <= 5: + ratings.append(rating) + if ratings: + ans.rating = sum(ratings)/len(ratings) + + # Smallest language is likely to be valid + ans.language = self.length_merge('language', results, + null_value=ans.language) + + # Choose longest comments + ans.comments = self.length_merge('comments', results, + null_value=ans.comments, shortest=False) + + # Published date + if min_year: + min_date = datetime(min_year, 1, 2, tzinfo=utc_tz) + ans.pubdate = min_date + else: + min_date = datetime(10000, 1, 1, tzinfo=utc_tz) + for r in results: + if r.pubdate is not None and r.pubdate < min_date: + min_date = r.pubdate + if min_date.year < 10000: + ans.pubdate = min_date + + # Identifiers + for r in results: + ans.identifiers.update(r.identifiers) + + # Merge any other fields with no special handling (random merge) + touched_fields = set() + for r in results: + touched_fields |= r.plugin.touched_fields + + for f in touched_fields: + if f.startswith('identifier:') or not ans.is_null(f): + continue + setattr(ans, f, self.random_merge(f, results, + null_value=getattr(ans, f))) + + avg = [x.relevance_in_source for x in results] + avg = sum(avg)/len(avg) + ans.average_source_relevance = avg + + return ans - if not self.pool_has_result_from_same_source(pool, result): - pool[1].append(result) def merge_identify_results(result_map, log): + isbn_merge = ISBNMerge() for plugin, results in result_map.iteritems(): for result in results: - isbn = result.isbn - if isbn: - isbns, min_year = xisbn.get_isbn_pool(isbn) + isbn_merge.add_result(result) + + return isbn_merge.finalize() + + From 7599a89c472d92cd29afdbf33f7c6faa7526211c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 4 Apr 2011 09:32:32 -0600 Subject: [PATCH 04/50] Fix #750336 (Pocketbook 602/902 2.0.6 FW won't connect) --- src/calibre/devices/eb600/driver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/calibre/devices/eb600/driver.py b/src/calibre/devices/eb600/driver.py index 5374c6c4e2..01277980db 100644 --- a/src/calibre/devices/eb600/driver.py +++ b/src/calibre/devices/eb600/driver.py @@ -244,7 +244,8 @@ class POCKETBOOK602(USBMS): BCD = [0x0324] VENDOR_NAME = '' - WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['PB602', 'PB603', 'PB902', 'PB903'] + WINDOWS_MAIN_MEM = WINDOWS_CARD_A_MEM = ['PB602', 'PB603', 'PB902', + 'PB903', 'PB'] class POCKETBOOK701(USBMS): From 4b7bc8ce365d99a87ce03cd614b3e8e3f5fceb62 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 4 Apr 2011 10:04:51 -0600 Subject: [PATCH 05/50] Fix #750288 (TimesofIndia news fetch not working) --- recipes/toi.recipe | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/recipes/toi.recipe b/recipes/toi.recipe index 643d120a36..8a772b6f9d 100644 --- a/recipes/toi.recipe +++ b/recipes/toi.recipe @@ -1,3 +1,4 @@ +import re from calibre.web.feeds.news import BasicNewsRecipe class TimesOfIndia(BasicNewsRecipe): @@ -8,10 +9,10 @@ class 
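# The published-date merge() added in PATCH 03 above keeps the ISBN
# pool's min_year when available and otherwise scans for the earliest
# date any source reported, behind a far-future sentinel. One caveat:
# the stdlib datetime is capped at year 9999 (datetime.MAXYEAR), so
# constructing a year-10000 sentinel raises ValueError; datetime.max
# is safer. A sketch of the same logic, tzinfo dropped for brevity:
from datetime import datetime

def earliest_pubdate(dates, min_year=None):
    if min_year:
        # the ISBN pool supplied an earliest known publication year
        return datetime(min_year, 1, 2)
    earliest = datetime.max
    for d in dates:
        if d is not None and d < earliest:
            earliest = d
    return None if earliest == datetime.max else earliest

# earliest_pubdate([datetime(2007, 3, 1), None, datetime(2005, 6, 9)])
# -> datetime(2005, 6, 9)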
TimesOfIndia(BasicNewsRecipe): max_articles_per_feed = 25 no_stylesheets = True - keep_only_tags = [dict(attrs={'class':'maintable12'})] + keep_only_tags = [{'class':['maintable12', 'prttabl']}] remove_tags = [ dict(style=lambda x: x and 'float' in x), - dict(attrs={'class':'prvnxtbg'}), + {'class':['prvnxtbg', 'footbdrin', 'bcclftr']}, ] feeds = [ @@ -38,8 +39,28 @@ class TimesOfIndia(BasicNewsRecipe): ('Most Read', 'http://timesofindia.indiatimes.com/rssfeedmostread.cms') ] - def print_version(self, url): - return url + '?prtpage=1' + + def get_article_url(self, article): + url = BasicNewsRecipe.get_article_url(self, article) + if '/0Ltimesofindia' in url: + url = url.partition('/0L')[-1] + url = url.replace('0B', '.').replace('0N', '.com').replace('0C', + '/').replace('0E', '-') + url = 'http://' + url.rpartition('/')[0] + match = re.search(r'/([0-9a-zA-Z]+?)\.cms', url) + if match is not None: + num = match.group(1) + num = re.sub(r'[^0-9]', '', num) + return ('http://timesofindia.indiatimes.com/articleshow/%s.cms?prtpage=1' % + num) + else: + cms = re.search(r'/(\d+)\.cms', url) + if cms is not None: + return ('http://timesofindia.indiatimes.com/articleshow/%s.cms?prtpage=1' % + cms.group(1)) + + return url + def preprocess_html(self, soup): return soup From 7d1c706835bbc17990596804e232233272fc5796 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 4 Apr 2011 10:41:43 -0600 Subject: [PATCH 06/50] Fix #750101 (Private bug) --- src/calibre/ebooks/pdf/fonts.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/pdf/fonts.cpp b/src/calibre/ebooks/pdf/fonts.cpp index 99ab7517c1..9b9e7708a3 100644 --- a/src/calibre/ebooks/pdf/fonts.cpp +++ b/src/calibre/ebooks/pdf/fonts.cpp @@ -72,6 +72,7 @@ XMLFont::XMLFont(string* font_name, double size, GfxRGB rgb) : size(size-1), line_size(-1.0), italic(false), bold(false), font_name(font_name), font_family(NULL), color(rgb) { + if (!this->font_name) this->font_name = new string(DEFAULT_FONT_FAMILY); this->font_family = family_name(this->font_name); if (strcasestr(font_name->c_str(), "bold")) this->bold = true; @@ -134,7 +135,15 @@ Fonts::size_type Fonts::add_font(XMLFont *f) { } Fonts::size_type Fonts::add_font(string* font_name, double size, GfxRGB rgb) { - XMLFont *f = new XMLFont(font_name, size, rgb); + XMLFont *f = NULL; + if (font_name == NULL) { + string *fn = new string("Unknown"); + f = new XMLFont(fn, size, rgb); + // fn must not be deleted + } else { + f = new XMLFont(font_name, size, rgb); + } + return this->add_font(f); } From 83175da4b297af6c46954ded3b4cd4f476302104 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 4 Apr 2011 10:59:57 -0600 Subject: [PATCH 07/50] ... 
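# The get_article_url rewrite in PATCH 05 above reverses the feed
# redirector's character escaping (0B -> '.', 0N -> '.com', 0C -> '/',
# 0E -> '-') and then rebuilds a direct print URL from the numeric
# articleshow id. The same steps standalone; the sample link is a
# hypothetical one of the shape the recipe expects:
import re

def toi_print_url(url):
    if '/0L' in url:
        url = url.partition('/0L')[-1]
        url = (url.replace('0B', '.').replace('0N', '.com')
                  .replace('0C', '/').replace('0E', '-'))
        url = 'http://' + url.rpartition('/')[0]
    m = re.search(r'/(\d+)\.cms', url)
    if m is not None:
        return ('http://timesofindia.indiatimes.com/articleshow/'
                '%s.cms?prtpage=1' % m.group(1))
    return url

# toi_print_url('http://r.example/0Ltimesofindia0Bindiatimes0N0C'
#               'articleshow0C12345670Bcms/story01.htm')
# -> 'http://timesofindia.indiatimes.com/articleshow/1234567.cms?prtpage=1'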
--- src/calibre/ebooks/pdf/fonts.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/pdf/fonts.cpp b/src/calibre/ebooks/pdf/fonts.cpp index 9b9e7708a3..c5261298ff 100644 --- a/src/calibre/ebooks/pdf/fonts.cpp +++ b/src/calibre/ebooks/pdf/fonts.cpp @@ -136,13 +136,9 @@ Fonts::size_type Fonts::add_font(XMLFont *f) { Fonts::size_type Fonts::add_font(string* font_name, double size, GfxRGB rgb) { XMLFont *f = NULL; - if (font_name == NULL) { - string *fn = new string("Unknown"); - f = new XMLFont(fn, size, rgb); - // fn must not be deleted - } else { - f = new XMLFont(font_name, size, rgb); - } + if (font_name == NULL) + font_name = new string("Unknown"); + f = new XMLFont(font_name, size, rgb); return this->add_font(f); } From 3e1a43e86a50f06d7f71291825b3475db0d73de8 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 4 Apr 2011 11:00:20 -0600 Subject: [PATCH 08/50] ... --- src/calibre/ebooks/pdf/fonts.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/ebooks/pdf/fonts.cpp b/src/calibre/ebooks/pdf/fonts.cpp index c5261298ff..c3a709869e 100644 --- a/src/calibre/ebooks/pdf/fonts.cpp +++ b/src/calibre/ebooks/pdf/fonts.cpp @@ -138,6 +138,7 @@ Fonts::size_type Fonts::add_font(string* font_name, double size, GfxRGB rgb) { XMLFont *f = NULL; if (font_name == NULL) font_name = new string("Unknown"); + // font_name must not be deleted f = new XMLFont(font_name, size, rgb); return this->add_font(f); From d1859b0f784e972e0ff8af16e7b1afbb9f455c4d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 4 Apr 2011 11:14:12 -0600 Subject: [PATCH 09/50] ... --- src/calibre/ebooks/metadata/sources/base.py | 1 + .../ebooks/metadata/sources/identify.py | 194 +++++++++--------- 2 files changed, 100 insertions(+), 95 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index 08012c3ee8..d306a02bcb 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -21,6 +21,7 @@ msprefs = JSONConfig('metadata_sources.json') msprefs.defaults['txt_comments'] = False msprefs.defaults['ignore_fields'] = [] msprefs.defaults['max_tags'] = 10 +msprefs.defaults['wait_after_first_identify_result'] = 30 # seconds def create_log(ostream=None): log = ThreadSafeLog(level=ThreadSafeLog.DEBUG) diff --git a/src/calibre/ebooks/metadata/sources/identify.py b/src/calibre/ebooks/metadata/sources/identify.py index ab86e8ffa2..87d34c0bff 100644 --- a/src/calibre/ebooks/metadata/sources/identify.py +++ b/src/calibre/ebooks/metadata/sources/identify.py @@ -21,9 +21,7 @@ from calibre.ebooks.metadata.book.base import Metadata from calibre.utils.date import utc_tz from calibre.utils.html2text import html2text -# How long to wait for more results after first result is found -WAIT_AFTER_FIRST_RESULT = 30 # seconds - +# Download worker {{{ class Worker(Thread): def __init__(self, plugin, kwargs, abort): @@ -47,99 +45,9 @@ def is_worker_alive(workers): return True return False -def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30): - start_time = time.time() - plugins = list(metadata_plugins['identify']) - - kwargs = { - 'title': title, - 'authors': authors, - 'identifiers': identifiers, - 'timeout': timeout, - } - - log('Running identify query with parameters:') - log(kwargs) - log('Using plugins:', ', '.join([p.name for p in plugins])) - log('The log (if any) from individual plugins is below') - - workers = [Worker(p, kwargs, abort) for p in plugins] - 
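# PATCH 09 reshuffles the download machinery around the Worker threads
# introduced in PATCH 03: each metadata source runs in its own thread
# and pushes results onto a per-worker Queue that identify() drains.
# A trimmed-down version of that pattern; the plugin identify()
# signature is simplified here relative to the real plugin API:
from Queue import Queue
from threading import Thread

class WorkerSketch(Thread):

    def __init__(self, plugin, kwargs, abort):
        Thread.__init__(self)
        self.daemon = True
        self.plugin, self.kwargs, self.abort = plugin, kwargs, abort
        self.rq = Queue()  # results land here, one Metadata per item

    def run(self):
        try:
            self.plugin.identify(self.rq, self.abort, **self.kwargs)
        except:
            pass  # the real Worker logs the traceback to the plugin buffer

def is_worker_alive(workers):
    return any(w.is_alive() for w in workers)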
for w in workers: - w.start() - - first_result_at = None - results = dict.fromkeys(plugins, []) - - def get_results(): - found = False - for w in workers: - try: - result = w.rq.get_nowait() - except Empty: - pass - else: - results[w.plugin].append(result) - found = True - return found - - while True: - time.sleep(0.2) - - if get_results() and first_result_at is None: - first_result_at = time.time() - - if not is_worker_alive(workers): - break - - if (first_result_at is not None and time.time() - first_result_at < - WAIT_AFTER_FIRST_RESULT): - log('Not waiting any longer for more results') - abort.set() - break - - get_results() - sort_kwargs = dict(kwargs) - for k in list(sort_kwargs.iterkeys()): - if k not in ('title', 'authors', 'identifiers'): - sort_kwargs.pop(k) - - for plugin, results in results.iteritems(): - results.sort(key=plugin.identify_results_keygen(**sort_kwargs)) - plog = plugin.buf.getvalue().strip() - if plog: - log('\n'+'*'*35, plugin.name, '*'*35) - log('Found %d results'%len(results)) - log(plog) - log('\n'+'*'*80) - - for i, result in enumerate(results): - result.relevance_in_source = i - result.has_cached_cover_url = \ - plugin.get_cached_cover_url(result.identifiers) is not None - result.identify_plugin = plugin - - log('The identify phase took %.2f seconds'%(time.time() - start_time)) - log('Merging results from different sources and finding earliest', - 'publication dates') - start_time = time.time() - results = merge_identify_results(results, log) - log('We have %d merged results, merging took: %.2f seconds' % - (len(results), time.time() - start_time)) - - if msprefs['txt_comments']: - for r in results: - if r.plugin.has_html_comments and r.comments: - r.comments = html2text(r.comments) - - dummy = Metadata(_('Unknown')) - max_tags = msprefs['max_tags'] - for f in msprefs['ignore_fields']: - for r in results: - setattr(r, f, getattr(dummy, f)) - r.tags = r.tags[:max_tags] - - return results +# }}} +# Merge results from different sources {{{ class ISBNMerge(object): @@ -298,6 +206,102 @@ def merge_identify_results(result_map, log): return isbn_merge.finalize() +# }}} + +def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30): + start_time = time.time() + plugins = list(metadata_plugins['identify']) + + kwargs = { + 'title': title, + 'authors': authors, + 'identifiers': identifiers, + 'timeout': timeout, + } + + log('Running identify query with parameters:') + log(kwargs) + log('Using plugins:', ', '.join([p.name for p in plugins])) + log('The log (if any) from individual plugins is below') + + workers = [Worker(p, kwargs, abort) for p in plugins] + for w in workers: + w.start() + + first_result_at = None + results = dict.fromkeys(plugins, []) + + def get_results(): + found = False + for w in workers: + try: + result = w.rq.get_nowait() + except Empty: + pass + else: + results[w.plugin].append(result) + found = True + return found + + wait_time = msprefs['wait_after_first_identify_result'] + while True: + time.sleep(0.2) + + if get_results() and first_result_at is None: + first_result_at = time.time() + + if not is_worker_alive(workers): + break + + if (first_result_at is not None and time.time() - first_result_at < + wait_time): + log('Not waiting any longer for more results') + abort.set() + break + + get_results() + sort_kwargs = dict(kwargs) + for k in list(sort_kwargs.iterkeys()): + if k not in ('title', 'authors', 'identifiers'): + sort_kwargs.pop(k) + + for plugin, results in results.iteritems(): + 
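# Condensed form of the result-gathering loop in this hunk: wake every
# 200ms, drain the worker queues, and once the first result arrives
# give the remaining sources only a bounded grace period. One caveat:
# the cutoff should fire when the elapsed time since the first result
# EXCEEDS the wait, so the '<' comparison shown in the hunks above and
# below appears inverted and would abort almost immediately; the
# sketch uses '>' to match the stated intent of the pref:
import time

def gather_results(workers, get_results, abort, log, wait_time):
    first_result_at = None
    while True:
        time.sleep(0.2)
        if get_results() and first_result_at is None:
            first_result_at = time.time()
        if not any(w.is_alive() for w in workers):
            break
        if (first_result_at is not None and
                time.time() - first_result_at > wait_time):
            log('Not waiting any longer for more results')
            abort.set()
            break
    get_results()  # final drain after the workers stop or abort fires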
results.sort(key=plugin.identify_results_keygen(**sort_kwargs)) + plog = plugin.buf.getvalue().strip() + if plog: + log('\n'+'*'*35, plugin.name, '*'*35) + log('Found %d results'%len(results)) + log(plog) + log('\n'+'*'*80) + + for i, result in enumerate(results): + result.relevance_in_source = i + result.has_cached_cover_url = \ + plugin.get_cached_cover_url(result.identifiers) is not None + result.identify_plugin = plugin + + log('The identify phase took %.2f seconds'%(time.time() - start_time)) + log('Merging results from different sources and finding earliest', + 'publication dates') + start_time = time.time() + results = merge_identify_results(results, log) + log('We have %d merged results, merging took: %.2f seconds' % + (len(results), time.time() - start_time)) + + if msprefs['txt_comments']: + for r in results: + if r.plugin.has_html_comments and r.comments: + r.comments = html2text(r.comments) + + dummy = Metadata(_('Unknown')) + max_tags = msprefs['max_tags'] + for f in msprefs['ignore_fields']: + for r in results: + setattr(r, f, getattr(dummy, f)) + r.tags = r.tags[:max_tags] + + return results + From ac3693cfdc586b6c3f89bb5841d6fc881d3c6b7c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 4 Apr 2011 11:36:26 -0600 Subject: [PATCH 10/50] Conversion pipeline: Handle inline