From 6be7471d2e7d93793de6e25e7e9222cb82b49cc4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 4 Apr 2011 08:02:28 -0600
Subject: [PATCH] F-Secure by louhike and more work on the new metadata download system

---
 recipes/f_secure.recipe                       |  22 +++
 src/calibre/ebooks/metadata/sources/amazon.py |   1 +
 src/calibre/ebooks/metadata/sources/base.py   |   6 +
 .../ebooks/metadata/sources/identify.py       | 172 ++++++++++++++++--
 4 files changed, 186 insertions(+), 15 deletions(-)
 create mode 100644 recipes/f_secure.recipe

diff --git a/recipes/f_secure.recipe b/recipes/f_secure.recipe
new file mode 100644
index 0000000000..f276a4961a
--- /dev/null
+++ b/recipes/f_secure.recipe
@@ -0,0 +1,22 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1301860159(BasicNewsRecipe):
+    title = u'F-Secure Weblog'
+    language = 'en'
+    __author__ = 'louhike'
+    description = u'All the news from the weblog of F-Secure'
+    publisher = u'F-Secure'
+    timefmt = ' [%a, %d %b, %Y]'
+    encoding = 'ISO-8859-1'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    remove_javascript = True
+    keep_only_tags = [dict(name='div', attrs={'class':'modSectionTd2'})]
+    remove_tags = [dict(name='a'), dict(name='hr')]
+
+    feeds = [(u'Weblog', u'http://www.f-secure.com/weblog/weblog.rss')]
+
+    def get_cover_url(self):
+        return 'http://www.f-secure.com/weblog/archives/images/company_logo.png'
diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index cfa2b09ea8..9334d818ec 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -282,6 +282,7 @@ class Amazon(Source):
     capabilities = frozenset(['identify', 'cover'])
     touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
         'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate'])
+    has_html_comments = True
 
     AMAZON_DOMAINS = {
         'com': _('US'),
diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index 7cc4ed3518..08012c3ee8 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -18,6 +18,9 @@ from calibre.utils.titlecase import titlecase
 from calibre.ebooks.metadata import check_isbn
 
 msprefs = JSONConfig('metadata_sources.json')
+msprefs.defaults['txt_comments'] = False
+msprefs.defaults['ignore_fields'] = []
+msprefs.defaults['max_tags'] = 10
 
 def create_log(ostream=None):
     log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
@@ -104,6 +107,9 @@ class Source(Plugin):
     #: during the identify phase
     touched_fields = frozenset()
 
+    #: Set this to True if your plugin returns HTML formatted comments
+    has_html_comments = False
+
     def __init__(self, *args, **kwargs):
         Plugin.__init__(self, *args, **kwargs)
         self._isbn_to_identifier_cache = {}
diff --git a/src/calibre/ebooks/metadata/sources/identify.py b/src/calibre/ebooks/metadata/sources/identify.py
index 1d4d8840e8..ab86e8ffa2 100644
--- a/src/calibre/ebooks/metadata/sources/identify.py
+++ b/src/calibre/ebooks/metadata/sources/identify.py
@@ -8,13 +8,18 @@ __copyright__ = '2011, Kovid Goyal '
 __docformat__ = 'restructuredtext en'
 
 import time
+from datetime import datetime
 from Queue import Queue, Empty
 from threading import Thread
 from io import BytesIO
+from operator import attrgetter
 
 from calibre.customize.ui import metadata_plugins
-from calibre.ebooks.metadata.sources.base import create_log
+from calibre.ebooks.metadata.sources.base import create_log, msprefs
 from calibre.ebooks.metadata.xisbn import xisbn
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.utils.date import utc_tz
+from calibre.utils.html2text import html2text
 
 # How long to wait for more results after first result is found
 WAIT_AFTER_FIRST_RESULT = 30 # seconds
@@ -117,14 +122,30 @@ def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
     log('Merging results from different sources and finding earliest',
             'publication dates')
     start_time = time.time()
-    merged_results = merge_identify_results(results, log)
+    results = merge_identify_results(results, log)
     log('We have %d merged results, merging took: %.2f seconds' %
-            (len(merged_results), time.time() - start_time))
+            (len(results), time.time() - start_time))
+
+    if msprefs['txt_comments']:
+        for r in results:
+            if r.plugin.has_html_comments and r.comments:
+                r.comments = html2text(r.comments)
+
+    dummy = Metadata(_('Unknown'))
+    for f in msprefs['ignore_fields']:
+        for r in results:
+            setattr(r, f, getattr(dummy, f))
+    for r in results:
+        r.tags = r.tags[:msprefs['max_tags']]
+
+    return results
+
 
 class ISBNMerge(object):
 
     def __init__(self):
         self.pools = {}
+        self.isbnless_results = []
 
     def isbn_in_pool(self, isbn):
         if isbn:
@@ -140,22 +161,143 @@
                 return True
         return False
 
-    def add_result(self, result, isbn):
-        pool = self.isbn_in_pool(isbn)
-        if pool is None:
-            isbns, min_year = xisbn.get_isbn_pool(isbn)
-            if not isbns:
-                isbns = frozenset([isbn])
-            self.pool[isbns] = pool = (min_year, [])
+    def add_result(self, result):
+        isbn = result.isbn
+        if isbn:
+            pool = self.isbn_in_pool(isbn)
+            if pool is None:
+                isbns, min_year = xisbn.get_isbn_pool(isbn)
+                if not isbns:
+                    isbns = frozenset([isbn])
+                self.pools[isbns] = pool = (min_year, [])
+
+            if not self.pool_has_result_from_same_source(pool, result):
+                pool[1].append(result)
+        else:
+            self.isbnless_results.append(result)
+
+    def finalize(self):
+        has_isbn_result = False
+        for min_year, results in self.pools.itervalues():
+            if results:
+                has_isbn_result = True
+                break
+        self.has_isbn_result = has_isbn_result
+
+        if has_isbn_result:
+            self.merge_isbn_results()
+        else:
+            self.results = sorted(self.isbnless_results,
+                key=attrgetter('relevance_in_source'))
+
+        return self.results
+
+    def merge_isbn_results(self):
+        self.results = []
+        for min_year, results in self.pools.itervalues():
+            if results:
+                self.results.append(self.merge(results, min_year))
+
+        self.results.sort(key=attrgetter('average_source_relevance'))
+
+    def length_merge(self, attr, results, null_value=None, shortest=True):
+        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
+        values = [x for x in values if len(x) > 0]
+        if not values:
+            return null_value
+        values.sort(key=len, reverse=not shortest)
+        return values[0]
+
+    def random_merge(self, attr, results, null_value=None):
+        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
+        return values[0] if values else null_value
+
+    def merge(self, results, min_year):
+        ans = Metadata(_('Unknown'))
+
+        # We assume the shortest title has the least cruft in it
+        ans.title = self.length_merge('title', results, null_value=ans.title)
+
+        # No harm in having extra authors, maybe something useful like an
+        # editor or translator
+        ans.authors = self.length_merge('authors', results,
+                null_value=ans.authors, shortest=False)
+
+        # We assume the shortest publisher has the least cruft in it
+        ans.publisher = self.length_merge('publisher', results,
+                null_value=ans.publisher)
+
+        # We assume the smallest set of tags has the least cruft in it
+        ans.tags = self.length_merge('tags', results,
+                null_value=ans.tags)
+
+        # We assume the longest series has the most info in it
+        ans.series = self.length_merge('series', results,
+                null_value=ans.series, shortest=False)
+        for r in results:
+            if r.series and r.series == ans.series:
+                ans.series_index = r.series_index
+                break
+
+        # Average the rating over all sources
+        ratings = []
+        for r in results:
+            rating = r.rating
+            if rating and rating > 0 and rating <= 5:
+                ratings.append(rating)
+        if ratings:
+            ans.rating = sum(ratings)/float(len(ratings))
+
+        # Smallest language is likely to be valid
+        ans.language = self.length_merge('language', results,
+                null_value=ans.language)
+
+        # Choose longest comments
+        ans.comments = self.length_merge('comments', results,
+                null_value=ans.comments, shortest=False)
+
+        # Published date
+        if min_year:
+            min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
+            ans.pubdate = min_date
+        else:
+            min_date = datetime(9999, 1, 1, tzinfo=utc_tz)
+            for r in results:
+                if r.pubdate is not None and r.pubdate < min_date:
+                    min_date = r.pubdate
+            if min_date.year < 9999:
+                ans.pubdate = min_date
+
+        # Identifiers
+        for r in results:
+            ans.identifiers.update(r.identifiers)
+
+        # Merge any other fields with no special handling (random merge)
+        touched_fields = set()
+        for r in results:
+            touched_fields |= r.plugin.touched_fields
+
+        for f in touched_fields:
+            if f.startswith('identifier:') or not ans.is_null(f):
+                continue
+            setattr(ans, f, self.random_merge(f, results,
+                null_value=getattr(ans, f)))
+
+        avg = [x.relevance_in_source for x in results]
+        avg = sum(avg)/float(len(avg))
+        ans.average_source_relevance = avg
+
+        return ans
-
-        if not self.pool_has_result_from_same_source(pool, result):
-            pool[1].append(result)
 
 
 def merge_identify_results(result_map, log):
+    isbn_merge = ISBNMerge()
     for plugin, results in result_map.iteritems():
         for result in results:
-            isbn = result.isbn
-            if isbn:
-                isbns, min_year = xisbn.get_isbn_pool(isbn)
+            isbn_merge.add_result(result)
+
+    return isbn_merge.finalize()
+
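
The heart of the new code is ISBNMerge.add_result(): a result that carries an ISBN goes into a pool keyed by the frozenset of related ISBNs reported by xisbn.get_isbn_pool(), so different editions of the same book collapse into a single record, and each pool keeps at most one result per source; ISBN-less results are set aside and used only when no pool produced anything. Below is a minimal standalone sketch of that pooling idea; SimpleResult and the hard-coded RELATED map are illustrative stand-ins, not calibre APIs.

# Standalone sketch of the ISBN pooling idea (not calibre code). The
# RELATED map is a hypothetical stand-in for xisbn.get_isbn_pool().
class SimpleResult(object):
    def __init__(self, isbn, title, source):
        self.isbn, self.title, self.source = isbn, title, source

# Pretend these two ISBNs are known to be editions of the same book
RELATED = {'0596001886': frozenset(['0596001886', '0596158068']),
           '0596158068': frozenset(['0596001886', '0596158068'])}

pools = {}

def add_result(result):
    pool = None
    for isbns, candidate in pools.items():
        if result.isbn in isbns:
            pool = candidate
            break
    if pool is None:
        isbns = RELATED.get(result.isbn, frozenset([result.isbn]))
        pools[isbns] = pool = []
    # keep at most one result per source in each pool
    if all(r.source != result.source for r in pool):
        pool.append(result)

add_result(SimpleResult('0596001886', 'Python Cookbook', 'amazon'))
add_result(SimpleResult('0596158068', 'Python Cookbook, 2nd Ed.', 'openlibrary'))
assert len(pools) == 1          # both editions landed in the same pool
assert len(list(pools.values())[0]) == 2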
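
ISBNMerge.merge() then combines the results in a pool field by field with a length heuristic: the shortest non-null value wins where extra length usually means cruft (title, publisher, tags, language), and the longest wins where extra length usually means more information (authors, series, comments). A self-contained sketch of that policy, with the hypothetical class R standing in for calibre's Metadata result objects:

# Self-contained sketch of the length-based merge heuristic. R is a
# hypothetical stand-in for calibre's Metadata result objects.
class R(object):
    def __init__(self, **kw):
        self.__dict__.update(kw)

    def is_null(self, attr):
        return not getattr(self, attr, None)

def length_merge(attr, results, null_value=None, shortest=True):
    values = [getattr(x, attr) for x in results if not x.is_null(attr)]
    values = [x for x in values if len(x) > 0]
    if not values:
        return null_value
    values.sort(key=len, reverse=not shortest)
    return values[0]

results = [R(title='Dune (Ace Special 25th Anniversary Edition)',
             authors=['Frank Herbert']),
           R(title='Dune', authors=['Frank Herbert', 'Brian Herbert'])]
assert length_merge('title', results) == 'Dune'                     # shortest wins
assert len(length_merge('authors', results, shortest=False)) == 2  # longest wins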
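
For the published date, merge() trusts the minimum year from the xisbn pool when there is one, and otherwise takes the earliest pubdate seen across the results, using a far-future sentinel (year 9999, the largest datetime allows) to detect the all-missing case. A rough sketch of that rule; the UTC class below is only a stand-in for calibre's utc_tz:

# Sketch of the earliest-publication-date rule. UTC stands in for
# calibre's utc_tz; results without a pubdate are skipped.
from datetime import datetime, timedelta, tzinfo

class UTC(tzinfo):
    def utcoffset(self, dt): return timedelta(0)
    def tzname(self, dt): return 'UTC'
    def dst(self, dt): return timedelta(0)

utc = UTC()
pubdates = [datetime(1996, 5, 1, tzinfo=utc), datetime(1965, 8, 1, tzinfo=utc)]

min_date = datetime(9999, 1, 1, tzinfo=utc)   # sentinel: later than any real date
for d in pubdates:
    if d is not None and d < min_date:
        min_date = d
if min_date.year < 9999:                      # at least one real date was seen
    print(min_date.year)                      # -> 1965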
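
Finally, identify() post-processes the merged results according to the three new msprefs tunables added in base.py: txt_comments flattens HTML comments to plain text (for plugins that declare has_html_comments), ignore_fields resets unwanted fields to the null values of a dummy Metadata(_('Unknown')) object, and max_tags caps the tag list. A rough sketch under simplified assumptions: FakeResult is a made-up stand-in, None replaces the dummy-Metadata null values, and a crude regex replaces calibre's html2text():

# Rough sketch of the identify() post-processing. FakeResult stands in
# for a merged Metadata result; calibre uses html2text() and a dummy
# Metadata(_('Unknown')) object where this sketch uses a regex and None.
import re

class FakeResult(object):
    def __init__(self):
        self.comments = '<p>A <b>great</b> read.</p>'
        self.series = 'Some Series'
        self.tags = ['sf', 'space opera', 'classic', 'hugo winner']

prefs = {'txt_comments': True, 'ignore_fields': ['series'], 'max_tags': 2}
r = FakeResult()

if prefs['txt_comments']:
    r.comments = re.sub(r'<[^>]+>', '', r.comments)   # calibre: html2text()
for f in prefs['ignore_fields']:
    setattr(r, f, None)                   # calibre: value from dummy Metadata
r.tags = r.tags[:prefs['max_tags']]

assert r.comments == 'A great read.'
assert r.series is None
assert r.tags == ['sf', 'space opera']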