F-Secure by louhike and more work on the new metadata download system

Kovid Goyal 2011-04-04 08:02:28 -06:00
parent 492d16e5c9
commit 6be7471d2e
4 changed files with 186 additions and 15 deletions

recipes/f_secure.recipe (new file)

@@ -0,0 +1,22 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1301860159(BasicNewsRecipe):
+    title = u'F-Secure Weblog'
+    language = 'en'
+    __author__ = 'louhike'
+    description = u'All the news from the weblog of F-Secure'
+    publisher = u'F-Secure'
+    timefmt = ' [%a, %d %b, %Y]'
+    encoding = 'ISO-8859-1'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    language = 'en_EN'
+    remove_javascript = True
+    keep_only_tags = [dict(name='div', attrs={'class':'modSectionTd2'})]
+    remove_tags = [dict(name='a'),dict(name='hr')]
+
+    feeds = [(u'Weblog', u'http://www.f-secure.com/weblog/weblog.rss')]
+    def get_cover_url(self):
+        return 'http://www.f-secure.com/weblog/archives/images/company_logo.png'
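
Editor's note: in calibre recipes, keep_only_tags selects the article container and remove_tags strips unwanted elements from it. A minimal standalone sketch of what those two settings amount to, assuming BeautifulSoup 4 rather than calibre's actual download pipeline (clean_article is an invented helper name):

from bs4 import BeautifulSoup

def clean_article(html):
    # keep_only_tags: keep just the article container div
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.find('div', attrs={'class': 'modSectionTd2'})
    if body is None:
        return html
    # remove_tags: drop links and horizontal rules inside it
    for tag in body.find_all(['a', 'hr']):
        tag.decompose()
    return str(body)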

src/calibre/ebooks/metadata/sources/amazon.py

@@ -282,6 +282,7 @@ class Amazon(Source):
     capabilities = frozenset(['identify', 'cover'])
     touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
         'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate'])
+    has_html_comments = True
 
     AMAZON_DOMAINS = {
         'com': _('US'),

src/calibre/ebooks/metadata/sources/base.py

@@ -18,6 +18,9 @@ from calibre.utils.titlecase import titlecase
 from calibre.ebooks.metadata import check_isbn
 
 msprefs = JSONConfig('metadata_sources.json')
+msprefs.defaults['txt_comments'] = False
+msprefs.defaults['ignore_fields'] = []
+msprefs.defaults['max_tags'] = 10
 
 def create_log(ostream=None):
     log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
@@ -104,6 +107,9 @@ class Source(Plugin):
     #: during the identify phase
     touched_fields = frozenset()
 
+    #: Set this to True if your plugin returns HTML formatted comments
+    has_html_comments = False
+
     def __init__(self, *args, **kwargs):
         Plugin.__init__(self, *args, **kwargs)
         self._isbn_to_identifier_cache = {}
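
Editor's note: these two additions define the contract between plugins and the new merge code: a plugin declares whether its comments are HTML, and users control conversion and field filtering through msprefs. A hedged sketch of how a third-party source might use this; MySource and its attribute values are hypothetical, while Source, msprefs, and the preference keys come from the diff above:

from calibre.ebooks.metadata.sources.base import Source, msprefs

class MySource(Source):  # hypothetical plugin, not part of calibre
    name = 'My Source'
    capabilities = frozenset(['identify'])
    touched_fields = frozenset(['title', 'authors', 'comments'])
    has_html_comments = True  # this source returns HTML comment fragments

# msprefs is a JSONConfig, which behaves like a persistent dict:
msprefs['txt_comments'] = True         # convert HTML comments to plain text
msprefs['ignore_fields'] = ['rating']  # blank out rating in merged results
msprefs['max_tags'] = 5                # keep at most five tags per result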

src/calibre/ebooks/metadata/sources/identify.py

@@ -8,13 +8,18 @@ __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
 import time
+from datetime import datetime
 from Queue import Queue, Empty
 from threading import Thread
 from io import BytesIO
+from operator import attrgetter
 
 from calibre.customize.ui import metadata_plugins
-from calibre.ebooks.metadata.sources.base import create_log
+from calibre.ebooks.metadata.sources.base import create_log, msprefs
 from calibre.ebooks.metadata.xisbn import xisbn
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.utils.date import utc_tz
+from calibre.utils.html2text import html2text
 
 # How long to wait for more results after first result is found
 WAIT_AFTER_FIRST_RESULT = 30 # seconds
@@ -117,14 +122,30 @@ def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
     log('Merging results from different sources and finding earliest',
             'publication dates')
     start_time = time.time()
-    merged_results = merge_identify_results(results, log)
+    results = merge_identify_results(results, log)
     log('We have %d merged results, merging took: %.2f seconds' %
-            (len(merged_results), time.time() - start_time))
+            (len(results), time.time() - start_time))
+
+    if msprefs['txt_comments']:
+        for r in results:
+            if r.plugin.has_html_comments and r.comments:
+                r.comments = html2text(r.comments)
+
+    dummy = Metadata(_('Unknown'))
+    max_tags = msprefs['max_tags']
+    for f in msprefs['ignore_fields']:
+        for r in results:
+            setattr(r, f, getattr(dummy, f))
+            r.tags = r.tags[:max_tags]
+
+    return results
+
 
 class ISBNMerge(object):
 
     def __init__(self):
         self.pools = {}
+        self.isbnless_results = []
 
     def isbn_in_pool(self, isbn):
         if isbn:
@@ -140,7 +161,9 @@
                 return True
         return False
 
-    def add_result(self, result, isbn):
+    def add_result(self, result):
+        isbn = result.isbn
+        if isbn:
             pool = self.isbn_in_pool(isbn)
             if pool is None:
                 isbns, min_year = xisbn.get_isbn_pool(isbn)
@@ -150,12 +173,131 @@
             if not self.pool_has_result_from_same_source(pool, result):
                 pool[1].append(result)
+        else:
+            self.isbnless_results.append(result)
+
+    def finalize(self):
+        has_isbn_result = False
+        for results in self.pools.itervalues():
+            if results:
+                has_isbn_result = True
+                break
+        self.has_isbn_result = has_isbn_result
+
+        if has_isbn_result:
+            self.merge_isbn_results()
+        else:
+            self.results = sorted(self.isbnless_results,
+                key=attrgetter('relevance_in_source'))
+
+        return self.results
+
+    def merge_isbn_results(self):
+        self.results = []
+        for min_year, results in self.pools.itervalues():
+            if results:
+                self.results.append(self.merge(results, min_year))
+
+        self.results.sort(key=attrgetter('average_source_relevance'))
+
+    def length_merge(self, attr, results, null_value=None, shortest=True):
+        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
+        values = [x for x in values if len(x) > 0]
+        if not values:
+            return null_value
+        values.sort(key=len, reverse=not shortest)
+        return values[0]
+
+    def random_merge(self, attr, results, null_value=None):
+        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
+        return values[0] if values else null_value
+
+    def merge(self, results, min_year):
+        ans = Metadata(_('Unknown'))
+
+        # We assume the shortest title has the least cruft in it
+        ans.title = self.length_merge('title', results, null_value=ans.title)
+
+        # No harm in having extra authors, maybe something useful like an
+        # editor or translator
+        ans.authors = self.length_merge('authors', results,
+                null_value=ans.authors, shortest=False)
+
+        # We assume the shortest publisher has the least cruft in it
+        ans.publisher = self.length_merge('publisher', results,
+                null_value=ans.publisher)
+
+        # We assume the smallest set of tags has the least cruft in it
+        ans.tags = self.length_merge('tags', results,
+                null_value=ans.tags)
+
+        # We assume the longest series has the most info in it
+        ans.series = self.length_merge('series', results,
+                null_value=ans.series, shortest=False)
+        for r in results:
+            if r.series and r.series == ans.series:
+                ans.series_index = r.series_index
+                break
+
+        # Average the rating over all sources
+        ratings = []
+        for r in results:
+            rating = r.rating
+            if rating and rating > 0 and rating <= 5:
+                ratings.append(rating)
+        if ratings:
+            ans.rating = sum(ratings)/len(ratings)
+
+        # Smallest language is likely to be valid
+        ans.language = self.length_merge('language', results,
+                null_value=ans.language)
+
+        # Choose longest comments
+        ans.comments = self.length_merge('comments', results,
+                null_value=ans.comments, shortest=False)
+
+        # Published date (datetime caps at year 9999, so use that as the
+        # "no date found" sentinel)
+        if min_year:
+            min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
+            ans.pubdate = min_date
+        else:
+            min_date = datetime(9999, 1, 1, tzinfo=utc_tz)
+            for r in results:
+                if r.pubdate is not None and r.pubdate < min_date:
+                    min_date = r.pubdate
+            if min_date.year < 9999:
+                ans.pubdate = min_date
+
+        # Identifiers
+        for r in results:
+            ans.identifiers.update(r.identifiers)
+
+        # Merge any other fields with no special handling (random merge)
+        touched_fields = set()
+        for r in results:
+            touched_fields |= r.plugin.touched_fields
+        for f in touched_fields:
+            if f.startswith('identifier:') or not ans.is_null(f):
+                continue
+            setattr(ans, f, self.random_merge(f, results,
+                null_value=getattr(ans, f)))
+
+        avg = [x.relevance_in_source for x in results]
+        avg = sum(avg)/len(avg)
+        ans.average_source_relevance = avg
+
+        return ans
+
 
 def merge_identify_results(result_map, log):
+    isbn_merge = ISBNMerge()
     for plugin, results in result_map.iteritems():
         for result in results:
-            isbn = result.isbn
-            if isbn:
-                isbns, min_year = xisbn.get_isbn_pool(isbn)
+            isbn_merge.add_result(result)
+
+    return isbn_merge.finalize()
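
Editor's note: the merge policy above is driven almost entirely by length_merge, which picks the shortest or longest non-null value for a field across all sources. A tiny self-contained illustration of that rule, using invented stand-in records rather than calibre's Metadata class:

class Rec(object):  # stand-in for a plugin result, not calibre's Metadata
    def __init__(self, title, authors):
        self.title = title
        self.authors = authors
    def is_null(self, attr):
        return getattr(self, attr) in (None, '', [])

def length_merge(attr, results, null_value=None, shortest=True):
    # Same logic as ISBNMerge.length_merge above, as a free function
    values = [getattr(x, attr) for x in results if not x.is_null(attr)]
    values = [x for x in values if len(x) > 0]
    if not values:
        return null_value
    values.sort(key=len, reverse=not shortest)
    return values[0]

recs = [Rec('War and Peace (Penguin Classics)', ['Leo Tolstoy']),
        Rec('War and Peace', ['Leo Tolstoy', 'Louise Maude (trans.)'])]
print(length_merge('title', recs))                   # shortest title wins
print(length_merge('authors', recs, shortest=False)) # longest author list wins

This is why the comments in merge() talk about "cruft": a shorter title is assumed to lack edition/marketing noise, while a longer author list may carry a useful editor or translator.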