F-Secure by louhike and more work on the new metadata download system

Kovid Goyal 2011-04-04 08:02:28 -06:00
parent 492d16e5c9
commit 6be7471d2e
4 changed files with 186 additions and 15 deletions

recipes/f_secure.recipe (new file)

@@ -0,0 +1,22 @@
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1301860159(BasicNewsRecipe):
    title                 = u'F-Secure Weblog'
    __author__            = 'louhike'
    description           = u'All the news from the weblog of F-Secure'
    publisher             = u'F-Secure'
    language              = 'en'
    timefmt               = ' [%a, %d %b, %Y]'
    encoding              = 'ISO-8859-1'
    oldest_article        = 7
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    remove_javascript     = True

    keep_only_tags = [dict(name='div', attrs={'class':'modSectionTd2'})]
    remove_tags    = [dict(name='a'), dict(name='hr')]

    feeds = [(u'Weblog', u'http://www.f-secure.com/weblog/weblog.rss')]

    def get_cover_url(self):
        return 'http://www.f-secure.com/weblog/archives/images/company_logo.png'
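A contributed recipe like this can be sanity-checked from the command line before committing: calibre's converter accepts .recipe files directly, e.g. ebook-convert f_secure.recipe out.epub --test, where --test limits the fetch to a couple of articles from at most two feeds so a broken selector shows up quickly.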

src/calibre/ebooks/metadata/sources/amazon.py

@@ -282,6 +282,7 @@ class Amazon(Source):
    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
        'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate'])
    has_html_comments = True

    AMAZON_DOMAINS = {
            'com': _('US'),
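Setting has_html_comments here matters because of the identify() change in the last hunk below: when the txt_comments preference is on, comments from plugins that declare this flag are downgraded with calibre's html2text. A minimal sketch of that conversion (the sample markup is invented, not from this commit):

    from calibre.utils.html2text import html2text

    # Invented sample of the HTML-formatted review text a plugin with
    # has_html_comments = True might return.
    comments = '<p>A <b>gripping</b> read.<br/>Highly recommended.</p>'

    # identify() applies exactly this downgrade when msprefs['txt_comments']
    # is True; the result is plain text suitable for text-only output formats.
    print(html2text(comments))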

src/calibre/ebooks/metadata/sources/base.py

@@ -18,6 +18,9 @@ from calibre.utils.titlecase import titlecase
from calibre.ebooks.metadata import check_isbn

msprefs = JSONConfig('metadata_sources.json')
msprefs.defaults['txt_comments'] = False
msprefs.defaults['ignore_fields'] = []
msprefs.defaults['max_tags'] = 10

def create_log(ostream=None):
    log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
@@ -104,6 +107,9 @@ class Source(Plugin):
    #: during the identify phase
    touched_fields = frozenset()

    #: Set this to True if your plugin returns HTML formatted comments
    has_html_comments = False

    def __init__(self, *args, **kwargs):
        Plugin.__init__(self, *args, **kwargs)
        self._isbn_to_identifier_cache = {}
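msprefs is a JSONConfig, calibre's dict-like store persisted to metadata_sources.json in the config directory, so the new preferences can be tweaked by plain item assignment; absent keys fall back to the defaults registered above. A small sketch (the chosen values are illustrative only):

    from calibre.ebooks.metadata.sources.base import msprefs

    # JSONConfig acts like a dict; assignments are written back to
    # metadata_sources.json and survive across runs.
    msprefs['txt_comments'] = True       # downgrade HTML comments to text
    msprefs['ignore_fields'] = ['tags']  # blank these fields on all results
    msprefs['max_tags'] = 5              # cap merged tag lists

    print(msprefs['max_tags'])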

src/calibre/ebooks/metadata/sources/identify.py

@@ -8,13 +8,18 @@ __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import time
from datetime import datetime
from Queue import Queue, Empty
from threading import Thread
from io import BytesIO
from operator import attrgetter

from calibre.customize.ui import metadata_plugins
-from calibre.ebooks.metadata.sources.base import create_log
+from calibre.ebooks.metadata.sources.base import create_log, msprefs
from calibre.ebooks.metadata.xisbn import xisbn
from calibre.ebooks.metadata.book.base import Metadata
from calibre.utils.date import utc_tz
from calibre.utils.html2text import html2text

# How long to wait for more results after first result is found
WAIT_AFTER_FIRST_RESULT = 30 # seconds
@@ -117,14 +122,30 @@ def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
    log('Merging results from different sources and finding earliest',
        'publication dates')
    start_time = time.time()
-    merged_results = merge_identify_results(results, log)
+    results = merge_identify_results(results, log)
    log('We have %d merged results, merging took: %.2f seconds' %
-            (len(merged_results), time.time() - start_time))
+            (len(results), time.time() - start_time))

    if msprefs['txt_comments']:
        for r in results:
            if r.plugin.has_html_comments and r.comments:
                r.comments = html2text(r.comments)

    dummy = Metadata(_('Unknown'))
    max_tags = msprefs['max_tags']
    for f in msprefs['ignore_fields']:
        for r in results:
            setattr(r, f, getattr(dummy, f))
    for r in results:
        r.tags = r.tags[:max_tags]

    return results


class ISBNMerge(object):

    def __init__(self):
        self.pools = {}
        self.isbnless_results = []

    def isbn_in_pool(self, isbn):
        if isbn:
@@ -140,7 +161,9 @@ class ISBNMerge(object):
                return True
        return False

-    def add_result(self, result, isbn):
+    def add_result(self, result):
+        isbn = result.isbn
        if isbn:
            pool = self.isbn_in_pool(isbn)
            if pool is None:
                isbns, min_year = xisbn.get_isbn_pool(isbn)

@@ -150,12 +173,131 @@ class ISBNMerge(object):
            if not self.pool_has_result_from_same_source(pool, result):
                pool[1].append(result)
        else:
            self.isbnless_results.append(result)
    def finalize(self):
        has_isbn_result = False
        for results in self.pools.itervalues():
            if results:
                has_isbn_result = True
                break
        self.has_isbn_result = has_isbn_result
        if has_isbn_result:
            self.merge_isbn_results()
        else:
            self.results = sorted(self.isbnless_results,
                key=attrgetter('relevance_in_source'))
        return self.results

    def merge_isbn_results(self):
        self.results = []
        for min_year, results in self.pools.itervalues():
            if results:
                self.results.append(self.merge(results, min_year))
        self.results.sort(key=attrgetter('average_source_relevance'))

    def length_merge(self, attr, results, null_value=None, shortest=True):
        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
        values = [x for x in values if len(x) > 0]
        if not values:
            return null_value
        values.sort(key=len, reverse=not shortest)
        return values[0]

    def random_merge(self, attr, results, null_value=None):
        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
        return values[0] if values else null_value
    def merge(self, results, min_year):
        ans = Metadata(_('Unknown'))

        # We assume the shortest title has the least cruft in it
        ans.title = self.length_merge('title', results, null_value=ans.title)

        # No harm in having extra authors, maybe something useful like an
        # editor or translator
        ans.authors = self.length_merge('authors', results,
                null_value=ans.authors, shortest=False)

        # We assume the shortest publisher has the least cruft in it
        ans.publisher = self.length_merge('publisher', results,
                null_value=ans.publisher)

        # We assume the smallest set of tags has the least cruft in it
        ans.tags = self.length_merge('tags', results,
                null_value=ans.tags)

        # We assume the longest series has the most info in it
        ans.series = self.length_merge('series', results,
                null_value=ans.series, shortest=False)
        for r in results:
            if r.series and r.series == ans.series:
                ans.series_index = r.series_index
                break

        # Average the rating over all sources
        ratings = []
        for r in results:
            rating = r.rating
            if rating and rating > 0 and rating <= 5:
                ratings.append(rating)
        if ratings:
            ans.rating = sum(ratings)/len(ratings)

        # Smallest language is likely to be valid
        ans.language = self.length_merge('language', results,
                null_value=ans.language)

        # Choose longest comments
        ans.comments = self.length_merge('comments', results,
                null_value=ans.comments, shortest=False)

        # Published date
        if min_year:
            min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
            ans.pubdate = min_date
        else:
            min_date = datetime(9999, 1, 1, tzinfo=utc_tz)
            for r in results:
                if r.pubdate is not None and r.pubdate < min_date:
                    min_date = r.pubdate
            if min_date.year < 9999:
                ans.pubdate = min_date

        # Identifiers
        for r in results:
            ans.identifiers.update(r.identifiers)

        # Merge any other fields with no special handling (random merge)
        touched_fields = set()
        for r in results:
            touched_fields |= r.plugin.touched_fields
        for f in touched_fields:
            if f.startswith('identifier:') or not ans.is_null(f):
                continue
            setattr(ans, f, self.random_merge(f, results,
                null_value=getattr(ans, f)))

        avg = [x.relevance_in_source for x in results]
        avg = sum(avg)/len(avg)
        ans.average_source_relevance = avg

        return ans
def merge_identify_results(result_map, log):
    isbn_merge = ISBNMerge()
    for plugin, results in result_map.iteritems():
        for result in results:
-            isbn = result.isbn
-            if isbn:
-                isbns, min_year = xisbn.get_isbn_pool(isbn)
-                isbn_merge.add_result(result, isbn)
+            isbn_merge.add_result(result)

    return isbn_merge.finalize()
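To make the merge policy concrete, here is a self-contained sketch of the length-based heuristics, using a hypothetical SimpleResult class in place of calibre's Metadata so it runs without calibre installed:

    # SimpleResult is a hypothetical stand-in for one plugin's result.
    class SimpleResult(object):
        def __init__(self, title, authors, tags):
            self.title, self.authors, self.tags = title, authors, tags

    def length_merge(attr, results, shortest=True):
        # Same idea as ISBNMerge.length_merge: keep the shortest (or
        # longest) non-empty value seen across sources.
        values = [getattr(r, attr) for r in results if getattr(r, attr)]
        values.sort(key=len, reverse=not shortest)
        return values[0] if values else None

    a = SimpleResult('The Hobbit: 75th Anniversary Edition',
            ['J. R. R. Tolkien'], ['Fantasy', 'Classics', 'Adventure'])
    b = SimpleResult('The Hobbit',
            ['J. R. R. Tolkien', 'Christopher Tolkien (editor)'],
            ['Fantasy'])

    print(length_merge('title', [a, b]))     # shortest title: 'The Hobbit'
    print(length_merge('authors', [a, b], shortest=False))  # longer author list
    print(length_merge('tags', [a, b]))      # smallest tag set: ['Fantasy']

The same shortest/longest switch drives the publisher, series, language and comments choices in merge() above.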