From 6be7471d2e7d93793de6e25e7e9222cb82b49cc4 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 4 Apr 2011 08:02:28 -0600
Subject: [PATCH] F-Secure by louhike and more work on the new metadata download system

---
 recipes/f_secure.recipe                       |  22 +++
 src/calibre/ebooks/metadata/sources/amazon.py |   1 +
 src/calibre/ebooks/metadata/sources/base.py   |   6 +
 .../ebooks/metadata/sources/identify.py       | 172 ++++++++++++++++--
 4 files changed, 186 insertions(+), 15 deletions(-)
 create mode 100644 recipes/f_secure.recipe

diff --git a/recipes/f_secure.recipe b/recipes/f_secure.recipe
new file mode 100644
index 0000000000..f276a4961a
--- /dev/null
+++ b/recipes/f_secure.recipe
@@ -0,0 +1,22 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1301860159(BasicNewsRecipe):
+    title = u'F-Secure Weblog'
+    language = 'en'
+    __author__ = 'louhike'
+    description = u'All the news from the weblog of F-Secure'
+    publisher = u'F-Secure'
+    timefmt = ' [%a, %d %b, %Y]'
+    encoding = 'ISO-8859-1'
+    oldest_article = 7
+    max_articles_per_feed = 100
+    no_stylesheets = True
+    use_embedded_content = False
+    remove_javascript = True
+    keep_only_tags = [dict(name='div', attrs={'class':'modSectionTd2'})]
+    remove_tags = [dict(name='a'), dict(name='hr')]
+
+    feeds = [(u'Weblog', u'http://www.f-secure.com/weblog/weblog.rss')]
+
+    def get_cover_url(self):
+        return 'http://www.f-secure.com/weblog/archives/images/company_logo.png'
diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index cfa2b09ea8..9334d818ec 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -282,6 +282,7 @@ class Amazon(Source):
     capabilities = frozenset(['identify', 'cover'])
     touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
         'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate'])
+    has_html_comments = True
 
     AMAZON_DOMAINS = {
         'com': _('US'),
diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index 7cc4ed3518..08012c3ee8 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -18,6 +18,9 @@ from calibre.utils.titlecase import titlecase
 from calibre.ebooks.metadata import check_isbn
 
 msprefs = JSONConfig('metadata_sources.json')
+msprefs.defaults['txt_comments'] = False
+msprefs.defaults['ignore_fields'] = []
+msprefs.defaults['max_tags'] = 10
 
 def create_log(ostream=None):
     log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
@@ -104,6 +107,9 @@ class Source(Plugin):
     #: during the identify phase
     touched_fields = frozenset()
 
+    #: Set this to True if your plugin returns HTML formatted comments
+    has_html_comments = False
+
     def __init__(self, *args, **kwargs):
         Plugin.__init__(self, *args, **kwargs)
         self._isbn_to_identifier_cache = {}
diff --git a/src/calibre/ebooks/metadata/sources/identify.py b/src/calibre/ebooks/metadata/sources/identify.py
index 1d4d8840e8..ab86e8ffa2 100644
--- a/src/calibre/ebooks/metadata/sources/identify.py
+++ b/src/calibre/ebooks/metadata/sources/identify.py
@@ -8,13 +8,18 @@ __copyright__ = '2011, Kovid Goyal '
 __docformat__ = 'restructuredtext en'
 
 import time
+from datetime import datetime
 from Queue import Queue, Empty
 from threading import Thread
 from io import BytesIO
+from operator import attrgetter
 
 from calibre.customize.ui import metadata_plugins
-from calibre.ebooks.metadata.sources.base import create_log
+from calibre.ebooks.metadata.sources.base import create_log, msprefs
 from calibre.ebooks.metadata.xisbn import xisbn
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.utils.date import utc_tz
+from calibre.utils.html2text import html2text
 
 # How long to wait for more results after first result is found
 WAIT_AFTER_FIRST_RESULT = 30 # seconds
@@ -117,14 +122,30 @@ def identify(log, abort, title=None, authors=None, identifiers=[], timeout=30):
     log('Merging results from different sources and finding earliest',
             'publication dates')
     start_time = time.time()
-    merged_results = merge_identify_results(results, log)
+    results = merge_identify_results(results, log)
     log('We have %d merged results, merging took: %.2f seconds' %
-            (len(merged_results), time.time() - start_time))
+            (len(results), time.time() - start_time))
+
+    if msprefs['txt_comments']:
+        for r in results:
+            if r.plugin.has_html_comments and r.comments:
+                r.comments = html2text(r.comments)
+
+    dummy = Metadata(_('Unknown'))
+    for f in msprefs['ignore_fields']:
+        for r in results:
+            setattr(r, f, getattr(dummy, f))
+    for r in results:
+        r.tags = r.tags[:msprefs['max_tags']]
+
+    return results
+
 
 class ISBNMerge(object):
 
     def __init__(self):
         self.pools = {}
+        self.isbnless_results = []
 
     def isbn_in_pool(self, isbn):
         if isbn:
@@ -140,22 +161,143 @@
                 return True
         return False
 
-    def add_result(self, result, isbn):
-        pool = self.isbn_in_pool(isbn)
-        if pool is None:
-            isbns, min_year = xisbn.get_isbn_pool(isbn)
-            if not isbns:
-                isbns = frozenset([isbn])
-            self.pool[isbns] = pool = (min_year, [])
+    def add_result(self, result):
+        isbn = result.isbn
+        if isbn:
+            pool = self.isbn_in_pool(isbn)
+            if pool is None:
+                isbns, min_year = xisbn.get_isbn_pool(isbn)
+                if not isbns:
+                    isbns = frozenset([isbn])
+                self.pools[isbns] = pool = (min_year, [])
+
+            if not self.pool_has_result_from_same_source(pool, result):
+                pool[1].append(result)
+        else:
+            self.isbnless_results.append(result)
+
+    def finalize(self):
+        has_isbn_result = False
+        for min_year, results in self.pools.itervalues():
+            if results:
+                has_isbn_result = True
+                break
+        self.has_isbn_result = has_isbn_result
+
+        if has_isbn_result:
+            self.merge_isbn_results()
+        else:
+            self.results = sorted(self.isbnless_results,
+                key=attrgetter('relevance_in_source'))
+
+        return self.results
+
+    def merge_isbn_results(self):
+        self.results = []
+        for min_year, results in self.pools.itervalues():
+            if results:
+                self.results.append(self.merge(results, min_year))
+
+        self.results.sort(key=attrgetter('average_source_relevance'))
+
+    def length_merge(self, attr, results, null_value=None, shortest=True):
+        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
+        values = [x for x in values if len(x) > 0]
+        if not values:
+            return null_value
+        values.sort(key=len, reverse=not shortest)
+        return values[0]
+
+    def random_merge(self, attr, results, null_value=None):
+        values = [getattr(x, attr) for x in results if not x.is_null(attr)]
+        return values[0] if values else null_value
+
+    def merge(self, results, min_year):
+        ans = Metadata(_('Unknown'))
+
+        # We assume the shortest title has the least cruft in it
+        ans.title = self.length_merge('title', results, null_value=ans.title)
+
+        # No harm in having extra authors, maybe something useful like an
+        # editor or translator
+        ans.authors = self.length_merge('authors', results,
+                null_value=ans.authors, shortest=False)
+
+        # We assume the shortest publisher has the least cruft in it
+        ans.publisher = self.length_merge('publisher', results,
+                null_value=ans.publisher)
+
+        # We assume the smallest set of tags has the least cruft in it
+        ans.tags = self.length_merge('tags', results,
+                null_value=ans.tags)
+
+        # We assume the longest series has the most info in it
+        ans.series = self.length_merge('series', results,
+                null_value=ans.series, shortest=False)
+        for r in results:
+            if r.series and r.series == ans.series:
+                ans.series_index = r.series_index
+                break
+
+        # Average the rating over all sources
+        ratings = []
+        for r in results:
+            rating = r.rating
+            if rating and rating > 0 and rating <= 5:
+                ratings.append(rating)
+        if ratings:
+            ans.rating = sum(ratings)/float(len(ratings))
+
+        # Smallest language is likely to be valid
+        ans.language = self.length_merge('language', results,
+                null_value=ans.language)
+
+        # Choose longest comments
+        ans.comments = self.length_merge('comments', results,
+                null_value=ans.comments, shortest=False)
+
+        # Published date
+        if min_year:
+            min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
+            ans.pubdate = min_date
+        else:
+            min_date = datetime(9999, 1, 1, tzinfo=utc_tz)
+            for r in results:
+                if r.pubdate is not None and r.pubdate < min_date:
+                    min_date = r.pubdate
+            if min_date.year < 9999:
+                ans.pubdate = min_date
+
+        # Identifiers
+        for r in results:
+            ans.identifiers.update(r.identifiers)
+
+        # Merge any other fields with no special handling (random merge)
+        touched_fields = set()
+        for r in results:
+            touched_fields |= r.plugin.touched_fields
+
+        for f in touched_fields:
+            if f.startswith('identifier:') or not ans.is_null(f):
+                continue
+            setattr(ans, f, self.random_merge(f, results,
+                null_value=getattr(ans, f)))
+
+        avg = [x.relevance_in_source for x in results]
+        avg = sum(avg)/float(len(avg))
+        ans.average_source_relevance = avg
+
+        return ans
-
-        if not self.pool_has_result_from_same_source(pool, result):
-            pool[1].append(result)
 
 
 def merge_identify_results(result_map, log):
+    isbn_merge = ISBNMerge()
     for plugin, results in result_map.iteritems():
         for result in results:
-            isbn = result.isbn
-            if isbn:
-                isbns, min_year = xisbn.get_isbn_pool(isbn)
+            isbn_merge.add_result(result)
+
+    return isbn_merge.finalize()
+
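
The heart of the new code is ISBNMerge.add_result(): a result that carries an ISBN goes into a pool keyed by the frozenset of related ISBNs reported by xisbn.get_isbn_pool(), so different editions of the same book collapse into a single record, and each pool keeps at most one result per source; ISBN-less results are set aside and used only when no pool produced anything. Below is a minimal standalone sketch of that pooling idea; SimpleResult and the hard-coded RELATED map are illustrative stand-ins, not calibre APIs.

# Standalone sketch of the ISBN pooling idea (not calibre code). The
# RELATED map is a hypothetical stand-in for xisbn.get_isbn_pool().
class SimpleResult(object):
    def __init__(self, isbn, title, source):
        self.isbn, self.title, self.source = isbn, title, source

# Pretend these two ISBNs are known to be editions of the same book
RELATED = {'0596001886': frozenset(['0596001886', '0596158068']),
           '0596158068': frozenset(['0596001886', '0596158068'])}

pools = {}

def add_result(result):
    pool = None
    for isbns, candidate in pools.items():
        if result.isbn in isbns:
            pool = candidate
            break
    if pool is None:
        isbns = RELATED.get(result.isbn, frozenset([result.isbn]))
        pools[isbns] = pool = []
    # keep at most one result per source in each pool
    if all(r.source != result.source for r in pool):
        pool.append(result)

add_result(SimpleResult('0596001886', 'Python Cookbook', 'amazon'))
add_result(SimpleResult('0596158068', 'Python Cookbook, 2nd Ed.', 'openlibrary'))
assert len(pools) == 1          # both editions landed in the same pool
assert len(list(pools.values())[0]) == 2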
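
ISBNMerge.merge() then combines the results in a pool field by field with a length heuristic: the shortest non-null value wins where extra length usually means cruft (title, publisher, tags, language), and the longest wins where extra length usually means more information (authors, series, comments). A self-contained sketch of that policy, with the hypothetical class R standing in for calibre's Metadata result objects:

# Self-contained sketch of the length-based merge heuristic. R is a
# hypothetical stand-in for calibre's Metadata result objects.
class R(object):
    def __init__(self, **kw):
        self.__dict__.update(kw)

    def is_null(self, attr):
        return not getattr(self, attr, None)

def length_merge(attr, results, null_value=None, shortest=True):
    values = [getattr(x, attr) for x in results if not x.is_null(attr)]
    values = [x for x in values if len(x) > 0]
    if not values:
        return null_value
    values.sort(key=len, reverse=not shortest)
    return values[0]

results = [R(title='Dune (Ace Special 25th Anniversary Edition)',
             authors=['Frank Herbert']),
           R(title='Dune', authors=['Frank Herbert', 'Brian Herbert'])]
assert length_merge('title', results) == 'Dune'                     # shortest wins
assert len(length_merge('authors', results, shortest=False)) == 2  # longest wins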
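
For the published date, merge() trusts the minimum year from the xisbn pool when there is one, and otherwise takes the earliest pubdate seen across the results, using a far-future sentinel (year 9999, the largest datetime allows) to detect the all-missing case. A rough sketch of that rule; the UTC class below is only a stand-in for calibre's utc_tz:

# Sketch of the earliest-publication-date rule. UTC stands in for
# calibre's utc_tz; results without a pubdate are skipped.
from datetime import datetime, timedelta, tzinfo

class UTC(tzinfo):
    def utcoffset(self, dt): return timedelta(0)
    def tzname(self, dt): return 'UTC'
    def dst(self, dt): return timedelta(0)

utc = UTC()
pubdates = [datetime(1996, 5, 1, tzinfo=utc), datetime(1965, 8, 1, tzinfo=utc)]

min_date = datetime(9999, 1, 1, tzinfo=utc)   # sentinel: later than any real date
for d in pubdates:
    if d is not None and d < min_date:
        min_date = d
if min_date.year < 9999:                      # at least one real date was seen
    print(min_date.year)                      # -> 1965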
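
Finally, identify() post-processes the merged results according to the three new msprefs tunables added in base.py: txt_comments flattens HTML comments to plain text (for plugins that declare has_html_comments), ignore_fields resets unwanted fields to the null values of a dummy Metadata(_('Unknown')) object, and max_tags caps the tag list. A rough sketch under simplified assumptions: FakeResult is a made-up stand-in, None replaces the dummy-Metadata null values, and a crude regex replaces calibre's html2text():

# Rough sketch of the identify() post-processing. FakeResult stands in
# for a merged Metadata result; calibre uses html2text() and a dummy
# Metadata(_('Unknown')) object where this sketch uses a regex and None.
import re

class FakeResult(object):
    def __init__(self):
        self.comments = '<p>A <b>great</b> read.</p>'
        self.series = 'Some Series'
        self.tags = ['sf', 'space opera', 'classic', 'hugo winner']

prefs = {'txt_comments': True, 'ignore_fields': ['series'], 'max_tags': 2}
r = FakeResult()

if prefs['txt_comments']:
    r.comments = re.sub(r'<[^>]+>', '', r.comments)   # calibre: html2text()
for f in prefs['ignore_fields']:
    setattr(r, f, None)                   # calibre: value from dummy Metadata
r.tags = r.tags[:prefs['max_tags']]

assert r.comments == 'A great read.'
assert r.series is None
assert r.tags == ['sf', 'space opera']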