mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

commit 74d1fb4c49
parent d37f302a0e

Initial implementation of relevance sorting of metadata identify results. Needs testing.
@@ -28,11 +28,12 @@ class Worker(Thread): # {{{
     Get book details from Amazon's book page in a separate thread
     '''
 
-    def __init__(self, url, result_queue, browser, log, timeout=20):
+    def __init__(self, url, result_queue, browser, log, relevance, plugin, timeout=20):
         Thread.__init__(self)
         self.daemon = True
         self.url, self.result_queue = url, result_queue
         self.log, self.timeout = log, timeout
+        self.relevance, self.plugin = relevance, plugin
         self.browser = browser.clone_browser()
         self.cover_url = self.amazon_id = self.isbn = None
 
@@ -161,6 +162,15 @@ class Worker(Thread): # {{{
         else:
             self.log.warning('Failed to find product description for url: %r'%self.url)
 
+        mi.source_relevance = self.relevance
+
+        if self.amazon_id:
+            if self.isbn:
+                self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
+            if self.cover_url:
+                self.plugin.cache_identifier_to_cover_url(self.amazon_id,
+                        self.cover_url)
+
         self.result_queue.put(mi)
 
     def parse_asin(self, root):
@@ -321,6 +331,20 @@ class Amazon(Source):
 
     # }}}
 
+    def get_cached_cover_url(self, identifiers):
+        url = None
+        asin = identifiers.get('amazon', None)
+        if asin is None:
+            asin = identifiers.get('asin', None)
+        if asin is None:
+            isbn = identifiers.get('isbn', None)
+            if isbn is not None:
+                asin = self.cached_isbn_to_identifier(isbn)
+        if asin is not None:
+            url = self.cached_identifier_to_cover_url(asin)
+
+        return url
+
     def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
             identifiers={}, timeout=30):
         '''
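For illustration, the lookup chain above falls back from an explicit 'amazon'/'asin' identifier to an ISBN that was previously mapped to an ASIN. A sketch of the behaviour, using hypothetical identifier values and assuming a constructed Amazon plugin instance (the cache_* methods are the ones this commit uses):

    # Hypothetical values, for illustration only:
    plugin.cache_isbn_to_identifier('9780441013593', 'B000EXAMPLE')
    plugin.cache_identifier_to_cover_url('B000EXAMPLE', 'http://example.com/c.jpg')

    # Each of these now returns the same cached URL:
    plugin.get_cached_cover_url({'amazon': 'B000EXAMPLE'})
    plugin.get_cached_cover_url({'asin': 'B000EXAMPLE'})
    plugin.get_cached_cover_url({'isbn': '9780441013593'})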
@@ -396,7 +420,8 @@ class Amazon(Source):
             log.error('No matches found with query: %r'%query)
             return
 
-        workers = [Worker(url, result_queue, br, log) for url in matches]
+        workers = [Worker(url, result_queue, br, log, i, self) for i, url in
+                enumerate(matches)]
 
         for w in workers:
             w.start()
@@ -414,14 +439,6 @@ class Amazon(Source):
             if not a_worker_is_alive:
                 break
 
-        for w in workers:
-            if w.amazon_id:
-                if w.isbn:
-                    self.cache_isbn_to_identifier(w.isbn, w.amazon_id)
-                if w.cover_url:
-                    self.cache_identifier_to_cover_url(w.amazon_id,
-                            w.cover_url)
-
         return None
     # }}}
 
@@ -21,6 +21,21 @@ def create_log(ostream=None):
     log.outputs = [FileStream(ostream)]
     return log
 
+words = ("the", "a", "an", "of", "and")
+prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words)))
+trailing_paren_pat = re.compile(r'\(.*\)$')
+whitespace_pat = re.compile(r'\s+')
+
+def cleanup_title(s):
+    if not s:
+        s = _('Unknown')
+    s = s.strip().lower()
+    s = prefix_pat.sub(' ', s)
+    s = trailing_paren_pat.sub('', s)
+    s = whitespace_pat.sub(' ', s)
+    return s.strip()
+
+
 class Source(Plugin):
 
     type = _('Metadata source')
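A worked example of the normalization cleanup_title performs (not part of the commit): lowercase, strip a leading article, drop a trailing parenthetical, collapse whitespace.

    cleanup_title('The  Time Machine (Unabridged)')  # -> 'time machine'
    cleanup_title(None)                              # -> 'unknown'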
@@ -128,10 +143,91 @@ class Source(Plugin):
             gr.append(job)
         return [g for g in groups if g]
 
+    def test_fields(self, mi):
+        '''
+        Return the first field from self.touched_fields that is null on the
+        mi object
+        '''
+        for key in self.touched_fields:
+            if key.startswith('identifier:'):
+                key = key.partition(':')[-1]
+                if not mi.has_identifier(key):
+                    return 'identifier: ' + key
+            elif mi.is_null(key):
+                return key
+
+
     # }}}
 
     # Metadata API {{{
 
+    def get_cached_cover_url(self, identifiers):
+        '''
+        Return cached cover URL for the book identified by
+        the identifiers dict or None if no such URL exists
+        '''
+        return None
+
+    def compare_identify_results(self, x, y, title=None, authors=None,
+            identifiers={}):
+        '''
+        Method used to sort the results from a call to identify by relevance.
+        Uses the actual query and various heuristics to rank results.
+        Re-implement in your plugin if this generic algorithm is not suitable.
+        Note that this method assumes x and y have a source_relevance
+        attribute.
+
+        one < two iff one is more relevant than two
+        '''
+        # First, guarantee that if the query specifies an ISBN, the result with
+        # the same isbn is the most relevant
+        def isbn_test(mi):
+            return mi.isbn and mi.isbn == identifiers.get('isbn', None)
+
+        def boolcmp(a, b):
+            return -1 if a and not b else 1 if not a and b else 0
+
+        x_has_isbn, y_has_isbn = isbn_test(x), isbn_test(y)
+        result = boolcmp(x_has_isbn, y_has_isbn)
+        if result != 0:
+            return result
+
+        # Now prefer results that have complete metadata over those that don't
+        x_has_all_fields = self.test_fields(x) is None
+        y_has_all_fields = self.test_fields(y) is None
+
+        result = boolcmp(x_has_all_fields, y_has_all_fields)
+        if result != 0:
+            return result
+
+        # Now prefer results whose title matches the search query
+        if title:
+            x_title = cleanup_title(x.title)
+            y_title = cleanup_title(y.title)
+            t = cleanup_title(title)
+            x_has_title, y_has_title = x_title == t, y_title == t
+            result = boolcmp(x_has_title, y_has_title)
+            if result != 0:
+                return result
+
+        # Now prefer results with the longer comments, within 10%
+        cx = len(x.comments.strip() if x.comments else '')
+        cy = len(y.comments.strip() if y.comments else '')
+        t = (cx + cy) / 20
+        result = cy - cx
+        if result != 0 and abs(cx - cy) > t:
+            return result
+
+        # Now prefer results with cached cover URLs
+        x_has_cover = self.get_cached_cover_url(x.identifiers) is not None
+        y_has_cover = self.get_cached_cover_url(y.identifiers) is not None
+        result = boolcmp(x_has_cover, y_has_cover)
+        if result != 0:
+            return result
+
+        # Now use the relevance reported by the remote search engine
+        return x.source_relevance - y.source_relevance
+
     def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=5):
         '''
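A sketch of how a caller might apply this comparator to rank identify results (plugin, results and the query parameters are hypothetical names here; cmp_to_key adapts the old-style two-argument comparator for key-based sorting):

    from functools import cmp_to_key, partial

    keyfunc = cmp_to_key(partial(plugin.compare_identify_results,
                                 title=title, authors=authors,
                                 identifiers=identifiers))
    results.sort(key=keyfunc)  # results[0] is now the most relevant match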
@@ -147,6 +243,15 @@ class Source(Plugin):
         the same ISBN/special identifier does not need to get the cover URL
         again. Use the caching API for this.
 
+        Every Metadata object put into result_queue by this method must have a
+        `source_relevance` attribute that is an integer indicating the order in
+        which the results were returned by the metadata source for this query.
+        This integer will be used by :meth:`compare_identify_results`. If the
+        order is unimportant, set it to zero for every result.
+
+        Make sure that any cover/isbn mapping information is cached before the
+        Metadata object is put into result_queue.
+
         :param log: A log object, use it to output debugging information/errors
         :param result_queue: A result Queue, results should be put into it.
                              Each result is a Metadata object
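A minimal sketch of what the source_relevance contract asks of an implementation (hypothetical plugin; fetch_results() and parse() are stand-ins, not calibre APIs):

    def identify(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}, timeout=5):
        for i, raw in enumerate(self.fetch_results(title, authors, timeout)):
            if abort.is_set():
                return
            mi = self.parse(raw)
            mi.source_relevance = i  # the order the source returned this result in
            # cache any cover/isbn mappings before queueing the result
            result_queue.put(mi)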
@@ -190,14 +190,15 @@ class GoogleBooks(Source):
         return raw and len(raw) > 17000 and raw[1:4] != 'PNG'
 
     def get_all_details(self, br, log, entries, abort, result_queue, timeout):
-        for i in entries:
+        for relevance, i in enumerate(entries):
             try:
                 ans = to_metadata(br, log, i, timeout)
                 if isinstance(ans, Metadata):
-                    result_queue.put(ans)
+                    ans.source_relevance = relevance
                     for isbn in getattr(ans, 'all_isbns', []):
                         self.cache_isbn_to_identifier(isbn,
                                 ans.identifiers['google'])
+                    result_queue.put(ans)
             except:
                 log.exception(
                     'Failed to get metadata for identify entry:',
@@ -46,15 +46,6 @@ def authors_test(authors):
 
     return test
 
-def _test_fields(touched_fields, mi):
-    for key in touched_fields:
-        if key.startswith('identifier:'):
-            key = key.partition(':')[-1]
-            if not mi.has_identifier(key):
-                return 'identifier: ' + key
-        elif mi.is_null(key):
-            return key
-
 
 def test_identify_plugin(name, tests):
     '''
@@ -120,11 +111,10 @@ def test_identify_plugin(name, tests):
         prints('Log saved to', lf)
         raise SystemExit(1)
 
-    good = [x for x in possibles if _test_fields(plugin.touched_fields, x) is
+    good = [x for x in possibles if plugin.test_fields(x) is
             None]
    if not good:
-        prints('Failed to find', _test_fields(plugin.touched_fields,
-            possibles[0]))
+        prints('Failed to find', plugin.test_fields(possibles[0]))
         raise SystemExit(1)
 