Refactor the sort by relevance API

Kovid Goyal 2011-03-22 21:42:29 -06:00
parent 32ed063b5b
commit 5f9032fa25
2 changed files with 69 additions and 59 deletions

View File

@@ -21,6 +21,7 @@ def create_log(ostream=None):
    log.outputs = [FileStream(ostream)]
    return log
# Comparing Metadata objects for relevance {{{
words = ("the", "a", "an", "of", "and")
prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words)))
trailing_paren_pat = re.compile(r'\(.*\)$')
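
A quick illustration, not part of the commit, of what the two patterns above match (the sample strings are made up); cleanup_title, whose tail end appears in the next hunk, uses them to normalize titles before the exact-title comparison further below:

    import re

    words = ("the", "a", "an", "of", "and")
    prefix_pat = re.compile(r'^(%s)\s+' % ("|".join(words)))
    trailing_paren_pat = re.compile(r'\(.*\)$')

    # Strip a leading article (the input must already be lower-cased,
    # since the pattern is compiled without re.IGNORECASE)
    print(prefix_pat.sub('', 'the count of monte cristo'))
    # -> 'count of monte cristo'

    # Strip a trailing parenthetical such as an edition note
    print(trailing_paren_pat.sub('', 'dune (40th anniversary edition)').strip())
    # -> 'dune'
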
@@ -35,6 +36,55 @@ def cleanup_title(s):
    s = whitespace_pat.sub(' ', s)
    return s.strip()
class InternalMetadataCompareKeyGen(object):
    '''
    Generate a sort key for comparison of the relevance of Metadata objects,
    given a search query.

    The sort key ensures that an ascending order sort is a sort by order of
    decreasing relevance.

    The algorithm is:

        1. Prefer results that have the same ISBN as specified in the query
        2. Prefer results with all available fields filled in
        3. Prefer results that are an exact title match to the query
        4. Prefer results with longer comments (greater than 10% longer)
        5. Prefer results with a cached cover URL
        6. Use the relevance of the result as reported by the metadata
           source's search engine
    '''

    def __init__(self, mi, source_plugin, title, authors, identifiers):
        isbn = 1 if mi.isbn and mi.isbn == identifiers.get('isbn', None) else 2
        all_fields = 1 if source_plugin.test_fields(mi) is None else 2
        exact_title = 1 if title and \
                cleanup_title(title) == cleanup_title(mi.title) else 2
        has_cover = 2 if source_plugin.get_cached_cover_url(mi.identifiers) \
                is None else 1

        self.base = (isbn, all_fields, exact_title)
        self.comments_len = len(mi.comments.strip() if mi.comments else '')
        self.extra = (has_cover, getattr(mi, 'source_relevance', 0))

    def __cmp__(self, other):
        result = cmp(self.base, other.base)
        if result == 0:
            # Now prefer results with the longer comments, within 10%
            cx, cy = self.comments_len, other.comments_len
            t = (cx + cy) / 20
            delta = cy - cx
            if abs(delta) > t:
                result = delta
            else:
                result = cmp(self.extra, other.extra)
        return result

# }}}
class Source(Plugin):
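
A minimal usage sketch, not part of this commit (the plugin instance and the query values are hypothetical): identify_results_keygen(), added further down in this diff, returns a callable that wraps each Metadata object in an InternalMetadataCompareKeyGen; because the key objects define __cmp__ (this is Python 2 code), a plain ascending sort puts the most relevant result first.

    keygen = plugin.identify_results_keygen(
            title='the count of monte cristo',          # hypothetical query
            authors=['Alexandre Dumas'],
            identifiers={'isbn': '9780140449266'})      # hypothetical ISBN

    # results: list of Metadata objects collected from the plugin's identify()
    results.sort(key=keygen)
    best_match = results[0]   # smallest key == most relevant result
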
@@ -70,7 +120,7 @@ class Source(Plugin):
    def browser(self):
        if self._browser is None:
            self._browser = browser(user_agent=random_user_agent())
        return self._browser
        return self._browser.clone_browser()
    # }}}
@@ -164,69 +214,30 @@ class Source(Plugin):
    def get_cached_cover_url(self, identifiers):
        '''
        Return cached cover URL for the book identified by
        the identifiers dict or Noneif no such URL exists
        the identifiers dict or None if no such URL exists.

        Note that this method must only return validated URLs, i.e. not URLS
        that could result in a generic cover image or a not found error.
        '''
        return None

    def compare_identify_results(self, x, y, title=None, authors=None,
    def identify_results_keygen(self, title=None, authors=None,
            identifiers={}):
        '''
        Method used to sort the results from a call to identify by relevance.
        Uses the actual query and various heuristics to rank results.
        Re-implement in your plugin if this generic algorithm is not suitable.
        Note that this method assumes x and y have a source_relevance
        attribute.
        Return a function that is used to generate a key that can sort Metadata
        objects by their relevance given a search query (title, authors,
        identifiers).
        one < two iff one is more relevant than two
        These keys are used to sort the results of a call to :meth:`identify`.
        For details on the default algorithm see
        :class:`InternalMetadataCompareKeyGen`. Re-implement this function in
        your plugin if the default algorithm is not suitable.
        '''
        # First, guarantee that if the query specifies an ISBN, the result with
        # the same isbn is the most relevant
        def isbn_test(mi):
            return mi.isbn and mi.isbn == identifiers.get('isbn', None)

        def boolcmp(a, b):
            return -1 if a and not b else 1 if not a and b else 0

        x_has_isbn, y_has_isbn = isbn_test(x), isbn_test(y)
        result = boolcmp(x_has_isbn, y_has_isbn)
        if result != 0:
            return result

        # Now prefer results that have complete metadata over those that don't
        x_has_all_fields = self.test_fields(x) is None
        y_has_all_fields = self.test_fields(y) is None
        result = boolcmp(x_has_all_fields, y_has_all_fields)
        if result != 0:
            return result

        # Now prefer results whose title matches the search query
        if title:
            x_title = cleanup_title(x.title)
            y_title = cleanup_title(y.title)
            t = cleanup_title(title)
            x_has_title, y_has_title = x_title == t, y_title == t
            result = boolcmp(x_has_title, y_has_title)
            if result != 0:
                return result

        # Now prefer results with the longer comments, within 10%
        cx = len(x.comments.strip() if x.comments else '')
        cy = len(y.comments.strip() if y.comments else '')
        t = (cx + cy) / 20
        result = cy - cx
        if result != 0 and abs(cx - cy) > t:
            return result

        # Now prefer results with cached cover URLs
        x_has_cover = self.get_cached_cover_url(x.identifiers) is not None
        y_has_cover = self.get_cached_cover_url(y.identifiers) is not None
        result = boolcmp(x_has_cover, y_has_cover)
        if result != 0:
            return result

        # Now use the relevance reported by the remote search engine
        return x.source_relevance - y.source_relevance
        def keygen(mi):
            return InternalMetadataCompareKeyGen(mi, self, title, authors,
                identifiers)
        return keygen
    def identify(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}, timeout=5):
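
As the new docstring notes, a plugin re-implements identify_results_keygen() when the default algorithm is not suitable. A hedged sketch of such an override (MySource and its ranking are invented for illustration; the only contract is to return a callable mapping a Metadata object to a sort key, where smaller keys mean more relevant results):

    # assuming the usual import path for Source
    from calibre.ebooks.metadata.sources.base import Source

    class MySource(Source):   # hypothetical plugin

        def identify_results_keygen(self, title=None, authors=None,
                identifiers={}):
            def keygen(mi):
                # Rank exact ISBN matches first, then fall back on the
                # relevance reported by the remote search engine
                isbn_match = 0 if (mi.isbn and
                        mi.isbn == identifiers.get('isbn', None)) else 1
                return (isbn_match, getattr(mi, 'source_relevance', 0))
            return keygen
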

View File

@@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
import os, tempfile, time
from Queue import Queue, Empty
from threading import Event
from functools import partial
from calibre.customize.ui import metadata_plugins
from calibre import prints
@@ -93,7 +92,7 @@ def test_identify_plugin(name, tests):
        prints('Found', len(results), 'matches:', end=' ')
        prints('Smaller relevance means better match')

        results.sort(cmp=partial(plugin.compare_identify_results,
        results.sort(key=plugin.identify_results_keygen(
            title=kwargs.get('title', None), authors=kwargs.get('authors',
                None), identifiers=kwargs.get('identifiers', {})))
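
The test-harness change above is the visible effect of the refactor: instead of passing a pairwise comparator via cmp= (built with functools.partial), the harness now passes a key function, so the sort key is computed once per result rather than the comparator being called for every comparison, and the call no longer relies on the cmp= argument, which was removed in Python 3. Side by side, using the same kwargs the test already builds:

    # Old: comparator called repeatedly during the sort
    results.sort(cmp=partial(plugin.compare_identify_results,
        title=kwargs.get('title', None), authors=kwargs.get('authors', None),
        identifiers=kwargs.get('identifiers', {})))

    # New: one key object built per result
    results.sort(key=plugin.identify_results_keygen(
        title=kwargs.get('title', None), authors=kwargs.get('authors', None),
        identifiers=kwargs.get('identifiers', {})))
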