Refactor the sort by relevance API

2025-07-09 03:04:10 -04:00 · 2011-03-22 21:42:29 -06:00 · 2011-03-22 21:42:29 -06:00 · 5f9032fa25
commit 5f9032fa25
parent 32ed063b5b
2 changed files with 69 additions and 59 deletions
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@ -21,6 +21,7 @@ def create_log(ostream=None):
    log.outputs = [FileStream(ostream)]
    return log
 # Comparing Metadata objects for relevance {{{
 words = ("the", "a", "an", "of", "and")
 prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words)))
 trailing_paren_pat = re.compile(r'\(.*\)$')
@ -35,6 +36,55 @@ def cleanup_title(s):
    s = whitespace_pat.sub(' ', s)
    return s.strip()
 class InternalMetadataCompareKeyGen(object):
    '''
    Sort key used to rank Metadata objects by how relevant they are to a
    search query.

    Keys are built so that sorting them in ascending order lists the most
    relevant result first. Two keys are compared using the following
    criteria, in order of precedence:

        1. A result whose ISBN equals the ISBN given in the query wins
        2. A result for which the source plugin reports no missing fields
           wins
        3. A result whose cleaned up title equals the cleaned up query title
           wins
        4. A result with longer comments wins, provided the difference in
           length exceeds one twentieth of the combined comment lengths
        5. A result with a cached cover URL wins
        6. Otherwise the relevance reported by the metadata source's search
           engine decides (smaller values sort first)
    '''
    def __init__(self, mi, source_plugin, title, authors, identifiers):
        # Every criterion is encoded as 1 (preferred) or 2 (not preferred),
        # so that plain tuple comparison puts preferred results first.
        isbn_rank = 2
        if mi.isbn and mi.isbn == identifiers.get('isbn', None):
            isbn_rank = 1

        fields_rank = 2
        if source_plugin.test_fields(mi) is None:
            fields_rank = 1

        # The title criterion only applies when the query specified a title
        title_rank = 2
        if title and cleanup_title(title) == cleanup_title(mi.title):
            title_rank = 1

        cover_url = source_plugin.get_cached_cover_url(mi.identifiers)
        cover_rank = 2 if cover_url is None else 1

        self.base = (isbn_rank, fields_rank, title_rank)
        comments = mi.comments.strip() if mi.comments else ''
        self.comments_len = len(comments)
        # A missing source_relevance attribute is treated as 0
        self.extra = (cover_rank, getattr(mi, 'source_relevance', 0))
    def __cmp__(self, other):
        order = cmp(self.base, other.base)
        if order != 0:
            return order
        # The primary criteria are tied. Let the comments decide, but only
        # when the lengths differ by more than the tolerance.
        mine, theirs = self.comments_len, other.comments_len
        tolerance = (mine + theirs) / 20
        gap = theirs - mine
        if abs(gap) > tolerance:
            # Positive when other has the longer comments, which sorts
            # other ahead of self.
            return gap
        return cmp(self.extra, other.extra)
 # }}}
 class Source(Plugin):
@ -70,7 +120,7 @@ class Source(Plugin):
    def browser(self):
        if self._browser is None:
            self._browser = browser(user_agent=random_user_agent())
-        return self._browser
+        return self._browser.clone_browser()
    # }}}
@ -164,69 +214,30 @@ class Source(Plugin):
    def get_cached_cover_url(self, identifiers):
        '''
        Return cached cover URL for the book identified by
-        the identifiers dict or Noneif no such URL exists
+        the identifiers dict or None if no such URL exists.
        Note that this method must only return validated URLs, i.e. not URLS
        that could result in a generic cover image or a not found error.
        '''
        return None
-    def compare_identify_results(self, x, y, title=None, authors=None,
+    def identify_results_keygen(self, title=None, authors=None,
            identifiers={}):
        '''
-        Method used to sort the results from a call to identify by relevance.
+        Return a function that is used to generate a key that can sort Metadata
-        Uses the actual query and various heuristics to rank results.
+        objects by their relevance given a search query (title, authors,
-        Re-implement in your plugin if this generic algorithm is not suitable.
+        identifiers).
        Note that this method assumes the Metadata objects being sorted have
        a source_relevance attribute.
-        one < two iff one is more relevant than two
+        These keys are used to sort the results of a call to :meth:`identify`.
        For details on the default algorithm see
        :class:`InternalMetadataCompareKeyGen`. Re-implement this function in
        your plugin if the default algorithm is not suitable.
        '''
-        # First, guarantee that if the query specifies an ISBN, the result with
+        def keygen(mi):
-        # the same isbn is the most relevant
+            return InternalMetadataCompareKeyGen(mi, self, title, authors,
-        def isbn_test(mi):
+                identifiers)
-            return mi.isbn and mi.isbn == identifiers.get('isbn', None)
+        return keygen
        def boolcmp(a, b):
            return -1 if a and not b else 1 if not a and b else 0
        x_has_isbn, y_has_isbn = isbn_test(x), isbn_test(y)
        result = boolcmp(x_has_isbn, y_has_isbn)
        if result != 0:
            return result
        # Now prefer results that have complete metadata over those that don't
        x_has_all_fields = self.test_fields(x) is None
        y_has_all_fields = self.test_fields(y) is None
        result = boolcmp(x_has_all_fields, y_has_all_fields)
        if result != 0:
            return result
        # Now prefer results whose title matches the search query
        if title:
            x_title = cleanup_title(x.title)
            y_title = cleanup_title(y.title)
            t = cleanup_title(title)
            x_has_title, y_has_title = x_title == t, y_title == t
            result = boolcmp(x_has_title, y_has_title)
            if result != 0:
                return result
        # Now prefer results with the longer comments, within 10%
        cx = len(x.comments.strip() if x.comments else '')
        cy = len(y.comments.strip() if y.comments else '')
        t = (cx + cy) / 20
        result = cy - cx
        if result != 0 and abs(cx - cy) > t:
            return result
        # Now prefer results with cached cover URLs
        x_has_cover = self.get_cached_cover_url(x.identifiers) is not None
        y_has_cover = self.get_cached_cover_url(y.identifiers) is not None
        result = boolcmp(x_has_cover, y_has_cover)
        if result != 0:
            return result
        # Now use the relevance reported by the remote search engine
        return x.source_relevance - y.source_relevance
    def identify(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}, timeout=5):
--- a/src/calibre/ebooks/metadata/sources/test.py
+++ b/src/calibre/ebooks/metadata/sources/test.py
@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
 import os, tempfile, time
 from Queue import Queue, Empty
 from threading import Event
 from functools import partial
 from calibre.customize.ui import metadata_plugins
 from calibre import prints
@ -93,7 +92,7 @@ def test_identify_plugin(name, tests):
        prints('Found', len(results), 'matches:', end=' ')
        prints('Smaller relevance means better match')
-        results.sort(cmp=partial(plugin.compare_identify_results,
+        results.sort(key=plugin.identify_results_keygen(
            title=kwargs.get('title', None), authors=kwargs.get('authors',
                None), identifiers=kwargs.get('identifiers', {})))