Refactor the sort by relevance API

Kovid Goyal 2011-03-22 21:42:29 -06:00
parent 32ed063b5b
commit 5f9032fa25
2 changed files with 69 additions and 59 deletions

View File

@@ -21,6 +21,7 @@ def create_log(ostream=None):
    log.outputs = [FileStream(ostream)]
    return log
# Comparing Metadata objects for relevance {{{
words = ("the", "a", "an", "of", "and")
prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words)))
trailing_paren_pat = re.compile(r'\(.*\)$')
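
A quick illustration, not part of the commit, of what the two patterns above match (the sample strings are made up); cleanup_title, whose tail end appears in the next hunk, uses them to normalize titles before the exact-title comparison further below:

    import re

    words = ("the", "a", "an", "of", "and")
    prefix_pat = re.compile(r'^(%s)\s+' % ("|".join(words)))
    trailing_paren_pat = re.compile(r'\(.*\)$')

    # Strip a leading article (the input must already be lower-cased,
    # since the pattern is compiled without re.IGNORECASE)
    print(prefix_pat.sub('', 'the count of monte cristo'))
    # -> 'count of monte cristo'

    # Strip a trailing parenthetical such as an edition note
    print(trailing_paren_pat.sub('', 'dune (40th anniversary edition)').strip())
    # -> 'dune'
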
@@ -35,6 +36,55 @@ def cleanup_title(s):
    s = whitespace_pat.sub(' ', s)
    return s.strip()
class InternalMetadataCompareKeyGen(object):
    '''
    Generate a sort key for comparison of the relevance of Metadata objects,
    given a search query.

    The sort key ensures that an ascending order sort is a sort by order of
    decreasing relevance.

    The algorithm is:

        1. Prefer results that have the same ISBN as specified in the query
        2. Prefer results with all available fields filled in
        3. Prefer results that are an exact title match to the query
        4. Prefer results with longer comments (greater than 10% longer)
        5. Prefer results with a cached cover URL
        6. Use the relevance of the result as reported by the metadata
           source's search engine
    '''

    def __init__(self, mi, source_plugin, title, authors, identifiers):
        isbn = 1 if mi.isbn and mi.isbn == identifiers.get('isbn', None) else 2
        all_fields = 1 if source_plugin.test_fields(mi) is None else 2
        exact_title = 1 if title and \
                cleanup_title(title) == cleanup_title(mi.title) else 2
        has_cover = 2 if source_plugin.get_cached_cover_url(mi.identifiers) \
                is None else 1

        self.base = (isbn, all_fields, exact_title)
        self.comments_len = len(mi.comments.strip() if mi.comments else '')
        self.extra = (has_cover, getattr(mi, 'source_relevance', 0))

    def __cmp__(self, other):
        result = cmp(self.base, other.base)
        if result == 0:
            # Now prefer results with the longer comments, within 10%
            cx, cy = self.comments_len, other.comments_len
            t = (cx + cy) / 20
            delta = cy - cx
            if abs(delta) > t:
                result = delta
            else:
                result = cmp(self.extra, other.extra)
        return result

# }}}
class Source(Plugin):
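
A minimal usage sketch, not part of this commit (the plugin instance and the query values are hypothetical): identify_results_keygen(), added further down in this diff, returns a callable that wraps each Metadata object in an InternalMetadataCompareKeyGen; because the key objects define __cmp__ (this is Python 2 code), a plain ascending sort puts the most relevant result first.

    keygen = plugin.identify_results_keygen(
            title='the count of monte cristo',          # hypothetical query
            authors=['Alexandre Dumas'],
            identifiers={'isbn': '9780140449266'})      # hypothetical ISBN

    # results: list of Metadata objects collected from the plugin's identify()
    results.sort(key=keygen)
    best_match = results[0]   # smallest key == most relevant result
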
@@ -70,7 +120,7 @@ class Source(Plugin):
    def browser(self):
        if self._browser is None:
            self._browser = browser(user_agent=random_user_agent())
        return self._browser
        return self._browser.clone_browser()
    # }}}
@@ -164,69 +214,30 @@ class Source(Plugin):
    def get_cached_cover_url(self, identifiers):
        '''
        Return cached cover URL for the book identified by
        the identifiers dict or Noneif no such URL exists
        the identifiers dict or None if no such URL exists.

        Note that this method must only return validated URLs, i.e. not URLS
        that could result in a generic cover image or a not found error.
        '''
        return None

    def compare_identify_results(self, x, y, title=None, authors=None,
    def identify_results_keygen(self, title=None, authors=None,
            identifiers={}):
        '''
        Method used to sort the results from a call to identify by relevance.
        Uses the actual query and various heuristics to rank results.
        Re-implement in your plugin if this generic algorithm is not suitable.
        Note that this method assumes x and y have a source_relevance
        attribute.
        Return a function that is used to generate a key that can sort Metadata
        objects by their relevance given a search query (title, authors,
        identifiers).
        one < two iff one is more relevant than two
        These keys are used to sort the results of a call to :meth:`identify`.
        For details on the default algorithm see
        :class:`InternalMetadataCompareKeyGen`. Re-implement this function in
        your plugin if the default algorithm is not suitable.
        '''
        # First, guarantee that if the query specifies an ISBN, the result with
        # the same isbn is the most relevant
        def isbn_test(mi):
            return mi.isbn and mi.isbn == identifiers.get('isbn', None)

        def boolcmp(a, b):
            return -1 if a and not b else 1 if not a and b else 0

        x_has_isbn, y_has_isbn = isbn_test(x), isbn_test(y)
        result = boolcmp(x_has_isbn, y_has_isbn)
        if result != 0:
            return result

        # Now prefer results that have complete metadata over those that don't
        x_has_all_fields = self.test_fields(x) is None
        y_has_all_fields = self.test_fields(y) is None
        result = boolcmp(x_has_all_fields, y_has_all_fields)
        if result != 0:
            return result

        # Now prefer results whose title matches the search query
        if title:
            x_title = cleanup_title(x.title)
            y_title = cleanup_title(y.title)
            t = cleanup_title(title)
            x_has_title, y_has_title = x_title == t, y_title == t
            result = boolcmp(x_has_title, y_has_title)
            if result != 0:
                return result

        # Now prefer results with the longer comments, within 10%
        cx = len(x.comments.strip() if x.comments else '')
        cy = len(y.comments.strip() if y.comments else '')
        t = (cx + cy) / 20
        result = cy - cx
        if result != 0 and abs(cx - cy) > t:
            return result

        # Now prefer results with cached cover URLs
        x_has_cover = self.get_cached_cover_url(x.identifiers) is not None
        y_has_cover = self.get_cached_cover_url(y.identifiers) is not None
        result = boolcmp(x_has_cover, y_has_cover)
        if result != 0:
            return result

        # Now use the relevance reported by the remote search engine
        return x.source_relevance - y.source_relevance
        def keygen(mi):
            return InternalMetadataCompareKeyGen(mi, self, title, authors,
                identifiers)
        return keygen
    def identify(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}, timeout=5):
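
As the new docstring notes, a plugin re-implements identify_results_keygen() when the default algorithm is not suitable. A hedged sketch of such an override (MySource and its ranking are invented for illustration; the only contract is to return a callable mapping a Metadata object to a sort key, where smaller keys mean more relevant results):

    # assuming the usual import path for Source
    from calibre.ebooks.metadata.sources.base import Source

    class MySource(Source):   # hypothetical plugin

        def identify_results_keygen(self, title=None, authors=None,
                identifiers={}):
            def keygen(mi):
                # Rank exact ISBN matches first, then fall back on the
                # relevance reported by the remote search engine
                isbn_match = 0 if (mi.isbn and
                        mi.isbn == identifiers.get('isbn', None)) else 1
                return (isbn_match, getattr(mi, 'source_relevance', 0))
            return keygen
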

View File

@@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
import os, tempfile, time
from Queue import Queue, Empty
from threading import Event
from functools import partial
from calibre.customize.ui import metadata_plugins
from calibre import prints
@@ -93,7 +92,7 @@ def test_identify_plugin(name, tests):
        prints('Found', len(results), 'matches:', end=' ')
        prints('Smaller relevance means better match')

        results.sort(cmp=partial(plugin.compare_identify_results,
        results.sort(key=plugin.identify_results_keygen(
            title=kwargs.get('title', None), authors=kwargs.get('authors',
                None), identifiers=kwargs.get('identifiers', {})))
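
The test-harness change above is the visible effect of the refactor: instead of passing a pairwise comparator via cmp= (built with functools.partial), the harness now passes a key function, so the sort key is computed once per result rather than the comparator being called for every comparison, and the call no longer relies on the cmp= argument, which was removed in Python 3. Side by side, using the same kwargs the test already builds:

    # Old: comparator called repeatedly during the sort
    results.sort(cmp=partial(plugin.compare_identify_results,
        title=kwargs.get('title', None), authors=kwargs.get('authors', None),
        identifiers=kwargs.get('identifiers', {})))

    # New: one key object built per result
    results.sort(key=plugin.identify_results_keygen(
        title=kwargs.get('title', None), authors=kwargs.get('authors', None),
        identifiers=kwargs.get('identifiers', {})))
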