mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Refactor the sort by relevance API
This commit is contained in:
parent
32ed063b5b
commit
5f9032fa25
@ -21,6 +21,7 @@ def create_log(ostream=None):
|
|||||||
log.outputs = [FileStream(ostream)]
|
log.outputs = [FileStream(ostream)]
|
||||||
return log
|
return log
|
||||||
|
|
||||||
|
# Comparing Metadata objects for relevance {{{
|
||||||
words = ("the", "a", "an", "of", "and")
|
words = ("the", "a", "an", "of", "and")
|
||||||
prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words)))
|
prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words)))
|
||||||
trailing_paren_pat = re.compile(r'\(.*\)$')
|
trailing_paren_pat = re.compile(r'\(.*\)$')
|
||||||
@ -35,6 +36,55 @@ def cleanup_title(s):
|
|||||||
s = whitespace_pat.sub(' ', s)
|
s = whitespace_pat.sub(' ', s)
|
||||||
return s.strip()
|
return s.strip()
|
||||||
|
|
||||||
|
class InternalMetadataCompareKeyGen(object):
|
||||||
|
|
||||||
|
'''
|
||||||
|
Generate a sort key for comparison of the relevance of Metadata objects,
|
||||||
|
given a search query.
|
||||||
|
|
||||||
|
The sort key ensures that an ascending order sort is a sort by order of
|
||||||
|
decreasing relevance.
|
||||||
|
|
||||||
|
The algorithm is:
|
||||||
|
|
||||||
|
1. Prefer results that have the same ISBN as specified in the query
|
||||||
|
2. Prefer results with all available fields filled in
|
||||||
|
3. Prefer results that are an exact title match to the query
|
||||||
|
4. Prefer results with longer comments (greater than 10 % longer)
|
||||||
|
5. Prefer results with a cached cover URL
|
||||||
|
6. Use the relevance of the result as reported by the metadata source's search
|
||||||
|
engine
|
||||||
|
'''
|
||||||
|
|
||||||
|
def __init__(self, mi, source_plugin, title, authors, identifiers):
|
||||||
|
isbn = 1 if mi.isbn and mi.isbn == identifiers.get('isbn', None) else 2
|
||||||
|
|
||||||
|
all_fields = 1 if source_plugin.test_fields(mi) is None else 2
|
||||||
|
|
||||||
|
exact_title = 1 if title and \
|
||||||
|
cleanup_title(title) == cleanup_title(mi.title) else 2
|
||||||
|
|
||||||
|
has_cover = 2 if source_plugin.get_cached_cover_url(mi.identifiers)\
|
||||||
|
is None else 1
|
||||||
|
|
||||||
|
self.base = (isbn, all_fields, exact_title)
|
||||||
|
self.comments_len = len(mi.comments.strip() if mi.comments else '')
|
||||||
|
self.extra = (has_cover, getattr(mi, 'source_relevance', 0))
|
||||||
|
|
||||||
|
def __cmp__(self, other):
|
||||||
|
result = cmp(self.base, other.base)
|
||||||
|
if result == 0:
|
||||||
|
# Now prefer results with the longer comments, within 10%
|
||||||
|
cx, cy = self.comments_len, other.comments_len
|
||||||
|
t = (cx + cy) / 20
|
||||||
|
delta = cy - cx
|
||||||
|
if abs(delta) > t:
|
||||||
|
result = delta
|
||||||
|
else:
|
||||||
|
result = cmp(self.extra, other.extra)
|
||||||
|
return result
|
||||||
|
|
||||||
|
# }}}
|
||||||
|
|
||||||
class Source(Plugin):
|
class Source(Plugin):
|
||||||
|
|
||||||
@ -70,7 +120,7 @@ class Source(Plugin):
|
|||||||
def browser(self):
|
def browser(self):
|
||||||
if self._browser is None:
|
if self._browser is None:
|
||||||
self._browser = browser(user_agent=random_user_agent())
|
self._browser = browser(user_agent=random_user_agent())
|
||||||
return self._browser
|
return self._browser.clone_browser()
|
||||||
|
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
@ -164,69 +214,30 @@ class Source(Plugin):
|
|||||||
def get_cached_cover_url(self, identifiers):
|
def get_cached_cover_url(self, identifiers):
|
||||||
'''
|
'''
|
||||||
Return cached cover URL for the book identified by
|
Return cached cover URL for the book identified by
|
||||||
the identifiers dict or Noneif no such URL exists
|
the identifiers dict or None if no such URL exists.
|
||||||
|
|
||||||
|
Note that this method must only return validated URLs, i.e. not URLS
|
||||||
|
that could result in a generic cover image or a not found error.
|
||||||
'''
|
'''
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def compare_identify_results(self, x, y, title=None, authors=None,
|
def identify_results_keygen(self, title=None, authors=None,
|
||||||
identifiers={}):
|
identifiers={}):
|
||||||
'''
|
'''
|
||||||
Method used to sort the results from a call to identify by relevance.
|
Return a function that is used to generate a key that can sort Metadata
|
||||||
Uses the actual query and various heuristics to rank results.
|
objects by their relevance given a search query (title, authors,
|
||||||
Re-implement in your plugin if this generic algorithm is not suitable.
|
identifiers).
|
||||||
Note that this method assumes x and y have a source_relevance
|
|
||||||
attribute.
|
|
||||||
|
|
||||||
one < two iff one is more relevant than two
|
These keys are used to sort the results of a call to :meth:`identify`.
|
||||||
|
|
||||||
|
For details on the default algorithm see
|
||||||
|
:class:`InternalMetadataCompareKeyGen`. Re-implement this function in
|
||||||
|
your plugin if the default algorithm is not suitable.
|
||||||
'''
|
'''
|
||||||
# First, guarantee that if the query specifies an ISBN, the result with
|
def keygen(mi):
|
||||||
# the same isbn is the most relevant
|
return InternalMetadataCompareKeyGen(mi, self, title, authors,
|
||||||
def isbn_test(mi):
|
identifiers)
|
||||||
return mi.isbn and mi.isbn == identifiers.get('isbn', None)
|
return keygen
|
||||||
|
|
||||||
def boolcmp(a, b):
|
|
||||||
return -1 if a and not b else 1 if not a and b else 0
|
|
||||||
|
|
||||||
x_has_isbn, y_has_isbn = isbn_test(x), isbn_test(y)
|
|
||||||
result = boolcmp(x_has_isbn, y_has_isbn)
|
|
||||||
if result != 0:
|
|
||||||
return result
|
|
||||||
|
|
||||||
# Now prefer results that have complete metadata over those that don't
|
|
||||||
x_has_all_fields = self.test_fields(x) is None
|
|
||||||
y_has_all_fields = self.test_fields(y) is None
|
|
||||||
|
|
||||||
result = boolcmp(x_has_all_fields, y_has_all_fields)
|
|
||||||
if result != 0:
|
|
||||||
return result
|
|
||||||
|
|
||||||
# Now prefer results whose title matches the search query
|
|
||||||
if title:
|
|
||||||
x_title = cleanup_title(x.title)
|
|
||||||
y_title = cleanup_title(y.title)
|
|
||||||
t = cleanup_title(title)
|
|
||||||
x_has_title, y_has_title = x_title == t, y_title == t
|
|
||||||
result = boolcmp(x_has_title, y_has_title)
|
|
||||||
if result != 0:
|
|
||||||
return result
|
|
||||||
|
|
||||||
# Now prefer results with the longer comments, within 10%
|
|
||||||
cx = len(x.comments.strip() if x.comments else '')
|
|
||||||
cy = len(y.comments.strip() if y.comments else '')
|
|
||||||
t = (cx + cy) / 20
|
|
||||||
result = cy - cx
|
|
||||||
if result != 0 and abs(cx - cy) > t:
|
|
||||||
return result
|
|
||||||
|
|
||||||
# Now prefer results with cached cover URLs
|
|
||||||
x_has_cover = self.get_cached_cover_url(x.identifiers) is not None
|
|
||||||
y_has_cover = self.get_cached_cover_url(y.identifiers) is not None
|
|
||||||
result = boolcmp(x_has_cover, y_has_cover)
|
|
||||||
if result != 0:
|
|
||||||
return result
|
|
||||||
|
|
||||||
# Now use the relevance reported by the remote search engine
|
|
||||||
return x.source_relevance - y.source_relevance
|
|
||||||
|
|
||||||
def identify(self, log, result_queue, abort, title=None, authors=None,
|
def identify(self, log, result_queue, abort, title=None, authors=None,
|
||||||
identifiers={}, timeout=5):
|
identifiers={}, timeout=5):
|
||||||
|
@ -10,7 +10,6 @@ __docformat__ = 'restructuredtext en'
|
|||||||
import os, tempfile, time
|
import os, tempfile, time
|
||||||
from Queue import Queue, Empty
|
from Queue import Queue, Empty
|
||||||
from threading import Event
|
from threading import Event
|
||||||
from functools import partial
|
|
||||||
|
|
||||||
from calibre.customize.ui import metadata_plugins
|
from calibre.customize.ui import metadata_plugins
|
||||||
from calibre import prints
|
from calibre import prints
|
||||||
@ -93,7 +92,7 @@ def test_identify_plugin(name, tests):
|
|||||||
prints('Found', len(results), 'matches:', end=' ')
|
prints('Found', len(results), 'matches:', end=' ')
|
||||||
prints('Smaller relevance means better match')
|
prints('Smaller relevance means better match')
|
||||||
|
|
||||||
results.sort(cmp=partial(plugin.compare_identify_results,
|
results.sort(key=plugin.identify_results_keygen(
|
||||||
title=kwargs.get('title', None), authors=kwargs.get('authors',
|
title=kwargs.get('title', None), authors=kwargs.get('authors',
|
||||||
None), identifiers=kwargs.get('identifiers', {})))
|
None), identifiers=kwargs.get('identifiers', {})))
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user