Initial implementation of relevance sorting of metadata identify results. Needs testing
commit 74d1fb4c49
parent d37f302a0e
@@ -28,11 +28,12 @@ class Worker(Thread): # {{{
     Get book details from Amazon's book page in a separate thread
     '''

-    def __init__(self, url, result_queue, browser, log, timeout=20):
+    def __init__(self, url, result_queue, browser, log, relevance, plugin, timeout=20):
         Thread.__init__(self)
         self.daemon = True
         self.url, self.result_queue = url, result_queue
         self.log, self.timeout = log, timeout
+        self.relevance, self.plugin = relevance, plugin
         self.browser = browser.clone_browser()
         self.cover_url = self.amazon_id = self.isbn = None

@@ -161,6 +162,15 @@ class Worker(Thread): # {{{
         else:
             self.log.warning('Failed to find product description for url: %r'%self.url)

+        mi.source_relevance = self.relevance
+
+        if self.amazon_id:
+            if self.isbn:
+                self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
+            if self.cover_url:
+                self.plugin.cache_identifier_to_cover_url(self.amazon_id,
+                        self.cover_url)
+
         self.result_queue.put(mi)

     def parse_asin(self, root):
@@ -321,6 +331,20 @@ class Amazon(Source):

     # }}}

+    def get_cached_cover_url(self, identifiers):
+        url = None
+        asin = identifiers.get('amazon', None)
+        if asin is None:
+            asin = identifiers.get('asin', None)
+        if asin is None:
+            isbn = identifiers.get('isbn', None)
+            if isbn is not None:
+                asin = self.cached_isbn_to_identifier(isbn)
+        if asin is not None:
+            url = self.cached_identifier_to_cover_url(asin)
+
+        return url
+
     def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
             identifiers={}, timeout=30):
         '''
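
The lookup chain above resolves an Amazon ID from the identifiers dict, falling back to a cached ISBN-to-ASIN mapping. A minimal standalone sketch of the same chain, with plain dicts standing in for the plugin's persistent cache (all names and values here are illustrative, not calibre API):

    # Stand-ins for cached_isbn_to_identifier / cached_identifier_to_cover_url
    _isbn_to_asin = {'9780441013593': 'B000EXAMPLE'}            # hypothetical data
    _asin_to_cover = {'B000EXAMPLE': 'http://example.com/cover.jpg'}

    def get_cached_cover_url(identifiers):
        asin = identifiers.get('amazon', identifiers.get('asin', None))
        if asin is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                asin = _isbn_to_asin.get(isbn, None)
        if asin is None:
            return None
        return _asin_to_cover.get(asin, None)

    print(get_cached_cover_url({'isbn': '9780441013593'}))
    # -> http://example.com/cover.jpg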
@@ -396,7 +420,8 @@ class Amazon(Source):
             log.error('No matches found with query: %r'%query)
             return

-        workers = [Worker(url, result_queue, br, log) for url in matches]
+        workers = [Worker(url, result_queue, br, log, i, self) for i, url in
+                enumerate(matches)]

         for w in workers:
             w.start()
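
Each Worker now receives its index into matches (the position of its URL in Amazon's search results) plus a reference back to the plugin for the caching calls. Written out, the comprehension is equivalent to the loop below (a sketch; matches, result_queue, br, log and self are the names in scope inside Amazon.identify):

    workers = []
    for i, url in enumerate(matches):
        # worker i will set mi.source_relevance = i, preserving the
        # search engine's own ordering of the results
        workers.append(Worker(url, result_queue, br, log, i, self))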
@@ -414,14 +439,6 @@ class Amazon(Source):
             if not a_worker_is_alive:
                 break

-        for w in workers:
-            if w.amazon_id:
-                if w.isbn:
-                    self.cache_isbn_to_identifier(w.isbn, w.amazon_id)
-                if w.cover_url:
-                    self.cache_identifier_to_cover_url(w.amazon_id,
-                            w.cover_url)
-
         return None
     # }}}

@@ -21,6 +21,21 @@ def create_log(ostream=None):
     log.outputs = [FileStream(ostream)]
     return log

+words = ("the", "a", "an", "of", "and")
+prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words)))
+trailing_paren_pat = re.compile(r'\(.*\)$')
+whitespace_pat = re.compile(r'\s+')
+
+def cleanup_title(s):
+    if not s:
+        s = _('Unknown')
+    s = s.strip().lower()
+    s = prefix_pat.sub(' ', s)
+    s = trailing_paren_pat.sub('', s)
+    s = whitespace_pat.sub(' ', s)
+    return s.strip()
+
+
 class Source(Plugin):

     type = _('Metadata source')
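
The helper normalizes titles before comparison: lowercase, strip a leading article, drop a trailing parenthetical such as an edition note, and collapse whitespace. A standalone rerun of the same logic (with a plain string in place of the translated _('Unknown'), which needs gettext set up):

    import re

    words = ("the", "a", "an", "of", "and")
    prefix_pat = re.compile(r'^(%s)\s+' % ("|".join(words)))
    trailing_paren_pat = re.compile(r'\(.*\)$')
    whitespace_pat = re.compile(r'\s+')

    def cleanup_title(s):
        if not s:
            s = 'Unknown'  # the real code uses _('Unknown')
        s = s.strip().lower()
        s = prefix_pat.sub(' ', s)         # drop a leading article
        s = trailing_paren_pat.sub('', s)  # drop a trailing (...) group
        s = whitespace_pat.sub(' ', s)     # collapse whitespace runs
        return s.strip()

    print(cleanup_title('The  Time Machine (Illustrated Edition)'))
    # -> 'time machine'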
@@ -128,10 +143,91 @@ class Source(Plugin):
             gr.append(job)
         return [g for g in groups if g]

+    def test_fields(self, mi):
+        '''
+        Return the first field from self.touched_fields that is null on the
+        mi object
+        '''
+        for key in self.touched_fields:
+            if key.startswith('identifier:'):
+                key = key.partition(':')[-1]
+                if not mi.has_identifier(key):
+                    return 'identifier: ' + key
+            elif mi.is_null(key):
+                return key
+
     # }}}

     # Metadata API {{{

+    def get_cached_cover_url(self, identifiers):
+        '''
+        Return cached cover URL for the book identified by
+        the identifiers dict or None if no such URL exists.
+        '''
+        return None
+
+    def compare_identify_results(self, x, y, title=None, authors=None,
+            identifiers={}):
+        '''
+        Method used to sort the results from a call to identify by relevance.
+        Uses the actual query and various heuristics to rank results.
+        Re-implement in your plugin if this generic algorithm is not suitable.
+        Note that this method assumes x and y have a source_relevance
+        attribute.
+
+        one < two iff one is more relevant than two
+        '''
+        # First, guarantee that if the query specifies an ISBN, a result with
+        # the same ISBN is the most relevant
+        def isbn_test(mi):
+            return mi.isbn and mi.isbn == identifiers.get('isbn', None)
+
+        def boolcmp(a, b):
+            return -1 if a and not b else 1 if not a and b else 0
+
+        x_has_isbn, y_has_isbn = isbn_test(x), isbn_test(y)
+        result = boolcmp(x_has_isbn, y_has_isbn)
+        if result != 0:
+            return result
+
+        # Now prefer results that have complete metadata over those that don't
+        x_has_all_fields = self.test_fields(x) is None
+        y_has_all_fields = self.test_fields(y) is None
+        result = boolcmp(x_has_all_fields, y_has_all_fields)
+        if result != 0:
+            return result
+
+        # Now prefer results whose title matches the search query
+        if title:
+            x_title = cleanup_title(x.title)
+            y_title = cleanup_title(y.title)
+            t = cleanup_title(title)
+            x_has_title, y_has_title = x_title == t, y_title == t
+            result = boolcmp(x_has_title, y_has_title)
+            if result != 0:
+                return result
+
+        # Now prefer the result with longer comments, but only if the
+        # difference is more than 10% of the mean comment length
+        cx = len(x.comments.strip() if x.comments else '')
+        cy = len(y.comments.strip() if y.comments else '')
+        t = (cx + cy) / 20
+        result = cy - cx
+        if result != 0 and abs(cx - cy) > t:
+            return result
+
+        # Now prefer results with cached cover URLs
+        x_has_cover = self.get_cached_cover_url(x.identifiers) is not None
+        y_has_cover = self.get_cached_cover_url(y.identifiers) is not None
+        result = boolcmp(x_has_cover, y_has_cover)
+        if result != 0:
+            return result
+
+        # Now use the relevance reported by the remote search engine
+        return x.source_relevance - y.source_relevance
+
     def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=5):
         '''
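
compare_identify_results is an old-style cmp function: a negative return means x sorts before (is more relevant than) y, and ties fall through the heuristics until the source's own source_relevance ordering decides. The commit does not show the call site, but a plausible sketch of applying it, assuming plugin is a Source instance and results is a list of Metadata objects drained from result_queue:

    from functools import partial, cmp_to_key

    cmp_func = partial(plugin.compare_identify_results, title=title,
                       identifiers=identifiers)
    results.sort(key=cmp_to_key(cmp_func))   # most relevant result first
    # On Python 2, calibre's platform at the time: results.sort(cmp=cmp_func)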
@@ -147,6 +243,15 @@ class Source(Plugin):
         the same ISBN/special identifier does not need to get the cover URL
         again. Use the caching API for this.

+        Every Metadata object put into result_queue by this method must have a
+        `source_relevance` attribute that is an integer indicating the order in
+        which the results were returned by the metadata source for this query.
+        This integer will be used by :meth:`compare_identify_results`. If the
+        order is unimportant, set it to zero for every result.
+
+        Make sure that any cover/isbn mapping information is cached before the
+        Metadata object is put into result_queue.
+
         :param log: A log object, use it to output debugging information/errors
         :param result_queue: A result Queue, results should be put into it.
                              Each result is a Metadata object
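
A minimal sketch of a plugin honoring this contract; fetch_candidates is a hypothetical helper standing in for the plugin's actual search code, not calibre API:

    def identify(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}, timeout=5):
        for i, mi in enumerate(self.fetch_candidates(title, authors, timeout)):
            mi.source_relevance = i       # position in the source's ranking
            if mi.isbn:
                # cache mappings *before* publishing the result
                self.cache_isbn_to_identifier(mi.isbn,
                        mi.identifiers.get('example_source'))
            result_queue.put(mi)          # only now is mi visible to consumers
        return None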
@@ -190,14 +190,15 @@ class GoogleBooks(Source):
         return raw and len(raw) > 17000 and raw[1:4] != 'PNG'

     def get_all_details(self, br, log, entries, abort, result_queue, timeout):
-        for i in entries:
+        for relevance, i in enumerate(entries):
             try:
                 ans = to_metadata(br, log, i, timeout)
                 if isinstance(ans, Metadata):
-                    result_queue.put(ans)
+                    ans.source_relevance = relevance
                     for isbn in getattr(ans, 'all_isbns', []):
                         self.cache_isbn_to_identifier(isbn,
                                 ans.identifiers['google'])
+                    result_queue.put(ans)
             except:
                 log.exception(
                     'Failed to get metadata for identify entry:',
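
Note the reordering in this hunk: result_queue.put(ans) moves below the ISBN caching loop. Since identify's consumer may read the queue immediately, the contract added to Source.identify requires the ordering to be, schematically:

    ans.source_relevance = relevance   # rank = position in the search response
    # cache every isbn -> google id mapping first ...
    # ... and only then publish the result:
    result_queue.put(ans)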
@@ -46,15 +46,6 @@ def authors_test(authors):

     return test

-def _test_fields(touched_fields, mi):
-    for key in touched_fields:
-        if key.startswith('identifier:'):
-            key = key.partition(':')[-1]
-            if not mi.has_identifier(key):
-                return 'identifier: ' + key
-        elif mi.is_null(key):
-            return key
-

 def test_identify_plugin(name, tests):
     '''
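
The private _test_fields helper is removed because the same check now lives on Source.test_fields (added in the base-class hunk above), so the harness asks the plugin directly. A sketch of what the call reports, for a hypothetical Metadata object mi missing its publisher:

    missing = plugin.test_fields(mi)
    if missing is None:
        print('result covers all touched fields')
    else:
        print('result is missing:', missing)   # e.g. 'publisher'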
@@ -120,11 +111,10 @@ def test_identify_plugin(name, tests):
         prints('Log saved to', lf)
         raise SystemExit(1)

-    good = [x for x in possibles if _test_fields(plugin.touched_fields, x) is
+    good = [x for x in possibles if plugin.test_fields(x) is
             None]
     if not good:
-        prints('Failed to find', _test_fields(plugin.touched_fields,
-            possibles[0]))
+        prints('Failed to find', plugin.test_fields(possibles[0]))
         raise SystemExit(1)