diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py
index b99893ccba..9460ed7ace 100644
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -28,11 +28,12 @@ class Worker(Thread): # {{{
     Get book details from amazons book page in a separate thread
     '''
 
-    def __init__(self, url, result_queue, browser, log, timeout=20):
+    def __init__(self, url, result_queue, browser, log, relevance, plugin, timeout=20):
         Thread.__init__(self)
         self.daemon = True
         self.url, self.result_queue = url, result_queue
         self.log, self.timeout = log, timeout
+        self.relevance, self.plugin = relevance, plugin
         self.browser = browser.clone_browser()
         self.cover_url = self.amazon_id = self.isbn = None
 
@@ -161,6 +162,15 @@ class Worker(Thread): # {{{
         else:
             self.log.warning('Failed to find product description for url: %r'%self.url)
 
+        mi.source_relevance = self.relevance
+
+        if self.amazon_id:
+            if self.isbn:
+                self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
+            if self.cover_url:
+                self.plugin.cache_identifier_to_cover_url(self.amazon_id,
+                        self.cover_url)
+
         self.result_queue.put(mi)
 
     def parse_asin(self, root):
@@ -321,6 +331,20 @@ class Amazon(Source):
 
     # }}}
 
+    def get_cached_cover_url(self, identifiers):
+        url = None
+        asin = identifiers.get('amazon', None)
+        if asin is None:
+            asin = identifiers.get('asin', None)
+        if asin is None:
+            isbn = identifiers.get('isbn', None)
+            if isbn is not None:
+                asin = self.cached_isbn_to_identifier(isbn)
+        if asin is not None:
+            url = self.cached_identifier_to_cover_url(asin)
+
+        return url
+
     def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
             identifiers={}, timeout=30):
         '''
@@ -396,7 +420,8 @@ class Amazon(Source):
             log.error('No matches found with query: %r'%query)
             return
 
-        workers = [Worker(url, result_queue, br, log) for url in matches]
+        workers = [Worker(url, result_queue, br, log, i, self) for i, url in
+                enumerate(matches)]
 
         for w in workers:
             w.start()
@@ -414,14 +439,6 @@ class Amazon(Source):
             if not a_worker_is_alive:
                 break
 
-        for w in workers:
-            if w.amazon_id:
-                if w.isbn:
-                    self.cache_isbn_to_identifier(w.isbn, w.amazon_id)
-                if w.cover_url:
-                    self.cache_identifier_to_cover_url(w.amazon_id,
-                            w.cover_url)
-
         return None
     # }}}
 
diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index 55cc996cf7..90d7f82d65 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -21,6 +21,21 @@ def create_log(ostream=None):
     log.outputs = [FileStream(ostream)]
     return log
 
+words = ("the", "a", "an", "of", "and")
+prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words)))
+trailing_paren_pat = re.compile(r'\(.*\)$')
+whitespace_pat = re.compile(r'\s+')
+
+def cleanup_title(s):
+    if not s:
+        s = _('Unknown')
+    s = s.strip().lower()
+    s = prefix_pat.sub(' ', s)
+    s = trailing_paren_pat.sub('', s)
+    s = whitespace_pat.sub(' ', s)
+    return s.strip()
+
+
 class Source(Plugin):
 
     type = _('Metadata source')
@@ -128,10 +143,91 @@ class Source(Plugin):
             gr.append(job)
         return [g for g in groups if g]
 
+    def test_fields(self, mi):
+        '''
+        Return the first field from self.touched_fields that is null on the
+        mi object
+        '''
+        for key in self.touched_fields:
+            if key.startswith('identifier:'):
+                key = key.partition(':')[-1]
+                if not mi.has_identifier(key):
+                    return 'identifier: ' + key
+            elif mi.is_null(key):
+                return key
+
     # }}}
 
     # Metadata API {{{
 
+    def get_cached_cover_url(self, identifiers):
+        '''
+        Return cached cover URL for the book identified by
+        the identifiers dict or None if no such URL exists
+        '''
+        return None
+
+    def compare_identify_results(self, x, y, title=None, authors=None,
+            identifiers={}):
+        '''
+        Method used to sort the results from a call to identify by relevance.
+        Uses the actual query and various heuristics to rank results.
+        Re-implement in your plugin if this generic algorithm is not suitable.
+        Note that this method assumes x and y have a source_relevance
+        attribute.
+
+        one < two iff one is more relevant than two
+        '''
+        # First, guarantee that if the query specifies an ISBN, the result with
+        # the same isbn is the most relevant
+        def isbn_test(mi):
+            return mi.isbn and mi.isbn == identifiers.get('isbn', None)
+
+        def boolcmp(a, b):
+            return -1 if a and not b else 1 if not a and b else 0
+
+        x_has_isbn, y_has_isbn = isbn_test(x), isbn_test(y)
+        result = boolcmp(x_has_isbn, y_has_isbn)
+        if result != 0:
+            return result
+
+        # Now prefer results that have complete metadata over those that don't
+        x_has_all_fields = self.test_fields(x) is None
+        y_has_all_fields = self.test_fields(y) is None
+
+        result = boolcmp(x_has_all_fields, y_has_all_fields)
+        if result != 0:
+            return result
+
+        # Now prefer results whose title matches the search query
+        if title:
+            x_title = cleanup_title(x.title)
+            y_title = cleanup_title(y.title)
+            t = cleanup_title(title)
+            x_has_title, y_has_title = x_title == t, y_title == t
+            result = boolcmp(x_has_title, y_has_title)
+            if result != 0:
+                return result
+
+        # Now prefer results with the longer comments, within 10%
+        cx = len(x.comments.strip() if x.comments else '')
+        cy = len(y.comments.strip() if y.comments else '')
+        t = (cx + cy) / 20
+        result = cy - cx
+        if result != 0 and abs(cx - cy) > t:
+            return result
+
+        # Now prefer results with cached cover URLs
+        x_has_cover = self.get_cached_cover_url(x.identifiers) is not None
+        y_has_cover = self.get_cached_cover_url(y.identifiers) is not None
+        result = boolcmp(x_has_cover, y_has_cover)
+        if result != 0:
+            return result
+
+        # Now use the relevance reported by the remote search engine
+        return x.source_relevance - y.source_relevance
+
     def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=5):
         '''
@@ -147,6 +243,15 @@ class Source(Plugin):
         the same ISBN/special identifier does not need to get the cover URL
         again. Use the caching API for this.
 
+        Every Metadata object put into result_queue by this method must have a
+        `source_relevance` attribute that is an integer indicating the order in
+        which the results were returned by the metadata source for this query.
+        This integer will be used by :meth:`compare_identify_results`. If the
+        order is unimportant, set it to zero for every result.
+
+        Make sure that any cover/isbn mapping information is cached before the
+        Metadata object is put into result_queue.
+
         :param log: A log object, use it to output debugging information/errors
         :param result_queue: A result Queue, results should be put into it.
                              Each result is a Metadata object
diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py
index c44ad81b6c..b7298c0099 100644
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@@ -190,14 +190,15 @@ class GoogleBooks(Source):
         return raw and len(raw) > 17000 and raw[1:4] != 'PNG'
 
     def get_all_details(self, br, log, entries, abort, result_queue, timeout):
-        for i in entries:
+        for relevance, i in enumerate(entries):
             try:
                 ans = to_metadata(br, log, i, timeout)
                 if isinstance(ans, Metadata):
-                    result_queue.put(ans)
+                    ans.source_relevance = relevance
                     for isbn in getattr(ans, 'all_isbns', []):
                         self.cache_isbn_to_identifier(isbn,
                                 ans.identifiers['google'])
+                    result_queue.put(ans)
             except:
                 log.exception(
                     'Failed to get metadata for identify entry:',
diff --git a/src/calibre/ebooks/metadata/sources/test.py b/src/calibre/ebooks/metadata/sources/test.py
index 2af9a47078..032041ef29 100644
--- a/src/calibre/ebooks/metadata/sources/test.py
+++ b/src/calibre/ebooks/metadata/sources/test.py
@@ -46,15 +46,6 @@ def authors_test(authors):
 
     return test
 
-def _test_fields(touched_fields, mi):
-    for key in touched_fields:
-        if key.startswith('identifier:'):
-            key = key.partition(':')[-1]
-            if not mi.has_identifier(key):
-                return 'identifier: ' + key
-        elif mi.is_null(key):
-            return key
-
 def test_identify_plugin(name, tests):
     '''
@@ -120,11 +111,10 @@ def test_identify_plugin(name, tests):
             prints('Log saved to', lf)
             raise SystemExit(1)
 
-        good = [x for x in possibles if _test_fields(plugin.touched_fields, x) is
+        good = [x for x in possibles if plugin.test_fields(x) is
                 None]
         if not good:
-            prints('Failed to find', _test_fields(plugin.touched_fields,
-                possibles[0]))
+            prints('Failed to find', plugin.test_fields(possibles[0]))
             raise SystemExit(1)
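
For reference, a minimal sketch (not part of the patch) of how caller code might rank the Metadata objects collected from result_queue with the new comparator. Only compare_identify_results and the source_relevance attribute come from the patch above; the rank_results helper and its plugin/results/title/authors/identifiers parameters are hypothetical names used purely for illustration.

from functools import cmp_to_key

def rank_results(plugin, results, title=None, authors=None, identifiers={}):
    # compare_identify_results is a three-way comparator with the convention
    # "one < two iff one is more relevant than two", so a plain ascending
    # sort puts the most relevant result first. Every Metadata object in
    # `results` must already carry the integer source_relevance attribute
    # required by the identify() contract documented in base.py.
    return sorted(results, key=cmp_to_key(
        lambda x, y: plugin.compare_identify_results(
            x, y, title=title, authors=authors, identifiers=identifiers)))

Under Python 2, sorted(results, cmp=...) would work just as well; functools.cmp_to_key (added in Python 2.7 and 3.2) merely keeps the sketch portable.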