Amazon metadata download: Fix spurious results when searching for books that are not present on amazon using a search engine.

This commit is contained in:
Kovid Goyal 2017-03-15 09:43:50 +05:30
parent c8131cf507
commit b2e38cd0d4

View File

@ -6,6 +6,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import re import re
import socket import socket
import time import time
from functools import partial
from Queue import Empty, Queue from Queue import Empty, Queue
from threading import Thread from threading import Thread
from urlparse import urlparse from urlparse import urlparse
@ -105,7 +106,8 @@ class Worker(Thread): # Get details {{{
''' '''
def __init__(self, url, result_queue, browser, log, relevance, domain, def __init__(self, url, result_queue, browser, log, relevance, domain,
plugin, timeout=20, testing=False, preparsed_root=None, cover_url_processor=None): plugin, timeout=20, testing=False, preparsed_root=None,
cover_url_processor=None, filter_result=None):
Thread.__init__(self) Thread.__init__(self)
self.cover_url_processor = cover_url_processor self.cover_url_processor = cover_url_processor
self.preparsed_root = preparsed_root self.preparsed_root = preparsed_root
@ -113,6 +115,7 @@ class Worker(Thread): # Get details {{{
self.testing = testing self.testing = testing
self.url, self.result_queue = url, result_queue self.url, self.result_queue = url, result_queue
self.log, self.timeout = log, timeout self.log, self.timeout = log, timeout
self.filter_result = filter_result or (lambda x, log: True)
self.relevance, self.plugin = relevance, plugin self.relevance, self.plugin = relevance, plugin
self.browser = browser self.browser = browser
self.cover_url = self.amazon_id = self.isbn = None self.cover_url = self.amazon_id = self.isbn = None
@ -447,7 +450,8 @@ class Worker(Thread): # Get details {{{
self.plugin.clean_downloaded_metadata(mi) self.plugin.clean_downloaded_metadata(mi)
self.result_queue.put(mi) if self.filter_result(mi, self.log):
self.result_queue.put(mi)
def totext(self, elem): def totext(self, elem):
return self.tostring(elem, encoding=unicode, method='text').strip() return self.tostring(elem, encoding=unicode, method='text').strip()
@ -817,7 +821,7 @@ class Worker(Thread): # Get details {{{
class Amazon(Source): class Amazon(Source):
name = 'Amazon.com' name = 'Amazon.com'
version = (1, 0, 0) version = (1, 1, 0)
minimum_calibre_version = (2, 80, 0) minimum_calibre_version = (2, 80, 0)
description = _('Downloads metadata and covers from Amazon') description = _('Downloads metadata and covers from Amazon')
@ -1338,8 +1342,10 @@ class Amazon(Source):
log.error('No matches found with query: %r' % query) log.error('No matches found with query: %r' % query)
return return
workers = [Worker(url, result_queue, br, log, i, domain, self, testing=testing, timeout=timeout, workers = [Worker(
cover_url_processor=cover_url_processor) for i, url in enumerate(matches)] url, result_queue, br, log, i, domain, self, testing=testing, timeout=timeout,
cover_url_processor=cover_url_processor, filter_result=partial(
self.filter_result, title, authors, identifiers)) for i, url in enumerate(matches)]
for w in workers: for w in workers:
# Don't send all requests at the same time # Don't send all requests at the same time
@ -1362,6 +1368,29 @@ class Amazon(Source):
return None return None
# }}} # }}}
def filter_result(self, title, authors, identifiers, mi, log): # {{{
if not self.use_search_engine:
return True
if title is not None:
tokens = {icu_lower(x) for x in title.split() if len(x) > 3}
if tokens:
result_tokens = {icu_lower(x) for x in mi.title.split()}
if not tokens.intersection(result_tokens):
log('Ignoring result:', mi.title, 'as its title does not match')
return False
if authors:
author_tokens = set()
for author in authors:
author_tokens |= {icu_lower(x) for x in author.split() if len(x) > 2}
result_tokens = set()
for author in mi.authors:
result_tokens |= {icu_lower(x) for x in author.split() if len(x) > 2}
if author_tokens and not author_tokens.intersection(result_tokens):
log('Ignoring result:', mi.title, 'by', ' & '.join(mi.authors), 'as its author does not match')
return False
return True
# }}}
def download_cover(self, log, result_queue, abort, # {{{ def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=60, get_best_cover=False): title=None, authors=None, identifiers={}, timeout=60, get_best_cover=False):
cached_url = self.get_cached_cover_url(identifiers) cached_url = self.get_cached_cover_url(identifiers)
@ -1450,14 +1479,6 @@ if __name__ == '__main__': # tests {{{
] ]
), ),
( # Description has links
{'identifiers': {'isbn': '9780671578275'}},
[title_test('A Civil Campaign: A Comedy of Biology and Manners',
exact=True), authors_test(['Lois McMaster Bujold'])
]
),
( # Sophisticated comment formatting ( # Sophisticated comment formatting
{'identifiers': {'isbn': '9781416580829'}}, {'identifiers': {'isbn': '9781416580829'}},
[title_test('Angels & Demons - Movie Tie-In: A Novel', [title_test('Angels & Demons - Movie Tie-In: A Novel',