Amazon metadata download: Fix spurious results when searching for books that are not present on amazon using a search engine.

This commit is contained in:
Kovid Goyal 2017-03-15 09:43:50 +05:30
parent c8131cf507
commit b2e38cd0d4

View File

@ -6,6 +6,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import re
import socket
import time
from functools import partial
from Queue import Empty, Queue
from threading import Thread
from urlparse import urlparse
@ -105,7 +106,8 @@ class Worker(Thread): # Get details {{{
'''
def __init__(self, url, result_queue, browser, log, relevance, domain,
plugin, timeout=20, testing=False, preparsed_root=None, cover_url_processor=None):
plugin, timeout=20, testing=False, preparsed_root=None,
cover_url_processor=None, filter_result=None):
Thread.__init__(self)
self.cover_url_processor = cover_url_processor
self.preparsed_root = preparsed_root
@ -113,6 +115,7 @@ class Worker(Thread): # Get details {{{
self.testing = testing
self.url, self.result_queue = url, result_queue
self.log, self.timeout = log, timeout
self.filter_result = filter_result or (lambda x, log: True)
self.relevance, self.plugin = relevance, plugin
self.browser = browser
self.cover_url = self.amazon_id = self.isbn = None
@ -447,6 +450,7 @@ class Worker(Thread): # Get details {{{
self.plugin.clean_downloaded_metadata(mi)
if self.filter_result(mi, self.log):
self.result_queue.put(mi)
def totext(self, elem):
@ -817,7 +821,7 @@ class Worker(Thread): # Get details {{{
class Amazon(Source):
name = 'Amazon.com'
version = (1, 0, 0)
version = (1, 1, 0)
minimum_calibre_version = (2, 80, 0)
description = _('Downloads metadata and covers from Amazon')
@ -1338,8 +1342,10 @@ class Amazon(Source):
log.error('No matches found with query: %r' % query)
return
workers = [Worker(url, result_queue, br, log, i, domain, self, testing=testing, timeout=timeout,
cover_url_processor=cover_url_processor) for i, url in enumerate(matches)]
workers = [Worker(
url, result_queue, br, log, i, domain, self, testing=testing, timeout=timeout,
cover_url_processor=cover_url_processor, filter_result=partial(
self.filter_result, title, authors, identifiers)) for i, url in enumerate(matches)]
for w in workers:
# Don't send all requests at the same time
@ -1362,6 +1368,29 @@ class Amazon(Source):
return None
# }}}
def filter_result(self, title, authors, identifiers, mi, log): # {{{
if not self.use_search_engine:
return True
if title is not None:
tokens = {icu_lower(x) for x in title.split() if len(x) > 3}
if tokens:
result_tokens = {icu_lower(x) for x in mi.title.split()}
if not tokens.intersection(result_tokens):
log('Ignoring result:', mi.title, 'as its title does not match')
return False
if authors:
author_tokens = set()
for author in authors:
author_tokens |= {icu_lower(x) for x in author.split() if len(x) > 2}
result_tokens = set()
for author in mi.authors:
result_tokens |= {icu_lower(x) for x in author.split() if len(x) > 2}
if author_tokens and not author_tokens.intersection(result_tokens):
log('Ignoring result:', mi.title, 'by', ' & '.join(mi.authors), 'as its author does not match')
return False
return True
# }}}
def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=60, get_best_cover=False):
cached_url = self.get_cached_cover_url(identifiers)
@ -1450,14 +1479,6 @@ if __name__ == '__main__': # tests {{{
]
),
( # Description has links
{'identifiers': {'isbn': '9780671578275'}},
[title_test('A Civil Campaign: A Comedy of Biology and Manners',
exact=True), authors_test(['Lois McMaster Bujold'])
]
),
( # Sophisticated comment formatting
{'identifiers': {'isbn': '9781416580829'}},
[title_test('Angels & Demons - Movie Tie-In: A Novel',