Amazon metadata: When filtering search engine results by title, ignore words of the title that are purely punctuation

This commit is contained in:
Kovid Goyal 2024-06-17 10:10:55 +05:30
parent 43121af37d
commit 44bceacf03
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -1082,7 +1082,7 @@ class Worker(Thread): # Get details {{{
class Amazon(Source): class Amazon(Source):
name = 'Amazon.com' name = 'Amazon.com'
version = (1, 3, 7) version = (1, 3, 8)
minimum_calibre_version = (2, 82, 0) minimum_calibre_version = (2, 82, 0)
description = _('Downloads metadata and covers from Amazon') description = _('Downloads metadata and covers from Amazon')
@@ -1684,13 +1684,20 @@ class Amazon(Source):
if not self.use_search_engine: if not self.use_search_engine:
return True return True
if title is not None: if title is not None:
import regex
only_punctuation_pat = regex.compile(r'^\p{P}+$')
def tokenize_title(x): def tokenize_title(x):
return icu_lower(x).replace("'", '').replace('"', '').rstrip(':') ans = icu_lower(x).replace("'", '').replace('"', '').rstrip(':')
if only_punctuation_pat.match(ans) is not None:
ans = ''
return ans
tokens = {tokenize_title(x) for x in title.split() if len(x) > 3} tokens = {tokenize_title(x) for x in title.split() if len(x) > 3}
tokens.discard('')
if tokens: if tokens:
result_tokens = {tokenize_title(x) for x in mi.title.split()} result_tokens = {tokenize_title(x) for x in mi.title.split()}
result_tokens.discard('')
if not tokens.intersection(result_tokens): if not tokens.intersection(result_tokens):
log('Ignoring result:', mi.title, 'as its title does not match') log('Ignoring result:', mi.title, 'as its title does not match')
return False return False