Amazon metadata: When filtering search engine results by title, ignore words of the title that are purely punctuation

2026-01-07 04:30:19 -05:00 · 2024-06-17 10:10:55 +05:30 · 2024-06-17 10:10:55 +05:30 · 44bceacf03
commit 44bceacf03
parent 43121af37d
1 changed file with 9 additions and 2 deletions
--- a/src/calibre/ebooks/metadata/sources/amazon.py
+++ b/src/calibre/ebooks/metadata/sources/amazon.py
@@ -1082,7 +1082,7 @@ class Worker(Thread):  # Get details {{{
 class Amazon(Source):

    name = 'Amazon.com'
-    version = (1, 3, 7)
+    version = (1, 3, 8)
    minimum_calibre_version = (2, 82, 0)
    description = _('Downloads metadata and covers from Amazon')

@@ -1684,13 +1684,20 @@ class Amazon(Source):
        if not self.use_search_engine:
            return True
        if title is not None:
+            import regex
+            only_punctuation_pat = regex.compile(r'^\p{P}+$')

            def tokenize_title(x):
-                return icu_lower(x).replace("'", '').replace('"', '').rstrip(':')
+                ans = icu_lower(x).replace("'", '').replace('"', '').rstrip(':')
+                if only_punctuation_pat.match(ans) is not None:
+                    ans = ''
+                return ans

            tokens = {tokenize_title(x) for x in title.split() if len(x) > 3}
+            tokens.discard('')
            if tokens:
                result_tokens = {tokenize_title(x) for x in mi.title.split()}
+                result_tokens.discard('')
                if not tokens.intersection(result_tokens):
                    log('Ignoring result:', mi.title, 'as its title does not match')
                    return False