Improved get_title_tokens

2025-07-09 03:04:10 -04:00 · 2011-04-17 11:13:57 -06:00 · 2011-04-17 11:13:57 -06:00 · 45adf46944
commit 45adf46944
parent 8ee74bd2fb
1 changed file with 10 additions and 2 deletions
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -294,8 +294,16 @@ class Source(Plugin):
        Excludes connectives and punctuation.
        '''
        if title:
-            pat = re.compile(r'''[-,:;+!@#$%^&*(){}.`~"'\s\[\]/]''')
-            title = pat.sub(' ', title)
+            title_patterns = [(re.compile(pat, re.IGNORECASE), repl) for pat, repl in
+            [
+                (r'(?i)[({\[](\d{4}|omnibus|anthology|hardcover|paperback|mass\s*market|edition|ed\.)[\])}]', ''),
+                (r'(\d+),(\d+)', r'\1\2'),
+                (r'(\s-)', ' '),
+                (r"'", ''),
+                (r'''[:,;+!@#$%^&*(){}.`~"\s\[\]/]''', ' ')
+            ]]
+            for pat, repl in title_patterns:
+                title = pat.sub(repl, title)
            tokens = title.split()
            for token in tokens:
                token = token.strip()