From 45adf469440382c8e68eeddde1a66064c5e865a0 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 17 Apr 2011 11:13:57 -0600 Subject: [PATCH] Improved get_title_tokens --- src/calibre/ebooks/metadata/sources/base.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index f322fcdb56..f98209e580 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -294,8 +294,16 @@ class Source(Plugin): Excludes connectives and punctuation. ''' if title: - pat = re.compile(r'''[-,:;+!@#$%^&*(){}.`~"'\s\[\]/]''') - title = pat.sub(' ', title) + title_patterns = [(re.compile(pat, re.IGNORECASE), repl) for pat, repl in + [ + (r'(?i)[({\[](\d{4}|omnibus|anthology|hardcover|paperback|mass\s*market|edition|ed\.)[\])}]', ''), + (r'(\d+),(\d+)', r'\1\2'), + (r'(\s-)', ' '), + (r"'", ''), + (r'''[:,;+!@#$%^&*(){}.`~"\s\[\]/]''', ' ') + ]] + for pat, repl in title_patterns: + title = pat.sub(repl, title) tokens = title.split() for token in tokens: token = token.strip()