From 3b4f584a91913c574d9e6effde9db209b92a4834 Mon Sep 17 00:00:00 2001 From: xcffl <--list> Date: Tue, 3 Mar 2020 09:51:36 +0800 Subject: [PATCH] Improve tokenization of CJK author names --- src/calibre/ebooks/metadata/sources/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index bd7ed3d9a0..2d05d17814 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -343,8 +343,8 @@ class Source(Plugin): if authors: # Leave ' in there for Irish names - remove_pat = re.compile(r'[!@#$%^&*(){}`~"\s\[\]/]') - replace_pat = re.compile(r'[-+.:;,]') + remove_pat = re.compile(r'[!@#$%^&*()()「」{}`~"\s\[\]/]') + replace_pat = re.compile(r'[-+.:;,,。;:]') if only_first_author: authors = authors[:1] for au in authors: @@ -384,7 +384,7 @@ class Source(Plugin): # Remove hyphens only if they have whitespace before them (r'(\s-)', ' '), # Replace other special chars with a space - (r'''[:,;!@$%^&*(){}.`~"\s\[\]/]《》''', ' '), + (r'''[:,;!@$%^&*(){}.`~"\s\[\]/《》「」“”]''', ' '), ]] for pat, repl in title_patterns: