Improve tokenization of CJK author names

This commit is contained in:
xcffl 2020-03-03 09:51:36 +08:00
parent 348338f75c
commit 3b4f584a91
No known key found for this signature in database
GPG Key ID: C64681FA6C2FA680

View File

@@ -343,8 +343,8 @@ class Source(Plugin):
if authors: if authors:
# Leave ' in there for Irish names # Leave ' in there for Irish names
remove_pat = re.compile(r'[!@#$%^&*(){}`~"\s\[\]/]') remove_pat = re.compile(r'[!@#$%^&*()()「」{}`~"\s\[\]/]')
replace_pat = re.compile(r'[-+.:;,]') replace_pat = re.compile(r'[-+.:;,,。;:]')
if only_first_author: if only_first_author:
authors = authors[:1] authors = authors[:1]
for au in authors: for au in authors:
@@ -384,7 +384,7 @@ class Source(Plugin):
# Remove hyphens only if they have whitespace before them # Remove hyphens only if they have whitespace before them
(r'(\s-)', ' '), (r'(\s-)', ' '),
# Replace other special chars with a space # Replace other special chars with a space
(r'''[:,;!@$%^&*(){}.`~"\s\[\]/]《》''', ' '), (r'''[:,;!@$%^&*(){}.`~"\s\[\]/]《》「」“”''', ' '),
]] ]]
for pat, repl in title_patterns: for pat, repl in title_patterns: