def cap_author_token(token):
    """Return a single token of an author name with name-aware capitalization.

    ``token`` is one whitespace-free piece of an author name (the caller
    splits the full name on whitespace and re-joins the results).

    Rules, applied in order:

    * The particles ``von``, ``de``, ``el``, ``van`` and ``le`` are returned
      in lower case.
    * A run of two or more single-letter initials, each followed by a
      period (``J.K.``), is returned as spaced, capitalized initials
      (``J. K.``).
    * Otherwise the token is capitalized. If it starts with ``mc`` or
      ``mac`` and either the letter after that prefix was already upper
      case or the whole token was lower case, the letter after the prefix
      is upper-cased as well (``mcdonald`` -> ``McDonald``).
    * The character following every ``-`` and every ``'`` is upper-cased,
      provided at least one more character follows it.
    """
    lt = lower(token)
    if lt in ('von', 'de', 'el', 'van', 'le'):
        return lt
    # Letters only (no digits, no special characters), each followed by a
    # period, at least twice in a row.
    if re.match(r'([^\d\W]\.){2,}$', lt, re.UNICODE) is not None:
        # Normalize tokens of the form J.K. to J. K.
        parts = token.split('.')
        return '. '.join(map(capitalize, parts)).strip()
    scots_name = None
    for x in ('mc', 'mac'):
        # Reuse the already computed lower-cased token instead of lowering
        # it again with a different function on every iteration.
        if (lt.startswith(x) and len(token) > len(x) and
                (
                    token[len(x)] == upper(token[len(x)]) or
                    lt == token
                )):
            scots_name = len(x)
            break
    ans = capitalize(token)
    if scots_name is not None:
        ans = ans[:scots_name] + upper(ans[scots_name]) + ans[scots_name+1:]
    for x in ('-', "'"):
        # Visit every occurrence of the separator, not just the first, so
        # that tokens with several hyphens are capitalized consistently.
        idx = ans.find(x)
        while idx > -1:
            # A single trailing character after the separator is left
            # untouched (same guard as before).
            if len(ans) > idx+2:
                ans = ans[:idx+1] + upper(ans[idx+1]) + ans[idx+2:]
            idx = ans.find(x, idx+1)
    return ans
'.join(map(capitalize, parts)).strip() - scots_name = None - for x in ('mc', 'mac'): - if (token.lower().startswith(x) and len(token) > len(x) and - ( - token[len(x)] == upper(token[len(x)]) or - lt == token - )): - scots_name = len(x) - break - ans = capitalize(token) - if scots_name is not None: - ans = ans[:scots_name] + upper(ans[scots_name]) + ans[scots_name+1:] - for x in ('-', "'"): - idx = ans.find(x) - if idx > -1 and len(ans) > idx+2: - ans = ans[:idx+1] + upper(ans[idx+1]) + ans[idx+2:] - return ans - - def fixauthors(authors): if not authors: return authors