Intelligent name capitalization for the author mapper

This commit is contained in:
Kovid Goyal 2018-07-18 14:30:23 +05:30
parent 848a7267d2
commit 2a4d733eac
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 35 additions and 32 deletions

View File

@ -2,10 +2,41 @@
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import (unicode_literals, division, absolute_import,
print_function)
from __future__ import absolute_import, division, print_function, unicode_literals
import re
from collections import deque
from calibre.utils.icu import capitalize, lower, upper
def cap_author_token(token):
    """Return a single author-name token with name-aware capitalization.

    Handled cases, in order:

    * The particles ``von``, ``de``, ``el``, ``van`` and ``le`` are
      returned in lower case.
    * Runs of dotted initials are split on the dots and re-joined with
      ``'. '`` so that ``J.K.`` becomes ``J. K.``.
    * Tokens starting with ``mc``/``mac`` get the character right after
      the prefix upper-cased, provided that character already equals its
      upper-cased form or the whole token was given in lower case.
    * The character following the first ``-`` and the first ``'`` is
      upper-cased when at least two characters follow the separator.

    Everything else is simply passed through ``capitalize()``.
    """
    def upper_at(text, pos):
        # Upper-case exactly one character of text, leaving the rest as is.
        return text[:pos] + upper(text[pos]) + text[pos + 1:]

    lowered = lower(token)
    particles = ('von', 'de', 'el', 'van', 'le')
    if lowered in particles:
        return lowered

    # Two or more "<word character that is not a digit>." groups and
    # nothing else: treat the token as a run of initials.
    if re.match(r'([^\d\W]\.){2,}$', lowered, re.UNICODE):
        pieces = (capitalize(piece) for piece in token.split('.'))
        # The trailing dot yields an empty last piece; strip() drops the
        # space that the join puts after the final '.'.
        return '. '.join(pieces).strip()

    prefix_len = None
    for prefix in ('mc', 'mac'):
        size = len(prefix)
        if not token.lower().startswith(prefix) or len(token) <= size:
            continue
        following = token[size]
        if following == upper(following) or lowered == token:
            prefix_len = size
            break

    result = capitalize(token)
    if prefix_len is not None:
        result = upper_at(result, prefix_len)

    for separator in ('-', "'"):
        where = result.find(separator)
        # Only the first occurrence is considered, and only when it is
        # followed by at least two characters.
        if where > -1 and len(result) > where + 2:
            result = upper_at(result, where + 1)
    return result
def compile_pat(pat):
import regex
@ -74,7 +105,7 @@ def apply_rules(author, rules):
authors.appendleft(author)
break
if ac == 'capitalize':
ans.append(author.capitalize())
ans.append(' '.join(map(cap_author_token, author.split())))
break
if ac == 'lower':
ans.append(icu_lower(author))

View File

@ -12,8 +12,8 @@ from future_builtins import map
from calibre import browser, random_user_agent
from calibre.customize import Plugin
from calibre.utils.icu import capitalize, lower, upper
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.author_mapper import cap_author_token
from calibre.utils.localization import canonicalize_lang, get_lang
@ -128,34 +128,6 @@ def load_caches(dump):
p.load_caches(cache)
def cap_author_token(token):
    """Return a single author-name token with name-aware capitalization.

    Handled cases, in order:

    * The particles ``von``, ``de``, ``el``, ``van`` and ``le`` are
      returned in lower case.
    * Runs of dotted initials are split on the dots and re-joined with
      ``'. '`` so that ``J.K.`` becomes ``J. K.``.
    * Tokens starting with ``mc``/``mac`` get the character right after
      the prefix upper-cased, provided that character already equals its
      upper-cased form or the whole token was given in lower case.
    * The character following the first ``-`` and the first ``'`` is
      upper-cased when at least two characters follow the separator.

    Everything else is simply passed through ``capitalize()``.
    """
    def upper_at(text, pos):
        # Upper-case exactly one character of text, leaving the rest as is.
        return text[:pos] + upper(text[pos]) + text[pos + 1:]

    lowered = lower(token)
    particles = ('von', 'de', 'el', 'van', 'le')
    if lowered in particles:
        return lowered

    # Two or more "<word character that is not a digit>." groups and
    # nothing else: treat the token as a run of initials.
    if re.match(r'([^\d\W]\.){2,}$', lowered, re.UNICODE):
        pieces = (capitalize(piece) for piece in token.split('.'))
        # The trailing dot yields an empty last piece; strip() drops the
        # space that the join puts after the final '.'.
        return '. '.join(pieces).strip()

    prefix_len = None
    for prefix in ('mc', 'mac'):
        size = len(prefix)
        if not token.lower().startswith(prefix) or len(token) <= size:
            continue
        following = token[size]
        if following == upper(following) or lowered == token:
            prefix_len = size
            break

    result = capitalize(token)
    if prefix_len is not None:
        result = upper_at(result, prefix_len)

    for separator in ('-', "'"):
        where = result.find(separator)
        # Only the first occurrence is considered, and only when it is
        # followed by at least two characters.
        if where > -1 and len(result) > where + 2:
            result = upper_at(result, where + 1)
    return result
def fixauthors(authors):
if not authors:
return authors