mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Intelligent name capitalization for the author mapper
This commit is contained in:
parent
848a7267d2
commit
2a4d733eac
@ -2,10 +2,41 @@
|
|||||||
# vim:fileencoding=utf-8
|
# vim:fileencoding=utf-8
|
||||||
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
|
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
from __future__ import (unicode_literals, division, absolute_import,
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
print_function)
|
|
||||||
|
import re
|
||||||
from collections import deque
|
from collections import deque
|
||||||
|
|
||||||
|
from calibre.utils.icu import capitalize, lower, upper
|
||||||
|
|
||||||
|
|
||||||
|
def cap_author_token(token):
    """Return *token* with name-aware capitalization.

    *token* is a single word of an author name (the ``capitalize`` rule
    splits the author string on whitespace and passes each word here).

    Rules, applied in order:

    * The particles ``von``, ``de``, ``el``, ``van`` and ``le`` (compared
      case-insensitively) are returned lower-cased.
    * A token made only of two or more "letter + dot" groups, such as
      ``J.K.``, is returned as capitalized initials separated by spaces
      (``J. K.``).
    * Anything else is passed through ``capitalize()``. In addition:

      - after a leading ``mc``/``mac`` the next character is upper-cased,
        but only if that character is unchanged by upper-casing in the
        input or the whole input is already lower case;
      - the character following the first ``-`` and the first ``'`` is
        upper-cased, provided at least two characters follow the
        separator.
    """

    def upper_at(text, pos):
        # Copy of text with the single character at pos upper-cased.
        return text[:pos] + upper(text[pos]) + text[pos + 1:]

    lowered = lower(token)
    if lowered in ('von', 'de', 'el', 'van', 'le'):
        return lowered

    # [^\d\W] matches a word character that is not a digit, so the pattern
    # accepts only runs of dotted initials (no digits, no special characters).
    if re.match(r'([^\d\W]\.){2,}$', lowered, re.UNICODE) is not None:
        # Normalize tokens of the form J.K. to J. K.
        initials = [capitalize(piece) for piece in token.split('.')]
        # The split leaves a trailing empty piece; strip() drops the
        # resulting trailing space.
        return '. '.join(initials).strip()

    # Length of a recognised Mc/Mac prefix, or None when there is none.
    prefix_len = None
    plain_lower = token.lower()
    for prefix in ('mc', 'mac'):
        size = len(prefix)
        if not plain_lower.startswith(prefix) or len(token) <= size:
            continue
        if token[size] == upper(token[size]) or lowered == token:
            prefix_len = size
            break

    result = capitalize(token)
    if prefix_len is not None:
        result = upper_at(result, prefix_len)

    for separator in ('-', "'"):
        pos = result.find(separator)
        # Requires two or more characters after the separator, so a single
        # trailing character (e.g. the s in Foo's) is left as is.
        if pos != -1 and pos + 2 < len(result):
            result = upper_at(result, pos + 1)
    return result
|
||||||
|
|
||||||
|
|
||||||
def compile_pat(pat):
|
def compile_pat(pat):
|
||||||
import regex
|
import regex
|
||||||
@ -74,7 +105,7 @@ def apply_rules(author, rules):
|
|||||||
authors.appendleft(author)
|
authors.appendleft(author)
|
||||||
break
|
break
|
||||||
if ac == 'capitalize':
|
if ac == 'capitalize':
|
||||||
ans.append(author.capitalize())
|
ans.append(' '.join(map(cap_author_token, author.split())))
|
||||||
break
|
break
|
||||||
if ac == 'lower':
|
if ac == 'lower':
|
||||||
ans.append(icu_lower(author))
|
ans.append(icu_lower(author))
|
||||||
|
@ -12,8 +12,8 @@ from future_builtins import map
|
|||||||
|
|
||||||
from calibre import browser, random_user_agent
|
from calibre import browser, random_user_agent
|
||||||
from calibre.customize import Plugin
|
from calibre.customize import Plugin
|
||||||
from calibre.utils.icu import capitalize, lower, upper
|
|
||||||
from calibre.ebooks.metadata import check_isbn
|
from calibre.ebooks.metadata import check_isbn
|
||||||
|
from calibre.ebooks.metadata.author_mapper import cap_author_token
|
||||||
from calibre.utils.localization import canonicalize_lang, get_lang
|
from calibre.utils.localization import canonicalize_lang, get_lang
|
||||||
|
|
||||||
|
|
||||||
@ -128,34 +128,6 @@ def load_caches(dump):
|
|||||||
p.load_caches(cache)
|
p.load_caches(cache)
|
||||||
|
|
||||||
|
|
||||||
def cap_author_token(token):
    """Return *token* with name-aware capitalization.

    *token* is expected to be a single word of an author name.

    Rules, applied in order:

    * The particles ``von``, ``de``, ``el``, ``van`` and ``le`` (compared
      case-insensitively) are returned lower-cased.
    * A token made only of two or more "letter + dot" groups, such as
      ``J.K.``, is returned as capitalized initials separated by spaces
      (``J. K.``).
    * Anything else is passed through ``capitalize()``. In addition:

      - after a leading ``mc``/``mac`` the next character is upper-cased,
        but only if that character is unchanged by upper-casing in the
        input or the whole input is already lower case;
      - the character following the first ``-`` and the first ``'`` is
        upper-cased, provided at least two characters follow the
        separator.
    """

    def upper_at(text, pos):
        # Copy of text with the single character at pos upper-cased.
        return text[:pos] + upper(text[pos]) + text[pos + 1:]

    lowered = lower(token)
    if lowered in ('von', 'de', 'el', 'van', 'le'):
        return lowered

    # [^\d\W] matches a word character that is not a digit, so the pattern
    # accepts only runs of dotted initials (no digits, no special characters).
    if re.match(r'([^\d\W]\.){2,}$', lowered, re.UNICODE) is not None:
        # Normalize tokens of the form J.K. to J. K.
        initials = [capitalize(piece) for piece in token.split('.')]
        # The split leaves a trailing empty piece; strip() drops the
        # resulting trailing space.
        return '. '.join(initials).strip()

    # Length of a recognised Mc/Mac prefix, or None when there is none.
    prefix_len = None
    plain_lower = token.lower()
    for prefix in ('mc', 'mac'):
        size = len(prefix)
        if not plain_lower.startswith(prefix) or len(token) <= size:
            continue
        if token[size] == upper(token[size]) or lowered == token:
            prefix_len = size
            break

    result = capitalize(token)
    if prefix_len is not None:
        result = upper_at(result, prefix_len)

    for separator in ('-', "'"):
        pos = result.find(separator)
        # Requires two or more characters after the separator, so a single
        # trailing character (e.g. the s in Foo's) is left as is.
        if pos != -1 and pos + 2 < len(result):
            result = upper_at(result, pos + 1)
    return result
|
|
||||||
|
|
||||||
|
|
||||||
def fixauthors(authors):
|
def fixauthors(authors):
|
||||||
if not authors:
|
if not authors:
|
||||||
return authors
|
return authors
|
||||||
|
Loading…
x
Reference in New Issue
Block a user