ENH: Added surname prefixes to author sort

If an author's last name is preceded by von, van, di, la, le, da, de, etc., include that prefix in the last name.
This commit is contained in:
Joseph R. Fox-Rabinovitz 2021-01-15 08:05:05 -05:00
parent ed0f7e9684
commit b1ecfe1fdf
2 changed files with 29 additions and 24 deletions

View File

@ -69,6 +69,7 @@ author_name_suffixes = ('Jr', 'Sr', 'Inc', 'Ph.D', 'Phd',
author_name_prefixes = ('Mr', 'Mrs', 'Ms', 'Dr', 'Prof') author_name_prefixes = ('Mr', 'Mrs', 'Ms', 'Dr', 'Prof')
author_name_copywords = ('Corporation', 'Company', 'Co.', 'Agency', 'Council', author_name_copywords = ('Corporation', 'Company', 'Co.', 'Agency', 'Council',
'Committee', 'Inc.', 'Institute', 'Society', 'Club', 'Team') 'Committee', 'Inc.', 'Institute', 'Society', 'Club', 'Team')
author_surname_prefixes = ('da', 'de', 'di', 'la', 'le', 'van', 'von')
#: Splitting multiple author names #: Splitting multiple author names
# By default, calibre splits a string containing multiple author names on # By default, calibre splits a string containing multiple author names on

View File

@ -46,23 +46,26 @@ def remove_bracketed_text(src, brackets=None):
brackets = {'(': ')', '[': ']', '{': '}'} brackets = {'(': ')', '[': ']', '{': '}'}
from collections import Counter from collections import Counter
counts = Counter() counts = Counter()
total = 0
buf = [] buf = []
src = force_unicode(src) src = force_unicode(src)
rmap = {v: k for k, v in iteritems(brackets)} rmap = {v: k for k, v in iteritems(brackets)}
for char in src: for char in src:
if char in brackets: if char in brackets:
counts[char] += 1 counts[char] += 1
total += 1
elif char in rmap: elif char in rmap:
idx = rmap[char] idx = rmap[char]
if counts[idx] > 0: if counts[idx] > 0:
counts[idx] -= 1 counts[idx] -= 1
elif sum(itervalues(counts)) < 1: total -= 1
elif total < 1:
buf.append(char) buf.append(char)
return ''.join(buf) return ''.join(buf)
def author_to_author_sort(author, method=None): def author_to_author_sort(author, method=None):
if not author: if not author or method == 'copy':
return '' return ''
sauthor = remove_bracketed_text(author).strip() sauthor = remove_bracketed_text(author).strip()
tokens = sauthor.split() tokens = sauthor.split()
@ -70,45 +73,46 @@ def author_to_author_sort(author, method=None):
return author return author
if method is None: if method is None:
method = tweaks['author_sort_copy_method'] method = tweaks['author_sort_copy_method']
if method == 'copy':
return author
ltoks = frozenset(x.lower() for x in tokens) ltoks = frozenset(x.lower() for x in tokens)
copy_words = frozenset(x.lower() for x in tweaks['author_name_copywords']) copy_words = frozenset(x.lower() for x in tweaks['author_name_copywords'])
if ltoks.intersection(copy_words): if ltoks.intersection(copy_words):
method = 'copy' return author
if method == 'copy': author_surname_prefixes = frozenset(x.lower() for x in tweaks['author_surname_prefixes'])
if len(tokens) == 2 and tokens[0].lower() in author_surname_prefixes:
return author
if method == 'comma' and any(',' in t for t in tokens):
return author return author
prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']} prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']}
prefixes |= {y+'.' for y in prefixes} prefixes |= {y+'.' for y in prefixes}
while True:
if not tokens: for first in range(len(tokens)):
return author if tokens[first].lower() not in prefixes:
tok = tokens[0].lower()
if tok in prefixes:
tokens = tokens[1:]
else:
break break
else:
return author
suffixes = {force_unicode(y).lower() for y in tweaks['author_name_suffixes']} suffixes = {force_unicode(y).lower() for y in tweaks['author_name_suffixes']}
suffixes |= {y+'.' for y in suffixes} suffixes |= {y+'.' for y in suffixes}
suffix = '' for last in range(len(tokens) - 1, first - 1, -1):
while True: if tokens[last].lower() not in suffixes:
if not tokens:
return author
last = tokens[-1].lower()
if last in suffixes:
suffix = tokens[-1] + ' ' + suffix
tokens = tokens[:-1]
else:
break break
suffix = suffix.strip() else:
if method == 'comma' and ',' in ''.join(tokens):
return author return author
atokens = tokens[-1:] + tokens[:-1] suffix = ' '.join(tokens[last + 1:])
if last > first and tokens[last - 1].lower() in author_surname_prefixes:
tokens[last - 1] += ' ' + tokens[last]
last -= 1
atokens = tokens[last:last + 1] + tokens[first:last]
num_toks = len(atokens) num_toks = len(atokens)
if suffix: if suffix:
atokens.append(suffix) atokens.append(suffix)