ENH: Added surname prefixes to author sort

If an author last name is preceded by von, van, di, la, le, da, de, etc. include that in the last name
This commit is contained in:
Joseph R. Fox-Rabinovitz 2021-01-15 08:05:05 -05:00
parent ed0f7e9684
commit b1ecfe1fdf
2 changed files with 29 additions and 24 deletions

View File

@ -69,6 +69,7 @@ author_name_suffixes = ('Jr', 'Sr', 'Inc', 'Ph.D', 'Phd',
author_name_prefixes = ('Mr', 'Mrs', 'Ms', 'Dr', 'Prof')
author_name_copywords = ('Corporation', 'Company', 'Co.', 'Agency', 'Council',
'Committee', 'Inc.', 'Institute', 'Society', 'Club', 'Team')
author_surname_prefixes = ('da', 'de', 'di', 'la', 'le', 'van', 'von')
#: Splitting multiple author names
# By default, calibre splits a string containing multiple author names on

View File

@ -46,23 +46,26 @@ def remove_bracketed_text(src, brackets=None):
brackets = {'(': ')', '[': ']', '{': '}'}
from collections import Counter
counts = Counter()
total = 0
buf = []
src = force_unicode(src)
rmap = {v: k for k, v in iteritems(brackets)}
for char in src:
if char in brackets:
counts[char] += 1
total += 1
elif char in rmap:
idx = rmap[char]
if counts[idx] > 0:
counts[idx] -= 1
elif sum(itervalues(counts)) < 1:
total -= 1
elif total < 1:
buf.append(char)
return ''.join(buf)
def author_to_author_sort(author, method=None):
if not author:
if not author or method == 'copy':
return ''
sauthor = remove_bracketed_text(author).strip()
tokens = sauthor.split()
@ -70,45 +73,46 @@ def author_to_author_sort(author, method=None):
return author
if method is None:
method = tweaks['author_sort_copy_method']
if method == 'copy':
return author
ltoks = frozenset(x.lower() for x in tokens)
copy_words = frozenset(x.lower() for x in tweaks['author_name_copywords'])
if ltoks.intersection(copy_words):
method = 'copy'
return author
if method == 'copy':
author_surname_prefixes = frozenset(x.lower() for x in tweaks['author_surname_prefixes'])
if len(tokens) == 2 and tokens[0].lower() in author_surname_prefixes:
return author
if method == 'comma' and any(',' in t for t in tokens):
return author
prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']}
prefixes |= {y+'.' for y in prefixes}
while True:
if not tokens:
return author
tok = tokens[0].lower()
if tok in prefixes:
tokens = tokens[1:]
else:
for first in range(len(tokens)):
if tokens[first].lower() not in prefixes:
break
else:
return author
suffixes = {force_unicode(y).lower() for y in tweaks['author_name_suffixes']}
suffixes |= {y+'.' for y in suffixes}
suffix = ''
while True:
if not tokens:
return author
last = tokens[-1].lower()
if last in suffixes:
suffix = tokens[-1] + ' ' + suffix
tokens = tokens[:-1]
else:
for last in range(len(tokens) - 1, first - 1, -1):
if tokens[last].lower() not in suffixes:
break
suffix = suffix.strip()
if method == 'comma' and ',' in ''.join(tokens):
else:
return author
atokens = tokens[-1:] + tokens[:-1]
suffix = ' '.join(tokens[last + 1:])
if last > first and tokens[last - 1].lower() in author_surname_prefixes:
tokens[last - 1] += ' ' + tokens[last]
last -= 1
atokens = tokens[last:last + 1] + tokens[first:last]
num_toks = len(atokens)
if suffix:
atokens.append(suffix)