ENH: Added surname prefixes to author sort

If an author's last name is preceded by a surname prefix (von, van, di, la, le, da, de, etc.), include that prefix in the last name.
2025-12-08 06:05:04 -05:00 · 2021-01-15 08:05:05 -05:00 · 2021-01-15 08:05:05 -05:00 · b1ecfe1fdf
commit b1ecfe1fdf
parent ed0f7e9684
2 changed files with 29 additions and 24 deletions
--- a/resources/default_tweaks.py
+++ b/resources/default_tweaks.py
@ -69,6 +69,7 @@ author_name_suffixes = ('Jr', 'Sr', 'Inc', 'Ph.D', 'Phd',
 author_name_prefixes = ('Mr', 'Mrs', 'Ms', 'Dr', 'Prof')
 author_name_copywords = ('Corporation', 'Company', 'Co.', 'Agency', 'Council',
        'Committee', 'Inc.', 'Institute', 'Society', 'Club', 'Team')
 author_surname_prefixes = ('da', 'de', 'di', 'la', 'le', 'van', 'von')
 #: Splitting multiple author names
 # By default, calibre splits a string containing multiple author names on
--- a/src/calibre/ebooks/metadata/init.py
+++ b/src/calibre/ebooks/metadata/init.py
@ -46,23 +46,26 @@ def remove_bracketed_text(src, brackets=None):
        brackets = {'(': ')', '[': ']', '{': '}'}
    from collections import Counter
    counts = Counter()
    total = 0
    buf = []
    src = force_unicode(src)
    rmap = {v: k for k, v in iteritems(brackets)}
    for char in src:
        if char in brackets:
            counts[char] += 1
            total += 1
        elif char in rmap:
            idx = rmap[char]
            if counts[idx] > 0:
                counts[idx] -= 1
-        elif sum(itervalues(counts)) < 1:
+                total -= 1
        elif total < 1:
            buf.append(char)
    return ''.join(buf)
 def author_to_author_sort(author, method=None):
-    if not author:
+    if not author or method == 'copy':
        return ''
    sauthor = remove_bracketed_text(author).strip()
    tokens = sauthor.split()
@ -70,45 +73,46 @@ def author_to_author_sort(author, method=None):
        return author
    if method is None:
        method = tweaks['author_sort_copy_method']
    if method == 'copy':
        return author
    ltoks = frozenset(x.lower() for x in tokens)
    copy_words = frozenset(x.lower() for x in tweaks['author_name_copywords'])
    if ltoks.intersection(copy_words):
-        method = 'copy'
+        return author
-    if method == 'copy':
+    author_surname_prefixes = frozenset(x.lower() for x in tweaks['author_surname_prefixes'])
    if len(tokens) == 2 and tokens[0].lower() in author_surname_prefixes:
        return author
    if method == 'comma' and any(',' in t for t in tokens):
        return author
    prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']}
    prefixes |= {y+'.' for y in prefixes}
-    while True:
+
-        if not tokens:
+    for first in range(len(tokens)):
-            return author
+        if tokens[first].lower() not in prefixes:
        tok = tokens[0].lower()
        if tok in prefixes:
            tokens = tokens[1:]
        else:
            break
    else:
        return author
    suffixes = {force_unicode(y).lower() for y in tweaks['author_name_suffixes']}
    suffixes |= {y+'.' for y in suffixes}
-    suffix = ''
+    for last in range(len(tokens) - 1, first - 1, -1):
-    while True:
+        if tokens[last].lower() not in suffixes:
        if not tokens:
            return author
        last = tokens[-1].lower()
        if last in suffixes:
            suffix = tokens[-1] + ' ' + suffix
            tokens = tokens[:-1]
        else:
            break
-    suffix = suffix.strip()
+    else:
    if method == 'comma' and ',' in ''.join(tokens):
        return author
-    atokens = tokens[-1:] + tokens[:-1]
+    suffix = ' '.join(tokens[last + 1:])
    if last > first and tokens[last - 1].lower() in author_surname_prefixes:
        tokens[last - 1] += ' ' + tokens[last]
        last -= 1
    atokens = tokens[last:last + 1] + tokens[first:last]
    num_toks = len(atokens)
    if suffix:
        atokens.append(suffix)