Add a new tweak under "Author sort name algorithm" to optionally recognize common surname prefixes such as von, van, de, etc.

Merge branch 'author-sort' of https://github.com/madphysicist/calibre
2025-08-30 23:00:21 -04:00 · 2021-01-21 05:49:58 +05:30 · 2021-01-21 05:49:58 +05:30 · 6dc39a511a
commit 6dc39a511a
parent 310f92b868 f55fadd2ed
3 changed files with 192 additions and 37 deletions
--- a/resources/default_tweaks.py
+++ b/resources/default_tweaks.py
@ -53,22 +53,35 @@ authors_completer_append_separator = False
 #  comma : use 'copy' if there is a ',' in the name, otherwise use 'invert'
 #  nocomma : "fn ln" -> "ln fn" (without the comma)
 # When this tweak is changed, the author_sort values stored with each author
-# must be recomputed by right-clicking on an author in the left-hand tags panel,
+# must be recomputed by right-clicking on an author in the left-hand tags
-# selecting 'manage authors', and pressing 'Recalculate all author sort values'.
+# panel, selecting 'manage authors', and pressing
-# The author name suffixes are words that are ignored when they occur at the
+# 'Recalculate all author sort values'.
 #
 # The author_name_suffixes are words that are ignored when they occur at the
 # end of an author name. The case of the suffix is ignored and trailing
-# periods are automatically handled. The same is true for prefixes.
+# periods are automatically handled.
-# The author name copy words are a set of words which if they occur in an
+#
-# author name cause the automatically generated author sort string to be
+# The same is true for author_name_prefixes.
-# identical to the author name. This means that the sort for a string like Acme
+#
-# Inc. will be Acme Inc. instead of Inc., Acme
+# The author_name_copywords are a set of words which, if they occur in an
 # author name, cause the automatically generated author sort string to be
 # identical to the author name. This means that the sort for a string like
 # "Acme Inc." will be "Acme Inc." instead of "Inc., Acme".
 #
 # If author_use_surname_prefixes is enabled, any of the words in
 # author_surname_prefixes will be treated as a prefix to the surname, if they
 # occur before the surname. So for example, "John von Neumann" would be sorted
 # as "von Neumann, John" and not "Neumann, John von".
 author_sort_copy_method = 'comma'
 author_name_suffixes = ('Jr', 'Sr', 'Inc', 'Ph.D', 'Phd',
                        'MD', 'M.D', 'I', 'II', 'III', 'IV',
                        'Junior', 'Senior')
 author_name_prefixes = ('Mr', 'Mrs', 'Ms', 'Dr', 'Prof')
-author_name_copywords = ('Corporation', 'Company', 'Co.', 'Agency', 'Council',
+author_name_copywords = ('Agency', 'Corporation', 'Company', 'Co.', 'Council',
-        'Committee', 'Inc.', 'Institute', 'Society', 'Club', 'Team')
+                         'Committee', 'Inc.', 'Institute', 'National',
                         'Society', 'Club', 'Team')
 author_use_surname_prefixes = False
 author_surname_prefixes = ('da', 'de', 'di', 'la', 'le', 'van', 'von')
 #: Splitting multiple author names
 # By default, calibre splits a string containing multiple author names on
--- a/setup/test.py
+++ b/setup/test.py
@ -121,6 +121,8 @@ def find_tests(which_tests=None, exclude_tests=None):
        from calibre.gui2.viewer.annotations import find_tests
        a(find_tests())
    if ok('misc'):
        from calibre.ebooks.metadata import find_tests
        a(find_tests())
        from calibre.ebooks.metadata.tag_mapper import find_tests
        a(find_tests())
        from calibre.ebooks.metadata.author_mapper import find_tests
--- a/src/calibre/ebooks/metadata/init.py
+++ b/src/calibre/ebooks/metadata/init.py
@ -13,7 +13,7 @@ import os, sys, re
 from calibre import relpath, guess_type, prints, force_unicode
 from calibre.utils.config_base import tweaks
-from polyglot.builtins import codepoint_to_chr, unicode_type, range, map, zip, getcwd, iteritems, itervalues, as_unicode
+from polyglot.builtins import codepoint_to_chr, unicode_type, range, map, zip, getcwd, iteritems, as_unicode
 from polyglot.urllib import quote, unquote, urlparse
@ -46,17 +46,20 @@ def remove_bracketed_text(src, brackets=None):
        brackets = {'(': ')', '[': ']', '{': '}'}
    from collections import Counter
    counts = Counter()
    total = 0
    buf = []
    src = force_unicode(src)
    rmap = {v: k for k, v in iteritems(brackets)}
    for char in src:
        if char in brackets:
            counts[char] += 1
            total += 1
        elif char in rmap:
            idx = rmap[char]
            if counts[idx] > 0:
                counts[idx] -= 1
-        elif sum(itervalues(counts)) < 1:
+                total -= 1
        elif total < 1:
            buf.append(char)
    return ''.join(buf)
@ -64,51 +67,57 @@ def remove_bracketed_text(src, brackets=None):
 def author_to_author_sort(author, method=None):
    if not author:
        return ''
    if method is None:
        method = tweaks['author_sort_copy_method']
    if method == 'copy':
        return author
    sauthor = remove_bracketed_text(author).strip()
    if method == 'comma' and ',' in sauthor:
        return author
    tokens = sauthor.split()
    if len(tokens) < 2:
        return author
    if method is None:
        method = tweaks['author_sort_copy_method']
    ltoks = frozenset(x.lower() for x in tokens)
    copy_words = frozenset(x.lower() for x in tweaks['author_name_copywords'])
    if ltoks.intersection(copy_words):
        method = 'copy'
    if method == 'copy':
        return author
    author_use_surname_prefixes = tweaks['author_use_surname_prefixes']
    if author_use_surname_prefixes:
        author_surname_prefixes = frozenset(x.lower() for x in tweaks['author_surname_prefixes'])
        if len(tokens) == 2 and tokens[0].lower() in author_surname_prefixes:
            return author
    prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']}
    prefixes |= {y+'.' for y in prefixes}
-    while True:
+
-        if not tokens:
+    for first in range(len(tokens)):
-            return author
+        if tokens[first].lower() not in prefixes:
        tok = tokens[0].lower()
        if tok in prefixes:
            tokens = tokens[1:]
        else:
            break
    else:
        return author
    suffixes = {force_unicode(y).lower() for y in tweaks['author_name_suffixes']}
    suffixes |= {y+'.' for y in suffixes}
-    suffix = ''
+    for last in range(len(tokens) - 1, first - 1, -1):
-    while True:
+        if tokens[last].lower() not in suffixes:
        if not tokens:
            return author
        last = tokens[-1].lower()
        if last in suffixes:
            suffix = tokens[-1] + ' ' + suffix
            tokens = tokens[:-1]
        else:
            break
-    suffix = suffix.strip()
+    else:
    if method == 'comma' and ',' in ''.join(tokens):
        return author
-    atokens = tokens[-1:] + tokens[:-1]
+    suffix = ' '.join(tokens[last + 1:])
    if author_use_surname_prefixes:
        if last > first and tokens[last - 1].lower() in author_surname_prefixes:
            tokens[last - 1] += ' ' + tokens[last]
            last -= 1
    atokens = tokens[last:last + 1] + tokens[first:last]
    num_toks = len(atokens)
    if suffix:
        atokens.append(suffix)
@ -438,3 +447,134 @@ def rating_to_stars(value, allow_half_stars=False, star='★', half='⯨'):
    if allow_half_stars and r % 2:
        ans += half
    return ans
 def find_tests():
    '''
    Build the unit test suite for this module.

    Returns a unittest suite holding the test cases for
    remove_bracketed_text() and author_to_author_sort().
    '''
    import unittest
    from calibre.utils.config_base import Tweak

    class TestRemoveBracketedText(unittest.TestCase):
        # Each test feeds a literal string to remove_bracketed_text() and
        # compares the result against the expected literal.

        def test_brackets(self):
            # (), [] and {} groups are removed; <> is left untouched
            self.assertEqual(remove_bracketed_text('a[b]c(d)e{f}g<h>i'), 'aceg<h>i')

        def test_nested(self):
            self.assertEqual(remove_bracketed_text('a[[b]c(d)e{f}]g(h(i)j[k]l{m})n{{{o}}}p'), 'agnp')

        def test_mismatched(self):
            # Brackets closed in a different order than they were opened
            self.assertEqual(remove_bracketed_text('a[b(c]d)e'), 'ae')
            self.assertEqual(remove_bracketed_text('a{b(c}d)e'), 'ae')

        def test_extra_closed(self):
            # Closing brackets that have no matching opening bracket
            self.assertEqual(remove_bracketed_text('a]b}c)d'), 'abcd')
            self.assertEqual(remove_bracketed_text('a[b]c]d(e)f{g)h}i}j)k]l'), 'acdfijkl')

        def test_unclosed(self):
            # Opening brackets that are never closed
            self.assertEqual(remove_bracketed_text('a]b[c'), 'ab')
            self.assertEqual(remove_bracketed_text('a(b[c]d{e}f'), 'a')
            self.assertEqual(remove_bracketed_text('a{b}c{d[e]f(g)h'), 'ac')

    class TestAuthorToAuthorSort(unittest.TestCase):

        def check_all_methods(self, name, invert=None, comma=None,
                              nocomma=None, copy=None):
            '''
            Assert the result of author_to_author_sort(name, method) for the
            methods 'invert', 'copy', 'comma' and 'nocomma'.

            Expected values that are not supplied fall back as follows:
            invert -> name, comma -> invert, nocomma -> comma, copy -> name.
            '''
            methods = ('invert', 'copy', 'comma', 'nocomma')
            if invert is None:
                invert = name
            if comma is None:
                comma = invert
            if nocomma is None:
                nocomma = comma
            if copy is None:
                copy = name
            results = (invert, copy, comma, nocomma)
            for method, result in zip(methods, results):
                # Name the input and the method in the failure message, since
                # all four methods are checked by this single assertion line
                self.assertEqual(author_to_author_sort(name, method), result,
                                 'name=%r method=%r' % (name, method))

        def test_single(self):
            self.check_all_methods('Aristotle')

        def test_all_prefix(self):
            # Every token is one of the default author_name_prefixes
            self.check_all_methods('Mr. Dr Prof.')

        def test_all_suffix(self):
            # Every token is one of the default author_name_suffixes
            self.check_all_methods('Senior Inc')

        def test_copywords(self):
            # The quoted "Team" is expected not to count as a copy word,
            # while the bare word Team leaves the name unchanged
            self.check_all_methods('Don "Team" Smith',
                                   invert='Smith, Don "Team"',
                                   nocomma='Smith Don "Team"')
            self.check_all_methods('Don Team Smith')

        def test_national(self):
            c = tweaks['author_name_copywords']
            try:
                # Assume that 'author_name_copywords' is a common sequence type
                i = c.index('National')
            except ValueError:
                # If "National" not found, check first without, then temporarily add
                self.check_all_methods('National Lampoon',
                                       invert='Lampoon, National',
                                       nocomma='Lampoon National')
                t = type(c)
                with Tweak('author_name_copywords', c + t(['National'])):
                    self.check_all_methods('National Lampoon')
            else:
                # If "National" found, check with, then temporarily remove
                self.check_all_methods('National Lampoon')
                with Tweak('author_name_copywords', c[:i] + c[i + 1:]):
                    self.check_all_methods('National Lampoon',
                                           invert='Lampoon, National',
                                           nocomma='Lampoon National')

        def test_method(self):
            self.check_all_methods('Jane Doe',
                                   invert='Doe, Jane',
                                   nocomma='Doe Jane')

        def test_invalid_method(self):
            # Invalid string defaults to invert
            name = 'Jane, Q. van Doe[ed] Jr.'
            self.assertEqual(author_to_author_sort(name, 'invert'),
                             author_to_author_sort(name, '__unknown__!(*T^U$'))

        def test_prefix_suffix(self):
            self.check_all_methods('Mrs. Jane Q. Doe III',
                                   invert='Doe, Jane Q. III',
                                   nocomma='Doe Jane Q. III')

        def test_surname_prefix(self):
            # Same names checked with the surname prefix tweak on and off
            with Tweak('author_use_surname_prefixes', True):
                self.check_all_methods('Leonardo Da Vinci',
                                       invert='Da Vinci, Leonardo',
                                       nocomma='Da Vinci Leonardo')
                self.check_all_methods('Van Gogh')
                self.check_all_methods('Van')
            with Tweak('author_use_surname_prefixes', False):
                self.check_all_methods('Leonardo Da Vinci',
                                       invert='Vinci, Leonardo Da',
                                       nocomma='Vinci Leonardo Da')
                self.check_all_methods('Van Gogh',
                                       invert='Gogh, Van',
                                       nocomma='Gogh Van')

        def test_comma(self):
            # Only the 'comma' method returns a name containing a comma as is
            self.check_all_methods('James Wesley, Rawles',
                                   invert='Rawles, James Wesley,',
                                   comma='James Wesley, Rawles',
                                   nocomma='Rawles James Wesley,')

        def test_brackets(self):
            # Bracketed text is expected to be absent from the sort value
            self.check_all_methods('Seventh Author [7]',
                                   invert='Author, Seventh',
                                   nocomma='Author Seventh')
            self.check_all_methods('John [x]von Neumann (III)',
                                   invert='Neumann, John von',
                                   nocomma='Neumann John von')

        def test_falsy(self):
            # Falsy inputs other than '' are expected to produce ''
            self.check_all_methods('')
            self.check_all_methods(None, '', '', '', '')
            self.check_all_methods([], '', '', '', '')

    ans = unittest.defaultTestLoader.loadTestsFromTestCase(TestRemoveBracketedText)
    ans.addTests(unittest.defaultTestLoader.loadTestsFromTestCase(TestAuthorToAuthorSort))
    return ans