From b1ecfe1fdfa646e5735e64bb0f4314ed29517afb Mon Sep 17 00:00:00 2001 From: "Joseph R. Fox-Rabinovitz" Date: Fri, 15 Jan 2021 08:05:05 -0500 Subject: [PATCH 1/5] ENH: Added surname prefixes to author sort If an author last name is preceded by von, van, di, la, le, da, de, etc. include that in the last name --- resources/default_tweaks.py | 1 + src/calibre/ebooks/metadata/__init__.py | 52 +++++++++++++------------ 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index 3ee6cd5561..cb54af8f9d 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -69,6 +69,7 @@ author_name_suffixes = ('Jr', 'Sr', 'Inc', 'Ph.D', 'Phd', author_name_prefixes = ('Mr', 'Mrs', 'Ms', 'Dr', 'Prof') author_name_copywords = ('Corporation', 'Company', 'Co.', 'Agency', 'Council', 'Committee', 'Inc.', 'Institute', 'Society', 'Club', 'Team') +author_surname_prefixes = ('da', 'de', 'di', 'la', 'le', 'van', 'von') #: Splitting multiple author names # By default, calibre splits a string containing multiple author names on diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py index 609918588e..68ff128651 100644 --- a/src/calibre/ebooks/metadata/__init__.py +++ b/src/calibre/ebooks/metadata/__init__.py @@ -46,23 +46,26 @@ def remove_bracketed_text(src, brackets=None): brackets = {'(': ')', '[': ']', '{': '}'} from collections import Counter counts = Counter() + total = 0 buf = [] src = force_unicode(src) rmap = {v: k for k, v in iteritems(brackets)} for char in src: if char in brackets: counts[char] += 1 + total += 1 elif char in rmap: idx = rmap[char] if counts[idx] > 0: counts[idx] -= 1 - elif sum(itervalues(counts)) < 1: + total -= 1 + elif total < 1: buf.append(char) return ''.join(buf) def author_to_author_sort(author, method=None): - if not author: + if not author or method == 'copy': return '' sauthor = remove_bracketed_text(author).strip() tokens = 
sauthor.split() @@ -70,45 +73,46 @@ def author_to_author_sort(author, method=None): return author if method is None: method = tweaks['author_sort_copy_method'] + if method == 'copy': + return author ltoks = frozenset(x.lower() for x in tokens) copy_words = frozenset(x.lower() for x in tweaks['author_name_copywords']) if ltoks.intersection(copy_words): - method = 'copy' + return author - if method == 'copy': + author_surname_prefixes = frozenset(x.lower() for x in tweaks['author_surname_prefixes']) + if len(tokens) == 2 and tokens[0].lower() in author_surname_prefixes: + return author + + if method == 'comma' and any(',' in t for t in tokens): return author prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']} prefixes |= {y+'.' for y in prefixes} - while True: - if not tokens: - return author - tok = tokens[0].lower() - if tok in prefixes: - tokens = tokens[1:] - else: + + for first in range(len(tokens)): + if tokens[first].lower() not in prefixes: break + else: + return author suffixes = {force_unicode(y).lower() for y in tweaks['author_name_suffixes']} suffixes |= {y+'.' for y in suffixes} - suffix = '' - while True: - if not tokens: - return author - last = tokens[-1].lower() - if last in suffixes: - suffix = tokens[-1] + ' ' + suffix - tokens = tokens[:-1] - else: + for last in range(len(tokens) - 1, first - 1, -1): + if tokens[last].lower() not in suffixes: break - suffix = suffix.strip() - - if method == 'comma' and ',' in ''.join(tokens): + else: return author - atokens = tokens[-1:] + tokens[:-1] + suffix = ' '.join(tokens[last + 1:]) + + if last > first and tokens[last - 1].lower() in author_surname_prefixes: + tokens[last - 1] += ' ' + tokens[last] + last -= 1 + + atokens = tokens[last:last + 1] + tokens[first:last] num_toks = len(atokens) if suffix: atokens.append(suffix) From efe490b1b70fb4895d0add2b59ce572c4e74fc84 Mon Sep 17 00:00:00 2001 From: "Joseph R. 
Fox-Rabinovitz" Date: Fri, 15 Jan 2021 09:34:43 -0500 Subject: [PATCH 2/5] Added unit tests for author sort function --- setup/test.py | 2 + src/calibre/ebooks/metadata/__init__.py | 95 ++++++++++++++++++++++--- 2 files changed, 89 insertions(+), 8 deletions(-) diff --git a/setup/test.py b/setup/test.py index d4a6cc02bb..0256ee2433 100644 --- a/setup/test.py +++ b/setup/test.py @@ -121,6 +121,8 @@ def find_tests(which_tests=None, exclude_tests=None): from calibre.gui2.viewer.annotations import find_tests a(find_tests()) if ok('misc'): + from calibre.ebooks.metadata import find_tests + a(find_tests()) from calibre.ebooks.metadata.tag_mapper import find_tests a(find_tests()) from calibre.ebooks.metadata.author_mapper import find_tests diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py index 68ff128651..a517d566fb 100644 --- a/src/calibre/ebooks/metadata/__init__.py +++ b/src/calibre/ebooks/metadata/__init__.py @@ -65,17 +65,22 @@ def remove_bracketed_text(src, brackets=None): def author_to_author_sort(author, method=None): - if not author or method == 'copy': + if not author: return '' - sauthor = remove_bracketed_text(author).strip() - tokens = sauthor.split() - if len(tokens) < 2: - return author + if method is None: method = tweaks['author_sort_copy_method'] if method == 'copy': return author + sauthor = remove_bracketed_text(author).strip() + if method == 'comma' and ',' in sauthor: + return author + + tokens = sauthor.split() + if len(tokens) < 2: + return author + ltoks = frozenset(x.lower() for x in tokens) copy_words = frozenset(x.lower() for x in tweaks['author_name_copywords']) if ltoks.intersection(copy_words): @@ -85,9 +90,6 @@ def author_to_author_sort(author, method=None): if len(tokens) == 2 and tokens[0].lower() in author_surname_prefixes: return author - if method == 'comma' and any(',' in t for t in tokens): - return author - prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']} 
prefixes |= {y+'.' for y in prefixes} @@ -442,3 +444,80 @@ def rating_to_stars(value, allow_half_stars=False, star='★', half='⯨'): if allow_half_stars and r % 2: ans += half return ans + + +def find_tests(): + import unittest + + class TestRemoveBracketedText(unittest.TestCase): + def test_brackets(self): + self.assertEqual(remove_bracketed_text('a[b]c(d)e{f}gi'), 'acegi') + + def test_nested(self): + self.assertEqual(remove_bracketed_text('a[[b]c(d)e{f}]g(h(i)j[k]l{m})n{{{o}}}p'), 'agnp') + + def test_mismatched(self): + self.assertEqual(remove_bracketed_text('a[b(c]d)e'), 'ae') + self.assertEqual(remove_bracketed_text('a{b(c}d)e'), 'ae') + + def test_extra_closed(self): + self.assertEqual(remove_bracketed_text('a]b}c)d'), 'abcd') + self.assertEqual(remove_bracketed_text('a[b]c]d(e)f{g)h}i}j)k]l'), 'acdfijkl') + + def test_unclosed(self): + self.assertEqual(remove_bracketed_text('a]b[c'), 'ab') + self.assertEqual(remove_bracketed_text('a(b[c]d{e}f'), 'a') + self.assertEqual(remove_bracketed_text('a{b}c{d[e]f(g)h'), 'ac') + + class TestAuthorToAuthorSort(unittest.TestCase): + def check_all_methods(self, name, comma=None, nocomma=None, copy=None): + methods = ('copy', 'comma', 'nocomma') + if comma is None: + comma = name + if nocomma is None: + nocomma = comma + if copy is None: + copy = name + results = (copy, comma, nocomma) + for method, result in zip(methods, results): + self.assertEqual(author_to_author_sort(name, method), result) + + def test_single(self): + self.check_all_methods('Aristotle') + + def test_all_prefix(self): + self.check_all_methods('Mr. Dr Prof.') + + def test_all_suffix(self): + self.check_all_methods('Senior Inc') + + def test_copywords(self): + self.check_all_methods('Don "Team" Smith', 'Smith, Don "Team"', 'Smith Don "Team"') + self.check_all_methods('Don Team Smith') + + def test_method(self): + self.check_all_methods('Jane Doe', 'Doe, Jane', 'Doe Jane') + + def test_prefix_suffix(self): + self.check_all_methods('Mrs. Jane Q. 
Doe III', 'Doe, Jane Q. III', 'Doe Jane Q. III') + + def test_surname_prefix(self): + self.check_all_methods('Leonardo Da Vinci', 'Da Vinci, Leonardo', 'Da Vinci Leonardo') + self.check_all_methods('Van Gogh') + + def test_comma(self): + self.check_all_methods('James Wesley, Rawles', nocomma='Rawles James Wesley,') + + def test_brackets(self): + self.check_all_methods('Seventh Author [7]', 'Author, Seventh', 'Author Seventh') + self.check_all_methods('John [x]von Neumann (III)', 'von Neumann, John', 'von Neumann John') + + def test_falsy(self): + self.check_all_methods('') + self.check_all_methods(None, '', '', '') + self.check_all_methods([], '', '', '') + + ans = unittest.defaultTestLoader.loadTestsFromTestCase(TestRemoveBracketedText) + ans.addTests(unittest.defaultTestLoader.loadTestsFromTestCase(TestAuthorToAuthorSort)) + return ans + From f699b436a048e68ace6b49feea2dee5537ea84cc Mon Sep 17 00:00:00 2001 From: "Joseph R. Fox-Rabinovitz" Date: Wed, 20 Jan 2021 12:31:35 -0500 Subject: [PATCH 3/5] Added config key author_use_surname_prefixes and tests --- resources/default_tweaks.py | 6 ++-- src/calibre/ebooks/metadata/__init__.py | 43 +++++++++++++++++++------ 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index cb54af8f9d..bb48897314 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -67,8 +67,10 @@ author_name_suffixes = ('Jr', 'Sr', 'Inc', 'Ph.D', 'Phd', 'MD', 'M.D', 'I', 'II', 'III', 'IV', 'Junior', 'Senior') author_name_prefixes = ('Mr', 'Mrs', 'Ms', 'Dr', 'Prof') -author_name_copywords = ('Corporation', 'Company', 'Co.', 'Agency', 'Council', - 'Committee', 'Inc.', 'Institute', 'Society', 'Club', 'Team') +author_name_copywords = ('Agency', 'Corporation', 'Company', 'Co.', 'Council', + 'Committee', 'Inc.', 'Institute', 'National', + 'Society', 'Club', 'Team') +author_use_surname_prefixes = False author_surname_prefixes = ('da', 'de', 'di', 'la', 'le', 'van', 
'von') #: Splitting multiple author names diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py index a517d566fb..cecd12d3ae 100644 --- a/src/calibre/ebooks/metadata/__init__.py +++ b/src/calibre/ebooks/metadata/__init__.py @@ -86,9 +86,11 @@ def author_to_author_sort(author, method=None): if ltoks.intersection(copy_words): return author - author_surname_prefixes = frozenset(x.lower() for x in tweaks['author_surname_prefixes']) - if len(tokens) == 2 and tokens[0].lower() in author_surname_prefixes: - return author + author_use_surname_prefixes = tweaks['author_use_surname_prefixes'] + if author_use_surname_prefixes: + author_surname_prefixes = frozenset(x.lower() for x in tweaks['author_surname_prefixes']) + if len(tokens) == 2 and tokens[0].lower() in author_surname_prefixes: + return author prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']} prefixes |= {y+'.' for y in prefixes} @@ -110,9 +112,10 @@ def author_to_author_sort(author, method=None): suffix = ' '.join(tokens[last + 1:]) - if last > first and tokens[last - 1].lower() in author_surname_prefixes: - tokens[last - 1] += ' ' + tokens[last] - last -= 1 + if author_use_surname_prefixes: + if last > first and tokens[last - 1].lower() in author_surname_prefixes: + tokens[last - 1] += ' ' + tokens[last] + last -= 1 atokens = tokens[last:last + 1] + tokens[first:last] num_toks = len(atokens) @@ -448,6 +451,7 @@ def rating_to_stars(value, allow_half_stars=False, star='★', half='⯨'): def find_tests(): import unittest + from calibre.utils.config_base import Tweak class TestRemoveBracketedText(unittest.TestCase): def test_brackets(self): @@ -495,6 +499,23 @@ def find_tests(): self.check_all_methods('Don "Team" Smith', 'Smith, Don "Team"', 'Smith Don "Team"') self.check_all_methods('Don Team Smith') + def test_national(self): + c = tweaks['author_name_copywords'] + try: + # Assume that 'author_name_copywords' is a common sequence type + i = 
c.index('National') + except ValueError: + # If "National" not found, check first without, then temporarily add + self.check_all_methods('National Lampoon', 'Lampoon, National', 'Lampoon National') + t = type(c) + with Tweak('author_name_copywords', c + t(['National'])): + self.check_all_methods('National Lampoon') + else: + # If "National" found, check with, then temporarily remove + self.check_all_methods('National Lampoon') + with Tweak('author_name_copywords', c[:i] + c[i + 1:]): + self.check_all_methods('National Lampoon', 'Lampoon, National', 'Lampoon National') + def test_method(self): self.check_all_methods('Jane Doe', 'Doe, Jane', 'Doe Jane') @@ -502,15 +523,19 @@ def find_tests(): self.check_all_methods('Mrs. Jane Q. Doe III', 'Doe, Jane Q. III', 'Doe Jane Q. III') def test_surname_prefix(self): - self.check_all_methods('Leonardo Da Vinci', 'Da Vinci, Leonardo', 'Da Vinci Leonardo') - self.check_all_methods('Van Gogh') + with Tweak('author_use_surname_prefixes', True): + self.check_all_methods('Leonardo Da Vinci', 'Da Vinci, Leonardo', 'Da Vinci Leonardo') + self.check_all_methods('Van Gogh') + with Tweak('author_use_surname_prefixes', False): + self.check_all_methods('Leonardo Da Vinci', 'Vinci, Leonardo Da', 'Vinci Leonardo Da') + self.check_all_methods('Van Gogh', 'Gogh, Van', 'Gogh Van') def test_comma(self): self.check_all_methods('James Wesley, Rawles', nocomma='Rawles James Wesley,') def test_brackets(self): self.check_all_methods('Seventh Author [7]', 'Author, Seventh', 'Author Seventh') - self.check_all_methods('John [x]von Neumann (III)', 'von Neumann, John', 'von Neumann John') + self.check_all_methods('John [x]von Neumann (III)', 'Neumann, John von', 'Neumann John von') def test_falsy(self): self.check_all_methods('') From 4391ef0b0e081956cecd941577374b26469adbb1 Mon Sep 17 00:00:00 2001 From: "Joseph R. 
Fox-Rabinovitz" Date: Wed, 20 Jan 2021 12:45:15 -0500 Subject: [PATCH 4/5] Added "invert" method to tests --- src/calibre/ebooks/metadata/__init__.py | 67 ++++++++++++++++++------- 1 file changed, 50 insertions(+), 17 deletions(-) diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py index cecd12d3ae..673cb3fca9 100644 --- a/src/calibre/ebooks/metadata/__init__.py +++ b/src/calibre/ebooks/metadata/__init__.py @@ -474,15 +474,18 @@ def find_tests(): self.assertEqual(remove_bracketed_text('a{b}c{d[e]f(g)h'), 'ac') class TestAuthorToAuthorSort(unittest.TestCase): - def check_all_methods(self, name, comma=None, nocomma=None, copy=None): - methods = ('copy', 'comma', 'nocomma') + def check_all_methods(self, name, invert=None, comma=None, + nocomma=None, copy=None): + methods = ('invert', 'copy', 'comma', 'nocomma') + if invert is None: + invert = name if comma is None: - comma = name + comma = invert if nocomma is None: nocomma = comma if copy is None: copy = name - results = (copy, comma, nocomma) + results = (invert, copy, comma, nocomma) for method, result in zip(methods, results): self.assertEqual(author_to_author_sort(name, method), result) @@ -496,7 +499,9 @@ def find_tests(): self.check_all_methods('Senior Inc') def test_copywords(self): - self.check_all_methods('Don "Team" Smith', 'Smith, Don "Team"', 'Smith Don "Team"') + self.check_all_methods('Don "Team" Smith', + invert='Smith, Don "Team"', + nocomma='Smith Don "Team"') self.check_all_methods('Don Team Smith') def test_national(self): @@ -506,7 +511,9 @@ def find_tests(): i = c.index('National') except ValueError: # If "National" not found, check first without, then temporarily add - self.check_all_methods('National Lampoon', 'Lampoon, National', 'Lampoon National') + self.check_all_methods('National Lampoon', + invert='Lampoon, National', + nocomma='Lampoon National') t = type(c) with Tweak('author_name_copywords', c + t(['National'])): 
self.check_all_methods('National Lampoon') @@ -514,33 +521,59 @@ def find_tests(): # If "National" found, check with, then temporarily remove self.check_all_methods('National Lampoon') with Tweak('author_name_copywords', c[:i] + c[i + 1:]): - self.check_all_methods('National Lampoon', 'Lampoon, National', 'Lampoon National') + self.check_all_methods('National Lampoon', + invert='Lampoon, National', + nocomma='Lampoon National') def test_method(self): - self.check_all_methods('Jane Doe', 'Doe, Jane', 'Doe Jane') + self.check_all_methods('Jane Doe', + invert='Doe, Jane', + nocomma='Doe Jane') + + + def test_invalid_methos(self): + # Invalid string defaults to invert + name = 'Jane, Q. van Doe[ed] Jr.' + self.assertEqual(author_to_author_sort(name, 'invert'), + author_to_author_sort(name, '__unknown__!(*T^U$')) def test_prefix_suffix(self): - self.check_all_methods('Mrs. Jane Q. Doe III', 'Doe, Jane Q. III', 'Doe Jane Q. III') + self.check_all_methods('Mrs. Jane Q. Doe III', + invert='Doe, Jane Q. III', + nocomma='Doe Jane Q. 
III') def test_surname_prefix(self): with Tweak('author_use_surname_prefixes', True): - self.check_all_methods('Leonardo Da Vinci', 'Da Vinci, Leonardo', 'Da Vinci Leonardo') + self.check_all_methods('Leonardo Da Vinci', + invert='Da Vinci, Leonardo', + nocomma='Da Vinci Leonardo') self.check_all_methods('Van Gogh') with Tweak('author_use_surname_prefixes', False): - self.check_all_methods('Leonardo Da Vinci', 'Vinci, Leonardo Da', 'Vinci Leonardo Da') - self.check_all_methods('Van Gogh', 'Gogh, Van', 'Gogh Van') + self.check_all_methods('Leonardo Da Vinci', + invert='Vinci, Leonardo Da', + nocomma='Vinci Leonardo Da') + self.check_all_methods('Van Gogh', + invert='Gogh, Van', + nocomma='Gogh Van') def test_comma(self): - self.check_all_methods('James Wesley, Rawles', nocomma='Rawles James Wesley,') + self.check_all_methods('James Wesley, Rawles', + invert='Rawles, James Wesley,', + comma='James Wesley, Rawles', + nocomma='Rawles James Wesley,') def test_brackets(self): - self.check_all_methods('Seventh Author [7]', 'Author, Seventh', 'Author Seventh') - self.check_all_methods('John [x]von Neumann (III)', 'Neumann, John von', 'Neumann John von') + self.check_all_methods('Seventh Author [7]', + invert='Author, Seventh', + nocomma='Author Seventh') + self.check_all_methods('John [x]von Neumann (III)', + invert='Neumann, John von', + nocomma='Neumann John von') def test_falsy(self): self.check_all_methods('') - self.check_all_methods(None, '', '', '') - self.check_all_methods([], '', '', '') + self.check_all_methods(None, '', '', '', '') + self.check_all_methods([], '', '', '', '') ans = unittest.defaultTestLoader.loadTestsFromTestCase(TestRemoveBracketedText) ans.addTests(unittest.defaultTestLoader.loadTestsFromTestCase(TestAuthorToAuthorSort)) From f55fadd2ed9c4f849cb4a1f442e2fbed6f4141c4 Mon Sep 17 00:00:00 2001 From: "Joseph R. 
Fox-Rabinovitz" Date: Wed, 20 Jan 2021 12:52:15 -0500 Subject: [PATCH 5/5] Added new tweak to docs --- resources/default_tweaks.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index bb48897314..aec9ca2001 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -53,15 +53,25 @@ authors_completer_append_separator = False # comma : use 'copy' if there is a ',' in the name, otherwise use 'invert' # nocomma : "fn ln" -> "ln fn" (without the comma) # When this tweak is changed, the author_sort values stored with each author -# must be recomputed by right-clicking on an author in the left-hand tags panel, -# selecting 'manage authors', and pressing 'Recalculate all author sort values'. -# The author name suffixes are words that are ignored when they occur at the +# must be recomputed by right-clicking on an author in the left-hand tags +# panel, selecting 'manage authors', and pressing +# 'Recalculate all author sort values'. +# +# The author_name_suffixes are words that are ignored when they occur at the # end of an author name. The case of the suffix is ignored and trailing -# periods are automatically handled. The same is true for prefixes. -# The author name copy words are a set of words which if they occur in an -# author name cause the automatically generated author sort string to be -# identical to the author name. This means that the sort for a string like Acme -# Inc. will be Acme Inc. instead of Inc., Acme +# periods are automatically handled. +# +# The same is true for author_name_prefixes. +# +# The author_name_copywords are a set of words which, if they occur in an +# author name, cause the automatically generated author sort string to be +# identical to the author name. This means that the sort for a string like +# "Acme Inc." will be "Acme Inc." instead of "Inc., Acme". 
+# +# If author_use_surname_prefixes is enabled, a word listed in +# author_surname_prefixes is treated as part of the surname when it occurs +# immediately before the surname. For example, "John von Neumann" is sorted +# as "von Neumann, John" and not "Neumann, John von". author_sort_copy_method = 'comma' author_name_suffixes = ('Jr', 'Sr', 'Inc', 'Ph.D', 'Phd', 'MD', 'M.D', 'I', 'II', 'III', 'IV',