From b1ecfe1fdfa646e5735e64bb0f4314ed29517afb Mon Sep 17 00:00:00 2001 From: "Joseph R. Fox-Rabinovitz" Date: Fri, 15 Jan 2021 08:05:05 -0500 Subject: [PATCH] ENH: Added surname prefixes to author sort If an author's last name is preceded by von, van, di, la, le, da, de, etc., include that prefix in the last name --- resources/default_tweaks.py | 1 + src/calibre/ebooks/metadata/__init__.py | 52 +++++++++++++------------ 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index 3ee6cd5561..cb54af8f9d 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -69,6 +69,7 @@ author_name_suffixes = ('Jr', 'Sr', 'Inc', 'Ph.D', 'Phd', author_name_prefixes = ('Mr', 'Mrs', 'Ms', 'Dr', 'Prof') author_name_copywords = ('Corporation', 'Company', 'Co.', 'Agency', 'Council', 'Committee', 'Inc.', 'Institute', 'Society', 'Club', 'Team') +author_surname_prefixes = ('da', 'de', 'di', 'la', 'le', 'van', 'von') #: Splitting multiple author names # By default, calibre splits a string containing multiple author names on diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py index 609918588e..68ff128651 100644 --- a/src/calibre/ebooks/metadata/__init__.py +++ b/src/calibre/ebooks/metadata/__init__.py @@ -46,23 +46,26 @@ def remove_bracketed_text(src, brackets=None): brackets = {'(': ')', '[': ']', '{': '}'} from collections import Counter counts = Counter() + total = 0 buf = [] src = force_unicode(src) rmap = {v: k for k, v in iteritems(brackets)} for char in src: if char in brackets: counts[char] += 1 + total += 1 elif char in rmap: idx = rmap[char] if counts[idx] > 0: counts[idx] -= 1 - elif sum(itervalues(counts)) < 1: + total -= 1 + elif total < 1: buf.append(char) return ''.join(buf) def author_to_author_sort(author, method=None): - if not author: + if not author or method == 'copy': return '' sauthor = remove_bracketed_text(author).strip() tokens = sauthor.split() 
@@ -70,45 +73,46 @@ def author_to_author_sort(author, method=None): return author if method is None: method = tweaks['author_sort_copy_method'] + if method == 'copy': + return author ltoks = frozenset(x.lower() for x in tokens) copy_words = frozenset(x.lower() for x in tweaks['author_name_copywords']) if ltoks.intersection(copy_words): - method = 'copy' + return author - if method == 'copy': + author_surname_prefixes = frozenset(x.lower() for x in tweaks['author_surname_prefixes']) + if len(tokens) == 2 and tokens[0].lower() in author_surname_prefixes: + return author + + if method == 'comma' and any(',' in t for t in tokens): return author prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']} prefixes |= {y+'.' for y in prefixes} - while True: - if not tokens: - return author - tok = tokens[0].lower() - if tok in prefixes: - tokens = tokens[1:] - else: + + for first in range(len(tokens)): + if tokens[first].lower() not in prefixes: break + else: + return author suffixes = {force_unicode(y).lower() for y in tweaks['author_name_suffixes']} suffixes |= {y+'.' for y in suffixes} - suffix = '' - while True: - if not tokens: - return author - last = tokens[-1].lower() - if last in suffixes: - suffix = tokens[-1] + ' ' + suffix - tokens = tokens[:-1] - else: + for last in range(len(tokens) - 1, first - 1, -1): + if tokens[last].lower() not in suffixes: break - suffix = suffix.strip() - - if method == 'comma' and ',' in ''.join(tokens): + else: return author - atokens = tokens[-1:] + tokens[:-1] + suffix = ' '.join(tokens[last + 1:]) + + if last > first and tokens[last - 1].lower() in author_surname_prefixes: + tokens[last - 1] += ' ' + tokens[last] + last -= 1 + + atokens = tokens[last:last + 1] + tokens[first:last] num_toks = len(atokens) if suffix: atokens.append(suffix)