mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
ENH: Added surname prefixes to author sort
If an author's last name is preceded by von, van, di, la, le, da, de, etc., include that prefix in the last name
This commit is contained in:
parent
ed0f7e9684
commit
b1ecfe1fdf
@ -69,6 +69,7 @@ author_name_suffixes = ('Jr', 'Sr', 'Inc', 'Ph.D', 'Phd',
|
|||||||
author_name_prefixes = ('Mr', 'Mrs', 'Ms', 'Dr', 'Prof')
|
author_name_prefixes = ('Mr', 'Mrs', 'Ms', 'Dr', 'Prof')
|
||||||
author_name_copywords = ('Corporation', 'Company', 'Co.', 'Agency', 'Council',
|
author_name_copywords = ('Corporation', 'Company', 'Co.', 'Agency', 'Council',
|
||||||
'Committee', 'Inc.', 'Institute', 'Society', 'Club', 'Team')
|
'Committee', 'Inc.', 'Institute', 'Society', 'Club', 'Team')
|
||||||
|
author_surname_prefixes = ('da', 'de', 'di', 'la', 'le', 'van', 'von')
|
||||||
|
|
||||||
#: Splitting multiple author names
|
#: Splitting multiple author names
|
||||||
# By default, calibre splits a string containing multiple author names on
|
# By default, calibre splits a string containing multiple author names on
|
||||||
|
@ -46,23 +46,26 @@ def remove_bracketed_text(src, brackets=None):
|
|||||||
brackets = {'(': ')', '[': ']', '{': '}'}
|
brackets = {'(': ')', '[': ']', '{': '}'}
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
counts = Counter()
|
counts = Counter()
|
||||||
|
total = 0
|
||||||
buf = []
|
buf = []
|
||||||
src = force_unicode(src)
|
src = force_unicode(src)
|
||||||
rmap = {v: k for k, v in iteritems(brackets)}
|
rmap = {v: k for k, v in iteritems(brackets)}
|
||||||
for char in src:
|
for char in src:
|
||||||
if char in brackets:
|
if char in brackets:
|
||||||
counts[char] += 1
|
counts[char] += 1
|
||||||
|
total += 1
|
||||||
elif char in rmap:
|
elif char in rmap:
|
||||||
idx = rmap[char]
|
idx = rmap[char]
|
||||||
if counts[idx] > 0:
|
if counts[idx] > 0:
|
||||||
counts[idx] -= 1
|
counts[idx] -= 1
|
||||||
elif sum(itervalues(counts)) < 1:
|
total -= 1
|
||||||
|
elif total < 1:
|
||||||
buf.append(char)
|
buf.append(char)
|
||||||
return ''.join(buf)
|
return ''.join(buf)
|
||||||
|
|
||||||
|
|
||||||
def author_to_author_sort(author, method=None):
|
def author_to_author_sort(author, method=None):
|
||||||
if not author:
|
if not author or method == 'copy':
|
||||||
return ''
|
return ''
|
||||||
sauthor = remove_bracketed_text(author).strip()
|
sauthor = remove_bracketed_text(author).strip()
|
||||||
tokens = sauthor.split()
|
tokens = sauthor.split()
|
||||||
@ -70,45 +73,46 @@ def author_to_author_sort(author, method=None):
|
|||||||
return author
|
return author
|
||||||
if method is None:
|
if method is None:
|
||||||
method = tweaks['author_sort_copy_method']
|
method = tweaks['author_sort_copy_method']
|
||||||
|
if method == 'copy':
|
||||||
|
return author
|
||||||
|
|
||||||
ltoks = frozenset(x.lower() for x in tokens)
|
ltoks = frozenset(x.lower() for x in tokens)
|
||||||
copy_words = frozenset(x.lower() for x in tweaks['author_name_copywords'])
|
copy_words = frozenset(x.lower() for x in tweaks['author_name_copywords'])
|
||||||
if ltoks.intersection(copy_words):
|
if ltoks.intersection(copy_words):
|
||||||
method = 'copy'
|
return author
|
||||||
|
|
||||||
if method == 'copy':
|
author_surname_prefixes = frozenset(x.lower() for x in tweaks['author_surname_prefixes'])
|
||||||
|
if len(tokens) == 2 and tokens[0].lower() in author_surname_prefixes:
|
||||||
|
return author
|
||||||
|
|
||||||
|
if method == 'comma' and any(',' in t for t in tokens):
|
||||||
return author
|
return author
|
||||||
|
|
||||||
prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']}
|
prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']}
|
||||||
prefixes |= {y+'.' for y in prefixes}
|
prefixes |= {y+'.' for y in prefixes}
|
||||||
while True:
|
|
||||||
if not tokens:
|
for first in range(len(tokens)):
|
||||||
return author
|
if tokens[first].lower() not in prefixes:
|
||||||
tok = tokens[0].lower()
|
|
||||||
if tok in prefixes:
|
|
||||||
tokens = tokens[1:]
|
|
||||||
else:
|
|
||||||
break
|
break
|
||||||
|
else:
|
||||||
|
return author
|
||||||
|
|
||||||
suffixes = {force_unicode(y).lower() for y in tweaks['author_name_suffixes']}
|
suffixes = {force_unicode(y).lower() for y in tweaks['author_name_suffixes']}
|
||||||
suffixes |= {y+'.' for y in suffixes}
|
suffixes |= {y+'.' for y in suffixes}
|
||||||
|
|
||||||
suffix = ''
|
for last in range(len(tokens) - 1, first - 1, -1):
|
||||||
while True:
|
if tokens[last].lower() not in suffixes:
|
||||||
if not tokens:
|
|
||||||
return author
|
|
||||||
last = tokens[-1].lower()
|
|
||||||
if last in suffixes:
|
|
||||||
suffix = tokens[-1] + ' ' + suffix
|
|
||||||
tokens = tokens[:-1]
|
|
||||||
else:
|
|
||||||
break
|
break
|
||||||
suffix = suffix.strip()
|
else:
|
||||||
|
|
||||||
if method == 'comma' and ',' in ''.join(tokens):
|
|
||||||
return author
|
return author
|
||||||
|
|
||||||
atokens = tokens[-1:] + tokens[:-1]
|
suffix = ' '.join(tokens[last + 1:])
|
||||||
|
|
||||||
|
if last > first and tokens[last - 1].lower() in author_surname_prefixes:
|
||||||
|
tokens[last - 1] += ' ' + tokens[last]
|
||||||
|
last -= 1
|
||||||
|
|
||||||
|
atokens = tokens[last:last + 1] + tokens[first:last]
|
||||||
num_toks = len(atokens)
|
num_toks = len(atokens)
|
||||||
if suffix:
|
if suffix:
|
||||||
atokens.append(suffix)
|
atokens.append(suffix)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user