mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add a new tweak under "Author sort name algorithm" to optionally recognize common surname prefixes such as von, van, de, etc.
Merge branch 'author-sort' of https://github.com/madphysicist/calibre
This commit is contained in:
commit
6dc39a511a
@ -53,22 +53,35 @@ authors_completer_append_separator = False
|
|||||||
# comma : use 'copy' if there is a ',' in the name, otherwise use 'invert'
|
# comma : use 'copy' if there is a ',' in the name, otherwise use 'invert'
|
||||||
# nocomma : "fn ln" -> "ln fn" (without the comma)
|
# nocomma : "fn ln" -> "ln fn" (without the comma)
|
||||||
# When this tweak is changed, the author_sort values stored with each author
|
# When this tweak is changed, the author_sort values stored with each author
|
||||||
# must be recomputed by right-clicking on an author in the left-hand tags panel,
|
# must be recomputed by right-clicking on an author in the left-hand tags
|
||||||
# selecting 'manage authors', and pressing 'Recalculate all author sort values'.
|
# panel, selecting 'manage authors', and pressing
|
||||||
# The author name suffixes are words that are ignored when they occur at the
|
# 'Recalculate all author sort values'.
|
||||||
|
#
|
||||||
|
# The author_name_suffixes are words that are ignored when they occur at the
|
||||||
# end of an author name. The case of the suffix is ignored and trailing
|
# end of an author name. The case of the suffix is ignored and trailing
|
||||||
# periods are automatically handled. The same is true for prefixes.
|
# periods are automatically handled.
|
||||||
# The author name copy words are a set of words which if they occur in an
|
#
|
||||||
# author name cause the automatically generated author sort string to be
|
# The same is true for author_name_prefixes.
|
||||||
# identical to the author name. This means that the sort for a string like Acme
|
#
|
||||||
# Inc. will be Acme Inc. instead of Inc., Acme
|
# The author_name_copywords are a set of words which, if they occur in an
|
||||||
|
# author name, cause the automatically generated author sort string to be
|
||||||
|
# identical to the author name. This means that the sort for a string like
|
||||||
|
# "Acme Inc." will be "Acme Inc." instead of "Inc., Acme".
|
||||||
|
#
|
||||||
|
# If author_use_surname_prefixes is enabled, any of the words in
|
||||||
|
# author_surname_prefixes will be treated as a prefix to the surname, if they
|
||||||
|
# occur before the surname. So for example, "John von Neumann" would be sorted
|
||||||
|
# as "von Neumann, John" and not "Neumann, John von".
|
||||||
author_sort_copy_method = 'comma'
|
author_sort_copy_method = 'comma'
|
||||||
author_name_suffixes = ('Jr', 'Sr', 'Inc', 'Ph.D', 'Phd',
|
author_name_suffixes = ('Jr', 'Sr', 'Inc', 'Ph.D', 'Phd',
|
||||||
'MD', 'M.D', 'I', 'II', 'III', 'IV',
|
'MD', 'M.D', 'I', 'II', 'III', 'IV',
|
||||||
'Junior', 'Senior')
|
'Junior', 'Senior')
|
||||||
author_name_prefixes = ('Mr', 'Mrs', 'Ms', 'Dr', 'Prof')
|
author_name_prefixes = ('Mr', 'Mrs', 'Ms', 'Dr', 'Prof')
|
||||||
author_name_copywords = ('Corporation', 'Company', 'Co.', 'Agency', 'Council',
|
author_name_copywords = ('Agency', 'Corporation', 'Company', 'Co.', 'Council',
|
||||||
'Committee', 'Inc.', 'Institute', 'Society', 'Club', 'Team')
|
'Committee', 'Inc.', 'Institute', 'National',
|
||||||
|
'Society', 'Club', 'Team')
|
||||||
|
author_use_surname_prefixes = False
|
||||||
|
author_surname_prefixes = ('da', 'de', 'di', 'la', 'le', 'van', 'von')
|
||||||
|
|
||||||
#: Splitting multiple author names
|
#: Splitting multiple author names
|
||||||
# By default, calibre splits a string containing multiple author names on
|
# By default, calibre splits a string containing multiple author names on
|
||||||
|
@ -121,6 +121,8 @@ def find_tests(which_tests=None, exclude_tests=None):
|
|||||||
from calibre.gui2.viewer.annotations import find_tests
|
from calibre.gui2.viewer.annotations import find_tests
|
||||||
a(find_tests())
|
a(find_tests())
|
||||||
if ok('misc'):
|
if ok('misc'):
|
||||||
|
from calibre.ebooks.metadata import find_tests
|
||||||
|
a(find_tests())
|
||||||
from calibre.ebooks.metadata.tag_mapper import find_tests
|
from calibre.ebooks.metadata.tag_mapper import find_tests
|
||||||
a(find_tests())
|
a(find_tests())
|
||||||
from calibre.ebooks.metadata.author_mapper import find_tests
|
from calibre.ebooks.metadata.author_mapper import find_tests
|
||||||
|
@ -13,7 +13,7 @@ import os, sys, re
|
|||||||
|
|
||||||
from calibre import relpath, guess_type, prints, force_unicode
|
from calibre import relpath, guess_type, prints, force_unicode
|
||||||
from calibre.utils.config_base import tweaks
|
from calibre.utils.config_base import tweaks
|
||||||
from polyglot.builtins import codepoint_to_chr, unicode_type, range, map, zip, getcwd, iteritems, itervalues, as_unicode
|
from polyglot.builtins import codepoint_to_chr, unicode_type, range, map, zip, getcwd, iteritems, as_unicode
|
||||||
from polyglot.urllib import quote, unquote, urlparse
|
from polyglot.urllib import quote, unquote, urlparse
|
||||||
|
|
||||||
|
|
||||||
@ -46,17 +46,20 @@ def remove_bracketed_text(src, brackets=None):
|
|||||||
brackets = {'(': ')', '[': ']', '{': '}'}
|
brackets = {'(': ')', '[': ']', '{': '}'}
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
counts = Counter()
|
counts = Counter()
|
||||||
|
total = 0
|
||||||
buf = []
|
buf = []
|
||||||
src = force_unicode(src)
|
src = force_unicode(src)
|
||||||
rmap = {v: k for k, v in iteritems(brackets)}
|
rmap = {v: k for k, v in iteritems(brackets)}
|
||||||
for char in src:
|
for char in src:
|
||||||
if char in brackets:
|
if char in brackets:
|
||||||
counts[char] += 1
|
counts[char] += 1
|
||||||
|
total += 1
|
||||||
elif char in rmap:
|
elif char in rmap:
|
||||||
idx = rmap[char]
|
idx = rmap[char]
|
||||||
if counts[idx] > 0:
|
if counts[idx] > 0:
|
||||||
counts[idx] -= 1
|
counts[idx] -= 1
|
||||||
elif sum(itervalues(counts)) < 1:
|
total -= 1
|
||||||
|
elif total < 1:
|
||||||
buf.append(char)
|
buf.append(char)
|
||||||
return ''.join(buf)
|
return ''.join(buf)
|
||||||
|
|
||||||
@ -64,51 +67,57 @@ def remove_bracketed_text(src, brackets=None):
|
|||||||
def author_to_author_sort(author, method=None):
|
def author_to_author_sort(author, method=None):
|
||||||
if not author:
|
if not author:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
if method is None:
|
||||||
|
method = tweaks['author_sort_copy_method']
|
||||||
|
if method == 'copy':
|
||||||
|
return author
|
||||||
|
|
||||||
sauthor = remove_bracketed_text(author).strip()
|
sauthor = remove_bracketed_text(author).strip()
|
||||||
|
if method == 'comma' and ',' in sauthor:
|
||||||
|
return author
|
||||||
|
|
||||||
tokens = sauthor.split()
|
tokens = sauthor.split()
|
||||||
if len(tokens) < 2:
|
if len(tokens) < 2:
|
||||||
return author
|
return author
|
||||||
if method is None:
|
|
||||||
method = tweaks['author_sort_copy_method']
|
|
||||||
|
|
||||||
ltoks = frozenset(x.lower() for x in tokens)
|
ltoks = frozenset(x.lower() for x in tokens)
|
||||||
copy_words = frozenset(x.lower() for x in tweaks['author_name_copywords'])
|
copy_words = frozenset(x.lower() for x in tweaks['author_name_copywords'])
|
||||||
if ltoks.intersection(copy_words):
|
if ltoks.intersection(copy_words):
|
||||||
method = 'copy'
|
|
||||||
|
|
||||||
if method == 'copy':
|
|
||||||
return author
|
return author
|
||||||
|
|
||||||
|
author_use_surname_prefixes = tweaks['author_use_surname_prefixes']
|
||||||
|
if author_use_surname_prefixes:
|
||||||
|
author_surname_prefixes = frozenset(x.lower() for x in tweaks['author_surname_prefixes'])
|
||||||
|
if len(tokens) == 2 and tokens[0].lower() in author_surname_prefixes:
|
||||||
|
return author
|
||||||
|
|
||||||
prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']}
|
prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']}
|
||||||
prefixes |= {y+'.' for y in prefixes}
|
prefixes |= {y+'.' for y in prefixes}
|
||||||
while True:
|
|
||||||
if not tokens:
|
for first in range(len(tokens)):
|
||||||
return author
|
if tokens[first].lower() not in prefixes:
|
||||||
tok = tokens[0].lower()
|
|
||||||
if tok in prefixes:
|
|
||||||
tokens = tokens[1:]
|
|
||||||
else:
|
|
||||||
break
|
break
|
||||||
|
else:
|
||||||
|
return author
|
||||||
|
|
||||||
suffixes = {force_unicode(y).lower() for y in tweaks['author_name_suffixes']}
|
suffixes = {force_unicode(y).lower() for y in tweaks['author_name_suffixes']}
|
||||||
suffixes |= {y+'.' for y in suffixes}
|
suffixes |= {y+'.' for y in suffixes}
|
||||||
|
|
||||||
suffix = ''
|
for last in range(len(tokens) - 1, first - 1, -1):
|
||||||
while True:
|
if tokens[last].lower() not in suffixes:
|
||||||
if not tokens:
|
|
||||||
return author
|
|
||||||
last = tokens[-1].lower()
|
|
||||||
if last in suffixes:
|
|
||||||
suffix = tokens[-1] + ' ' + suffix
|
|
||||||
tokens = tokens[:-1]
|
|
||||||
else:
|
|
||||||
break
|
break
|
||||||
suffix = suffix.strip()
|
else:
|
||||||
|
|
||||||
if method == 'comma' and ',' in ''.join(tokens):
|
|
||||||
return author
|
return author
|
||||||
|
|
||||||
atokens = tokens[-1:] + tokens[:-1]
|
suffix = ' '.join(tokens[last + 1:])
|
||||||
|
|
||||||
|
if author_use_surname_prefixes:
|
||||||
|
if last > first and tokens[last - 1].lower() in author_surname_prefixes:
|
||||||
|
tokens[last - 1] += ' ' + tokens[last]
|
||||||
|
last -= 1
|
||||||
|
|
||||||
|
atokens = tokens[last:last + 1] + tokens[first:last]
|
||||||
num_toks = len(atokens)
|
num_toks = len(atokens)
|
||||||
if suffix:
|
if suffix:
|
||||||
atokens.append(suffix)
|
atokens.append(suffix)
|
||||||
@ -438,3 +447,134 @@ def rating_to_stars(value, allow_half_stars=False, star='★', half='⯨'):
|
|||||||
if allow_half_stars and r % 2:
|
if allow_half_stars and r % 2:
|
||||||
ans += half
|
ans += half
|
||||||
return ans
|
return ans
|
||||||
|
|
||||||
|
|
||||||
|
def find_tests():
    '''
    Build the unit test suite for the helpers in this module.

    Returns a unittest.TestSuite holding the test cases for
    remove_bracketed_text() and author_to_author_sort().
    '''
    import unittest
    from calibre.utils.config_base import Tweak

    class TestRemoveBracketedText(unittest.TestCase):
        ''' Tests for remove_bracketed_text() with its default bracket set. '''

        def test_brackets(self):
            # Angle brackets are not in the default bracket set, so they stay
            self.assertEqual(remove_bracketed_text('a[b]c(d)e{f}g<h>i'), 'aceg<h>i')

        def test_nested(self):
            self.assertEqual(remove_bracketed_text('a[[b]c(d)e{f}]g(h(i)j[k]l{m})n{{{o}}}p'), 'agnp')

        def test_mismatched(self):
            self.assertEqual(remove_bracketed_text('a[b(c]d)e'), 'ae')
            self.assertEqual(remove_bracketed_text('a{b(c}d)e'), 'ae')

        def test_extra_closed(self):
            self.assertEqual(remove_bracketed_text('a]b}c)d'), 'abcd')
            self.assertEqual(remove_bracketed_text('a[b]c]d(e)f{g)h}i}j)k]l'), 'acdfijkl')

        def test_unclosed(self):
            self.assertEqual(remove_bracketed_text('a]b[c'), 'ab')
            self.assertEqual(remove_bracketed_text('a(b[c]d{e}f'), 'a')
            self.assertEqual(remove_bracketed_text('a{b}c{d[e]f(g)h'), 'ac')

    class TestAuthorToAuthorSort(unittest.TestCase):
        ''' Tests for author_to_author_sort() across all four sort methods. '''

        def check_all_methods(self, name, invert=None, comma=None,
                              nocomma=None, copy=None):
            '''
            Assert the result of author_to_author_sort(name, method) for the
            'invert', 'copy', 'comma' and 'nocomma' methods.

            Expectations left as None cascade: invert defaults to name, comma
            to invert, nocomma to comma, and copy to name.
            '''
            if invert is None:
                invert = name
            if comma is None:
                comma = invert
            if nocomma is None:
                nocomma = comma
            if copy is None:
                copy = name
            expectations = (
                ('invert', invert),
                ('copy', copy),
                ('comma', comma),
                ('nocomma', nocomma),
            )
            for method, expected in expectations:
                actual = author_to_author_sort(name, method)
                # Name the failing method: a bare assertEqual would not say
                # which of the four calls produced the mismatch
                self.assertEqual(
                    actual, expected,
                    'author_to_author_sort(%r, %r) returned %r, expected %r' % (
                        name, method, actual, expected))

        def test_single(self):
            self.check_all_methods('Aristotle')

        def test_all_prefix(self):
            self.check_all_methods('Mr. Dr Prof.')

        def test_all_suffix(self):
            self.check_all_methods('Senior Inc')

        def test_copywords(self):
            # The quoted token does not match the copy word, so the name is
            # still inverted; the bare word forces a plain copy
            self.check_all_methods('Don "Team" Smith',
                                   invert='Smith, Don "Team"',
                                   nocomma='Smith Don "Team"')
            self.check_all_methods('Don Team Smith')

        def test_national(self):
            c = tweaks['author_name_copywords']
            try:
                # Assume that 'author_name_copywords' is a common sequence type
                i = c.index('National')
            except ValueError:
                # If "National" not found, check first without, then temporarily add
                self.check_all_methods('National Lampoon',
                                       invert='Lampoon, National',
                                       nocomma='Lampoon National')
                t = type(c)
                with Tweak('author_name_copywords', c + t(['National'])):
                    self.check_all_methods('National Lampoon')
            else:
                # If "National" found, check with, then temporarily remove
                self.check_all_methods('National Lampoon')
                with Tweak('author_name_copywords', c[:i] + c[i + 1:]):
                    self.check_all_methods('National Lampoon',
                                           invert='Lampoon, National',
                                           nocomma='Lampoon National')

        def test_method(self):
            self.check_all_methods('Jane Doe',
                                   invert='Doe, Jane',
                                   nocomma='Doe Jane')

        def test_invalid_method(self):
            # Invalid string defaults to invert
            name = 'Jane, Q. van Doe[ed] Jr.'
            self.assertEqual(author_to_author_sort(name, 'invert'),
                             author_to_author_sort(name, '__unknown__!(*T^U$'))

        def test_prefix_suffix(self):
            self.check_all_methods('Mrs. Jane Q. Doe III',
                                   invert='Doe, Jane Q. III',
                                   nocomma='Doe Jane Q. III')

        def test_surname_prefix(self):
            with Tweak('author_use_surname_prefixes', True):
                self.check_all_methods('Leonardo Da Vinci',
                                       invert='Da Vinci, Leonardo',
                                       nocomma='Da Vinci Leonardo')
                self.check_all_methods('Van Gogh')
                self.check_all_methods('Van')
            with Tweak('author_use_surname_prefixes', False):
                self.check_all_methods('Leonardo Da Vinci',
                                       invert='Vinci, Leonardo Da',
                                       nocomma='Vinci Leonardo Da')
                self.check_all_methods('Van Gogh',
                                       invert='Gogh, Van',
                                       nocomma='Gogh Van')

        def test_comma(self):
            self.check_all_methods('James Wesley, Rawles',
                                   invert='Rawles, James Wesley,',
                                   comma='James Wesley, Rawles',
                                   nocomma='Rawles James Wesley,')

        def test_brackets(self):
            # The expected values below treat 'von' as part of the given
            # names, so pin the surname prefix tweak off instead of relying
            # on whatever the user has configured
            with Tweak('author_use_surname_prefixes', False):
                self.check_all_methods('Seventh Author [7]',
                                       invert='Author, Seventh',
                                       nocomma='Author Seventh')
                self.check_all_methods('John [x]von Neumann (III)',
                                       invert='Neumann, John von',
                                       nocomma='Neumann John von')

        def test_falsy(self):
            self.check_all_methods('')
            self.check_all_methods(None, '', '', '', '')
            self.check_all_methods([], '', '', '', '')

    load_case = unittest.defaultTestLoader.loadTestsFromTestCase
    ans = load_case(TestRemoveBracketedText)
    ans.addTests(load_case(TestAuthorToAuthorSort))
    return ans
|
||||||
|
Loading…
x
Reference in New Issue
Block a user