From c77684a03314a93b9f8343e9ce99e2b7129098e4 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 4 Sep 2011 09:52:22 -0600 Subject: [PATCH] When automatically generating author sort for author name, ignore common prefixes like Mr. Dr. etc. Controllable via tweak. Also add a tweak to allow control of how a string is split up into multiple authors. Fixes #795984 ([Request] Extra author sorting options) --- resources/default_tweaks.py | 9 +++++++++ src/calibre/ebooks/metadata/__init__.py | 21 +++++++++++++++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index b385511d56..ead9995eb3 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -70,9 +70,18 @@ author_sort_copy_method = 'comma' author_name_suffixes = ('Jr', 'Sr', 'Inc', 'Ph.D', 'Phd', 'MD', 'M.D', 'I', 'II', 'III', 'IV', 'Junior', 'Senior') +author_name_prefixes = ('Mr', 'Mrs', 'Ms', 'Dr', 'Prof') author_name_copywords = ('Corporation', 'Company', 'Co.', 'Agency', 'Council', 'Committee', 'Inc.', 'Institute', 'Society', 'Club', 'Team') +#: Splitting multiple author names +# By default, calibre splits a string containing multiple author names on +# ampersands and the words "and" and "with". You can customize the splitting +# by changing the regular expression below. Strings are split on whatever the +# specified regular expression matches. +# Default: r'(?i),?\s+(and|with)\s+' +authors_split_regex = r'(?i),?\s+(and|with)\s+' + #: Use author sort in Tag Browser # Set which author field to display in the tags pane (the list of authors, # series, publishers etc on the left hand side). The choices are author and diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py index a9816db5ae..c3a229fe3c 100644 --- a/src/calibre/ebooks/metadata/__init__.py +++ b/src/calibre/ebooks/metadata/__init__.py @@ -10,11 +10,17 @@ import os, sys, re from urllib import unquote, quote from urlparse import urlparse -from calibre import relpath, guess_type, remove_bracketed_text +from calibre import relpath, guess_type, remove_bracketed_text, prints from calibre.utils.config import tweaks -_author_pat = re.compile(',?\s+(and|with)\s+', re.IGNORECASE) +try: + _author_pat = re.compile(tweaks['authors_split_regex']) +except: + prints ('Author split regexp:', tweaks['authors_split_regex'], + 'is invalid, using default') + _author_pat = re.compile(r'(?i),?\s+(and|with)\s+') + def string_to_authors(raw): raw = raw.replace('&&', u'\uffff') raw = _author_pat.sub('&', raw) @@ -45,6 +51,17 @@ def author_to_author_sort(author, method=None): if method == u'copy': return author + prefixes = set([x.lower() for x in tweaks['author_name_prefixes']]) + prefixes |= set([x+u'.' for x in prefixes]) + while True: + if not tokens: + return author + tok = tokens[0].lower() + if tok in prefixes: + tokens = tokens[1:] + else: + break + suffixes = set([x.lower() for x in tweaks['author_name_suffixes']]) suffixes |= set([x+u'.' for x in suffixes])