Change the algorithm that generates title sort strings to strip leading articles from both English and the language currently set for the calibre user interface. In addition, in the edit metadata dialog, calibre will use the book's language when calculating the sort string. This behavior can be adjusted via Preferences->Tweaks. Fixes #886763 ([Enhancement] multi-lingual adjustment of (in)definite articles in title_sort)

This commit is contained in:
Kovid Goyal 2011-11-23 11:50:33 +05:30
parent 6d155607fd
commit e4575abba4
5 changed files with 92 additions and 31 deletions

View File

@ -201,15 +201,49 @@ save_template_title_series_sorting = 'library_order'
#: Set the list of words considered to be "articles" for sort strings #: Set the list of words considered to be "articles" for sort strings
# Set the list of words that are to be considered 'articles' when computing the # Set the list of words that are to be considered 'articles' when computing the
# title sort strings. The list is a regular expression, with the articles # title sort strings. The articles differ by language. By default, calibre uses
# separated by 'or' bars. Comparisons are case insensitive, and that cannot be # a combination of articles from English and whatever language the calibre user
# changed. Changes to this tweak won't have an effect until the book is modified # interface is set to. In addition, in some contexts where the book language is
# in some way. If you enter an invalid pattern, it is silently ignored. # available, the language of the book is used. You can change the list of
# To disable use the expression: '^$' # articles for a given language or add a new language by editing
# This expression is designed for articles that are followed by spaces. If you # per_language_title_sort_articles. To tell calibre to use a language other
# also need to match articles that are followed by other characters, for example L' # than the user interface language, set default_language_for_title_sort. For
# in French, use: "^(A\s+|The\s+|An\s+|L')" instead. # example, to use German, set it to 'deu'. A value of None means the user
# Default: '^(A|The|An)\s+' # interface language is used. The setting title_sort_articles is ignored
# (present only for legacy reasons).
# Leading articles, grouped by three letter language code. Every value is a
# tuple of regular expression fragments; get_title_sort_pat() joins the
# fragments with '|' inside '^(...)' and compiles the result case
# insensitively. The English fragments are always added to those of the
# selected language.
# NOTE(review): title_sort() drops one leading quote character from the title
# before matching, so fragments that start with an apostrophe (Afrikaans,
# Greek) may never match -- confirm.
per_language_title_sort_articles = {
    'eng': (  # English
        r'A\s+', r'The\s+', r'An\s+'),
    'spa': (  # Spanish
        r'El\s+', r'La\s+', r'Lo\s+', r'Los\s+', r'Las\s+',
        r'Un\s+', r'Una\s+', r'Unos\s+', r'Unas\s+'),
    'fra': (  # French
        r'Le\s+', r'La\s+', r"L'", r'Les\s+',
        r'Un\s+', r'Une\s+', r'Des\s+'),
    'ita': (  # Italian
        r'Lo\s+', r'Il\s+', r"L'", r'La\s+',
        r'Gli\s+', r'I\s+', r'Le\s+'),
    'por': (  # Portuguese
        r'A\s+', r'O\s+', r'Os\s+', r'As\s+',
        r'Um\s+', r'Uns\s+', r'Uma\s+', r'Umas\s+'),
    'ron': (  # Romanian
        r'Un\s+', r'O\s+', r'Nişte\s+'),
    'deu': (  # German
        r'Der\s+', r'Die\s+', r'Das\s+', r'Den\s+',
        r'Ein\s+', r'Eine\s+', r'Einen\s+'),
    'nld': (  # Dutch
        r'De\s+', r'Het\s+', r'Een\s+'),
    'swe': (  # Swedish
        r'En\s+', r'Ett\s+', r'Det\s+', r'Den\s+', r'De\s+'),
    'tur': (  # Turkish (one element: the trailing comma keeps it a tuple)
        r'Bir\s+',),
    'afr': (  # Afrikaans
        r"'n\s+", r'Die\s+'),
    'ell': (  # Greek
        r'O\s+', r'I\s+', r'To\s+', r'Ta\s+', r'Tus\s+', r'Tis\s+',
        r"'Enas\s+", r"'Mia\s+", r"'Ena\s+", r"'Enan\s+"),
}

# Language whose articles are used when no book language is supplied.
# None means: use the language of the calibre user interface.
default_language_for_title_sort = None
title_sort_articles=r'^(A|The|An)\s+' title_sort_articles=r'^(A|The|An)\s+'
#: Specify a folder calibre should connect to at startup #: Specify a folder calibre should connect to at startup

View File

@ -95,18 +95,33 @@ def author_to_author_sort(author, method=None):
def authors_to_sort_string(authors): def authors_to_sort_string(authors):
return ' & '.join(map(author_to_author_sort, authors)) return ' & '.join(map(author_to_author_sort, authors))
# Cache of compiled leading-article patterns, keyed by the ``lang`` argument
# passed to get_title_sort_pat() (None is a valid key).
_title_pats = {}

def get_title_sort_pat(lang=None):
    '''
    Return the compiled regular expression that matches a leading article
    at the start of a title.

    :param lang: Language code of the book, or None. When None, the tweak
        default_language_for_title_sort is used and, if that is also None,
        the value returned by get_lang().
    :return: A compiled, case insensitive pattern of the form
        ``^(a|b|...)`` built from the article fragments of the resolved
        language plus the English fragments. Group 1 is the matched article.

    Patterns are compiled once per ``lang`` value and cached in _title_pats.
    If the tweaks are missing, malformed or do not compile, the error is
    printed and the pattern ``^(A|The|An)\\s+`` is used instead of raising.
    '''
    ans = _title_pats.get(lang, None)
    if ans is not None:
        return ans
    from calibre.utils.localization import canonicalize_lang, get_lang
    q = lang
    if q is None:
        q = tweaks.get('default_language_for_title_sort', None)
        if q is None:
            q = get_lang()
    q = canonicalize_lang(q) if q else q
    try:
        data = tweaks['per_language_title_sort_articles']
        # Build a set instead of concatenating, so a tweak that uses lists
        # for some languages and tuples for others still works.
        articles = set(data['eng'])
        # Unknown languages contribute nothing: only English is used.
        articles.update(data.get(q, None) or ())
        # Set iteration order is arbitrary; order the alternatives (longest
        # first) so the generated pattern is the same on every run.
        ordered = sorted(articles, key=lambda x: (-len(x), x))
        ans = re.compile('^(%s)' % '|'.join(ordered), re.IGNORECASE)
    except (KeyError, TypeError, AttributeError, re.error):
        # A broken user tweak must not make every title_sort() call fail.
        import traceback
        traceback.print_exc()
        ans = re.compile(r'^(A|The|An)\s+', re.IGNORECASE)
    _title_pats[lang] = ans
    return ans
_ignore_starts = u'\'"'+u''.join(unichr(x) for x in range(0x2018, 0x201e)+[0x2032, 0x2033]) _ignore_starts = u'\'"'+u''.join(unichr(x) for x in
range(0x2018, 0x201e)+[0x2032, 0x2033])
def title_sort(title, order=None): def title_sort(title, order=None, lang=None):
if order is None: if order is None:
order = tweaks['title_series_sorting'] order = tweaks['title_series_sorting']
title = title.strip() title = title.strip()
@ -114,7 +129,7 @@ def title_sort(title, order=None):
return title return title
if title and title[0] in _ignore_starts: if title and title[0] in _ignore_starts:
title = title[1:] title = title[1:]
match = _title_pat.search(title) match = get_title_sort_pat(lang).search(title)
if match: if match:
try: try:
prep = match.group(1) prep = match.group(1)

View File

@ -138,9 +138,10 @@ class TitleSortEdit(TitleEdit):
' For example, The Exorcist might be sorted as Exorcist, The.') ' For example, The Exorcist might be sorted as Exorcist, The.')
LABEL = _('Title &sort:') LABEL = _('Title &sort:')
def __init__(self, parent, title_edit, autogen_button): def __init__(self, parent, title_edit, autogen_button, languages_edit):
TitleEdit.__init__(self, parent) TitleEdit.__init__(self, parent)
self.title_edit = title_edit self.title_edit = title_edit
self.languages_edit = languages_edit
base = self.TOOLTIP base = self.TOOLTIP
ok_tooltip = '<p>' + textwrap.fill(base+'<br><br>'+ ok_tooltip = '<p>' + textwrap.fill(base+'<br><br>'+
@ -157,10 +158,20 @@ class TitleSortEdit(TitleEdit):
self.autogen_button = autogen_button self.autogen_button = autogen_button
autogen_button.clicked.connect(self.auto_generate) autogen_button.clicked.connect(self.auto_generate)
languages_edit.editTextChanged.connect(self.update_state)
languages_edit.currentIndexChanged.connect(self.update_state)
self.update_state() self.update_state()
@property
def book_lang(self):
    '''
    The first entry of ``self.languages_edit.lang_codes``, or None when it
    cannot be read (for example when the list is empty). The value is
    passed as ``lang`` to title_sort().
    '''
    try:
        return self.languages_edit.lang_codes[0]
    except Exception:
        # Best effort: any failure to read a language means "no book
        # language". Unlike a bare except, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return None
def update_state(self, *args): def update_state(self, *args):
ts = title_sort(self.title_edit.current_val) ts = title_sort(self.title_edit.current_val, lang=self.book_lang)
normal = ts == self.current_val normal = ts == self.current_val
if normal: if normal:
col = 'rgb(0, 255, 0, 20%)' col = 'rgb(0, 255, 0, 20%)'
@ -173,7 +184,8 @@ class TitleSortEdit(TitleEdit):
self.setWhatsThis(tt) self.setWhatsThis(tt)
def auto_generate(self, *args): def auto_generate(self, *args):
self.current_val = title_sort(self.title_edit.current_val) self.current_val = title_sort(self.title_edit.current_val,
lang=self.book_lang)
def break_cycles(self): def break_cycles(self):
try: try:

View File

@ -109,6 +109,9 @@ class MetadataSingleDialogBase(ResizableDialog):
def create_basic_metadata_widgets(self): # {{{ def create_basic_metadata_widgets(self): # {{{
self.basic_metadata_widgets = [] self.basic_metadata_widgets = []
self.languages = LanguagesEdit(self)
self.basic_metadata_widgets.append(self.languages)
self.title = TitleEdit(self) self.title = TitleEdit(self)
self.title.textChanged.connect(self.update_window_title) self.title.textChanged.connect(self.update_window_title)
self.deduce_title_sort_button = QToolButton(self) self.deduce_title_sort_button = QToolButton(self)
@ -119,7 +122,7 @@ class MetadataSingleDialogBase(ResizableDialog):
self.deduce_title_sort_button.setWhatsThis( self.deduce_title_sort_button.setWhatsThis(
self.deduce_title_sort_button.toolTip()) self.deduce_title_sort_button.toolTip())
self.title_sort = TitleSortEdit(self, self.title, self.title_sort = TitleSortEdit(self, self.title,
self.deduce_title_sort_button) self.deduce_title_sort_button, self.languages)
self.basic_metadata_widgets.extend([self.title, self.title_sort]) self.basic_metadata_widgets.extend([self.title, self.title_sort])
self.deduce_author_sort_button = b = QToolButton(self) self.deduce_author_sort_button = b = QToolButton(self)
@ -203,9 +206,6 @@ class MetadataSingleDialogBase(ResizableDialog):
self.publisher = PublisherEdit(self) self.publisher = PublisherEdit(self)
self.basic_metadata_widgets.append(self.publisher) self.basic_metadata_widgets.append(self.publisher)
self.languages = LanguagesEdit(self)
self.basic_metadata_widgets.append(self.languages)
self.timestamp = DateEdit(self) self.timestamp = DateEdit(self)
self.pubdate = PubdateEdit(self) self.pubdate = PubdateEdit(self)
self.basic_metadata_widgets.extend([self.timestamp, self.pubdate]) self.basic_metadata_widgets.extend([self.timestamp, self.pubdate])
@ -282,7 +282,6 @@ class MetadataSingleDialogBase(ResizableDialog):
# Commented out as it doesn't play nice with Next, Prev buttons # Commented out as it doesn't play nice with Next, Prev buttons
#self.fetch_metadata_button.setFocus(Qt.OtherFocusReason) #self.fetch_metadata_button.setFocus(Qt.OtherFocusReason)
# Miscellaneous interaction methods {{{ # Miscellaneous interaction methods {{{
def update_window_title(self, *args): def update_window_title(self, *args):
title = self.title.current_val title = self.title.current_val

View File

@ -15,7 +15,7 @@ from math import ceil
from calibre import prints from calibre import prints
from calibre.ebooks.metadata import (title_sort, author_to_author_sort, from calibre.ebooks.metadata import (title_sort, author_to_author_sort,
string_to_authors, authors_to_string) string_to_authors, authors_to_string, get_title_sort_pat)
from calibre.ebooks.metadata.opf2 import metadata_to_opf from calibre.ebooks.metadata.opf2 import metadata_to_opf
from calibre.library.database import LibraryDatabase from calibre.library.database import LibraryDatabase
from calibre.library.field_metadata import FieldMetadata, TagsIcons from calibre.library.field_metadata import FieldMetadata, TagsIcons
@ -1004,10 +1004,11 @@ class LibraryDatabase2(LibraryDatabase, SchemaUpgrade, CustomColumns):
return False return False
def find_identical_books(self, mi): def find_identical_books(self, mi):
fuzzy_title_patterns = [(re.compile(pat, re.IGNORECASE), repl) for pat, repl in fuzzy_title_patterns = [(re.compile(pat, re.IGNORECASE) if
isinstance(pat, basestring) else pat, repl) for pat, repl in
[ [
(r'[\[\](){}<>\'";,:#]', ''), (r'[\[\](){}<>\'";,:#]', ''),
(tweaks.get('title_sort_articles', r'^(a|the|an)\s+'), ''), (get_title_sort_pat(), ''),
(r'[-._]', ' '), (r'[-._]', ' '),
(r'\s+', ' ') (r'\s+', ' ')
] ]