From 826221c0f520b5fcbd5d931032a3cf5035f12df7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 10 Dec 2010 12:27:06 -0700 Subject: [PATCH] Various fixes to Title Casing. Fixes #7846 (Title Case doesn't quite work) --- src/calibre/utils/icu.py | 15 +++++++++------ src/calibre/utils/titlecase.py | 15 ++++++++++----- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/calibre/utils/icu.py b/src/calibre/utils/icu.py index 4b0f6d4821..22611813f4 100644 --- a/src/calibre/utils/icu.py +++ b/src/calibre/utils/icu.py @@ -77,6 +77,9 @@ def py_strcmp(a, b): def icu_case_sensitive_strcmp(collator, a, b): return collator.strcmp(a, b) +def icu_capitalize(s): + s = lower(s) + return s.replace(s[0], upper(s[0]), 1) load_icu() load_collator() @@ -104,10 +107,6 @@ lower = (lambda s: s.lower()) if _icu_not_ok else \ title_case = (lambda s: s.title()) if _icu_not_ok else \ partial(_icu.title, get_locale()) -def icu_capitalize(s): - s = lower(s) - return s.replace(s[0], upper(s[0])) - capitalize = (lambda s: s.capitalize()) if _icu_not_ok else \ (lambda s: icu_capitalize(s)) @@ -226,12 +225,16 @@ pêché''' test_strcmp(german + french) print '\nTesting case transforms in current locale' - for x in ('a', 'Alice\'s code'): + from calibre.utils.titlecase import titlecase + for x in ('a', 'Alice\'s code', 'macdonald\'s machine', '02 the wars'): print 'Upper: ', x, '->', 'py:', x.upper().encode('utf-8'), 'icu:', upper(x).encode('utf-8') print 'Lower: ', x, '->', 'py:', x.lower().encode('utf-8'), 'icu:', lower(x).encode('utf-8') - print 'Title: ', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8') + print 'Title: ', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8'), 'titlecase:', titlecase(x).encode('utf-8') print 'Capitalize:', x, '->', 'py:', x.capitalize().encode('utf-8'), 'icu:', capitalize(x).encode('utf-8') print # }}} +if __name__ == '__main__': + test() + diff --git a/src/calibre/utils/titlecase.py b/src/calibre/utils/titlecase.py index bbc4c26688..bf2f9a78d4 100755 --- a/src/calibre/utils/titlecase.py +++ b/src/calibre/utils/titlecase.py @@ -23,11 +23,12 @@ UC_ELSEWHERE = re.compile(r'[%s]*?[a-zA-Z]+[A-Z]+?' % PUNCT) CAPFIRST = re.compile(r"^[%s]*?([A-Za-z])" % PUNCT) SMALL_FIRST = re.compile(r'^([%s]*)(%s)\b' % (PUNCT, SMALL), re.I) SMALL_LAST = re.compile(r'\b(%s)[%s]?$' % (SMALL, PUNCT), re.I) +SMALL_AFTER_NUM = re.compile(r'(\d+\s+)(a|an|the)\b', re.I) SUBPHRASE = re.compile(r'([:.;?!][ ])(%s)' % SMALL) APOS_SECOND = re.compile(r"^[dol]{1}['‘]{1}[a-z]+$", re.I) ALL_CAPS = re.compile(r'^[A-Z\s%s]+$' % PUNCT) UC_INITIALS = re.compile(r"^(?:[A-Z]{1}\.{1}|[A-Z]{1}\.{1}[A-Z]{1})+$") -MAC_MC = re.compile(r"^([Mm]a?c)(\w+)") +MAC_MC = re.compile(r"^([Mm]a?c)(.+)") def titlecase(text): @@ -44,7 +45,7 @@ def titlecase(text): all_caps = ALL_CAPS.match(text) - words = re.split('\s', text) + words = re.split('\s+', text) line = [] for word in words: if all_caps: @@ -55,8 +56,8 @@ def titlecase(text): word = icu_lower(word) if APOS_SECOND.match(word): - word = word.replace(word[0], icu_upper(word[0])) - word = word.replace(word[2], icu_upper(word[2])) + word = word.replace(word[0], icu_upper(word[0]), 1) + word = word[:2] + icu_upper(word[2]) + word[3:] line.append(word) continue if INLINE_PERIOD.search(word) or UC_ELSEWHERE.match(word): @@ -67,7 +68,7 @@ def titlecase(text): continue match = MAC_MC.match(word) - if match: + if match and not match.group(2).startswith('hin'): line.append("%s%s" % (capitalize(match.group(1)), capitalize(match.group(2)))) continue @@ -85,6 +86,10 @@ def titlecase(text): capitalize(m.group(2)) ), result) + result = SMALL_AFTER_NUM.sub(lambda m: '%s%s' % (m.group(1), + capitalize(m.group(2)) + ), result) + result = SMALL_LAST.sub(lambda m: capitalize(m.group(0)), result) result = SUBPHRASE.sub(lambda m: '%s%s' % (