Various fixes to Title Casing. Fixes #7846 (Title Case doesn't quite work)

2025-07-09 03:04:10 -04:00 · 2010-12-10 12:27:06 -07:00 · 2010-12-10 12:27:06 -07:00 · 826221c0f5
commit 826221c0f5
parent dd2aa48916
2 changed files with 19 additions and 11 deletions
--- a/src/calibre/utils/icu.py
+++ b/src/calibre/utils/icu.py
@@ -77,6 +77,9 @@ def py_strcmp(a, b):
 def icu_case_sensitive_strcmp(collator, a, b):
    return collator.strcmp(a, b)

+def icu_capitalize(s):
+    s = lower(s)
+    return s.replace(s[0], upper(s[0]), 1)

 load_icu()
 load_collator()
@@ -104,10 +107,6 @@ lower = (lambda s: s.lower()) if _icu_not_ok else \
 title_case = (lambda s: s.title()) if _icu_not_ok else \
    partial(_icu.title, get_locale())

-def icu_capitalize(s):
-    s = lower(s)
-    return s.replace(s[0], upper(s[0]))
-
 capitalize = (lambda s: s.capitalize()) if _icu_not_ok else \
    (lambda s: icu_capitalize(s))

@@ -226,12 +225,16 @@ pêché'''
    test_strcmp(german + french)

    print '\nTesting case transforms in current locale'
-    for x in ('a', 'Alice\'s code'):
+    from calibre.utils.titlecase import titlecase
+    for x in ('a', 'Alice\'s code', 'macdonald\'s machine', '02 the wars'):
        print 'Upper:     ', x, '->', 'py:', x.upper().encode('utf-8'), 'icu:', upper(x).encode('utf-8')
        print 'Lower:     ', x, '->', 'py:', x.lower().encode('utf-8'), 'icu:', lower(x).encode('utf-8')
-        print 'Title:     ', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8')
+        print 'Title:     ', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8'), 'titlecase:', titlecase(x).encode('utf-8')
        print 'Capitalize:', x, '->', 'py:', x.capitalize().encode('utf-8'), 'icu:', capitalize(x).encode('utf-8')
        print

 # }}}

+if __name__ == '__main__':
+    test()
+
--- a/src/calibre/utils/titlecase.py
+++ b/src/calibre/utils/titlecase.py
@@ -23,11 +23,12 @@ UC_ELSEWHERE = re.compile(r'[%s]*?[a-zA-Z]+[A-Z]+?' % PUNCT)
 CAPFIRST = re.compile(r"^[%s]*?([A-Za-z])" % PUNCT)
 SMALL_FIRST = re.compile(r'^([%s]*)(%s)\b' % (PUNCT, SMALL), re.I)
 SMALL_LAST = re.compile(r'\b(%s)[%s]?$' % (SMALL, PUNCT), re.I)
+SMALL_AFTER_NUM = re.compile(r'(\d+\s+)(a|an|the)\b', re.I)
 SUBPHRASE = re.compile(r'([:.;?!][ ])(%s)' % SMALL)
 APOS_SECOND = re.compile(r"^[dol]{1}['‘]{1}[a-z]+$", re.I)
 ALL_CAPS = re.compile(r'^[A-Z\s%s]+$' % PUNCT)
 UC_INITIALS = re.compile(r"^(?:[A-Z]{1}\.{1}|[A-Z]{1}\.{1}[A-Z]{1})+$")
-MAC_MC = re.compile(r"^([Mm]a?c)(\w+)")
+MAC_MC = re.compile(r"^([Mm]a?c)(.+)")

 def titlecase(text):

@@ -44,7 +45,7 @@ def titlecase(text):

    all_caps = ALL_CAPS.match(text)

-    words = re.split('\s', text)
+    words = re.split('\s+', text)
    line = []
    for word in words:
        if all_caps:
@@ -55,8 +56,8 @@ def titlecase(text):
                word = icu_lower(word)

        if APOS_SECOND.match(word):
-            word = word.replace(word[0], icu_upper(word[0]))
-            word = word.replace(word[2], icu_upper(word[2]))
+            word = word.replace(word[0], icu_upper(word[0]), 1)
+            word = word[:2] + icu_upper(word[2]) + word[3:]
            line.append(word)
            continue
        if INLINE_PERIOD.search(word) or UC_ELSEWHERE.match(word):
@@ -67,7 +68,7 @@ def titlecase(text):
            continue

        match = MAC_MC.match(word)
-        if match:
+        if match and not match.group(2).startswith('hin'):
            line.append("%s%s" % (capitalize(match.group(1)),
                                  capitalize(match.group(2))))
            continue
@@ -85,6 +86,10 @@ def titlecase(text):
        capitalize(m.group(2))
    ), result)

+    result = SMALL_AFTER_NUM.sub(lambda m: '%s%s' % (m.group(1),
+        capitalize(m.group(2))
+    ), result)
+
    result = SMALL_LAST.sub(lambda m: capitalize(m.group(0)), result)

    result = SUBPHRASE.sub(lambda m: '%s%s' % (