Various fixes to Title Casing. Fixes #7846 (Title Case doesn't quite work)

This commit is contained in:
Kovid Goyal 2010-12-10 12:27:06 -07:00
parent dd2aa48916
commit 826221c0f5
2 changed files with 19 additions and 11 deletions

View File

@ -77,6 +77,9 @@ def py_strcmp(a, b):
def icu_case_sensitive_strcmp(collator, a, b):
return collator.strcmp(a, b)
def icu_capitalize(s):
s = lower(s)
return s.replace(s[0], upper(s[0]), 1)
load_icu()
load_collator()
@ -104,10 +107,6 @@ lower = (lambda s: s.lower()) if _icu_not_ok else \
title_case = (lambda s: s.title()) if _icu_not_ok else \
partial(_icu.title, get_locale())
def icu_capitalize(s):
s = lower(s)
return s.replace(s[0], upper(s[0]))
capitalize = (lambda s: s.capitalize()) if _icu_not_ok else \
(lambda s: icu_capitalize(s))
@ -226,12 +225,16 @@ pêché'''
test_strcmp(german + french)
print '\nTesting case transforms in current locale'
for x in ('a', 'Alice\'s code'):
from calibre.utils.titlecase import titlecase
for x in ('a', 'Alice\'s code', 'macdonald\'s machine', '02 the wars'):
print 'Upper: ', x, '->', 'py:', x.upper().encode('utf-8'), 'icu:', upper(x).encode('utf-8')
print 'Lower: ', x, '->', 'py:', x.lower().encode('utf-8'), 'icu:', lower(x).encode('utf-8')
print 'Title: ', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8')
print 'Title: ', x, '->', 'py:', x.title().encode('utf-8'), 'icu:', title_case(x).encode('utf-8'), 'titlecase:', titlecase(x).encode('utf-8')
print 'Capitalize:', x, '->', 'py:', x.capitalize().encode('utf-8'), 'icu:', capitalize(x).encode('utf-8')
print
# }}}
if __name__ == '__main__':
test()

View File

@ -23,11 +23,12 @@ UC_ELSEWHERE = re.compile(r'[%s]*?[a-zA-Z]+[A-Z]+?' % PUNCT)
CAPFIRST = re.compile(r"^[%s]*?([A-Za-z])" % PUNCT)
SMALL_FIRST = re.compile(r'^([%s]*)(%s)\b' % (PUNCT, SMALL), re.I)
SMALL_LAST = re.compile(r'\b(%s)[%s]?$' % (SMALL, PUNCT), re.I)
SMALL_AFTER_NUM = re.compile(r'(\d+\s+)(a|an|the)\b', re.I)
SUBPHRASE = re.compile(r'([:.;?!][ ])(%s)' % SMALL)
APOS_SECOND = re.compile(r"^[dol]{1}[']{1}[a-z]+$", re.I)
ALL_CAPS = re.compile(r'^[A-Z\s%s]+$' % PUNCT)
UC_INITIALS = re.compile(r"^(?:[A-Z]{1}\.{1}|[A-Z]{1}\.{1}[A-Z]{1})+$")
MAC_MC = re.compile(r"^([Mm]a?c)(\w+)")
MAC_MC = re.compile(r"^([Mm]a?c)(.+)")
def titlecase(text):
@ -44,7 +45,7 @@ def titlecase(text):
all_caps = ALL_CAPS.match(text)
words = re.split('\s', text)
words = re.split('\s+', text)
line = []
for word in words:
if all_caps:
@ -55,8 +56,8 @@ def titlecase(text):
word = icu_lower(word)
if APOS_SECOND.match(word):
word = word.replace(word[0], icu_upper(word[0]))
word = word.replace(word[2], icu_upper(word[2]))
word = word.replace(word[0], icu_upper(word[0]), 1)
word = word[:2] + icu_upper(word[2]) + word[3:]
line.append(word)
continue
if INLINE_PERIOD.search(word) or UC_ELSEWHERE.match(word):
@ -67,7 +68,7 @@ def titlecase(text):
continue
match = MAC_MC.match(word)
if match:
if match and not match.group(2).startswith('hin'):
line.append("%s%s" % (capitalize(match.group(1)),
capitalize(match.group(2))))
continue
@ -85,6 +86,10 @@ def titlecase(text):
capitalize(m.group(2))
), result)
result = SMALL_AFTER_NUM.sub(lambda m: '%s%s' % (m.group(1),
capitalize(m.group(2))
), result)
result = SMALL_LAST.sub(lambda m: capitalize(m.group(0)), result)
result = SUBPHRASE.sub(lambda m: '%s%s' % (