Title casing: Fix presence of some non-English characters or smart punctuation causing all-caps text to not be properly lowercased

This commit is contained in:
Kovid Goyal 2013-12-22 08:47:51 +05:30
parent f588e2d9e2
commit 2f711f84fa

View File

@@ -9,13 +9,13 @@ License: http://www.opensource.org/licenses/mit-license.php
import re
from calibre.utils.icu import capitalize
from calibre.utils.icu import capitalize, upper
__all__ = ['titlecase']
__version__ = '0.5'
SMALL = 'a|an|and|as|at|but|by|en|for|if|in|of|on|or|the|to|v\.?|via|vs\.?'
PUNCT = r"""!"#$%&'()*+,\-‒–—―./:;?@[\\\]_`{|}~"""
PUNCT = r"""!"#$%&'()*+,\-‒–—―./:;?@[\\\]_`{|}~"""
SMALL_WORDS = re.compile(r'^(%s)$' % SMALL, re.I)
INLINE_PERIOD = re.compile(r'[a-z][.][a-z]', re.I)
@@ -26,7 +26,6 @@ SMALL_LAST = re.compile(r'\b(%s)[%s]?$' % (SMALL, PUNCT), re.I|re.U)
SMALL_AFTER_NUM = re.compile(r'(\d+\s+)(a|an|the)\b', re.I|re.U)
SUBPHRASE = re.compile(r'([:.;?!][ ])(%s)' % SMALL)
APOS_SECOND = re.compile(r"^[dol]{1}[']{1}[a-z]+$", re.I)
ALL_CAPS = re.compile(r'^[A-Z0-9\s%s]+$' % PUNCT)
UC_INITIALS = re.compile(r"^(?:[A-Z]{1}\.{1}|[A-Z]{1}\.{1}[A-Z]{1})+$")
_lang = None
@@ -51,7 +50,7 @@ def titlecase(text):
"""
all_caps = ALL_CAPS.match(text)
all_caps = upper(text) == text
words = re.split('\s+', text)
line = []