From a300879f7055ed0c9ddc5cbe44484fdfc940f05c Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 19 Mar 2011 15:32:05 -0400 Subject: [PATCH] TXT Input: Texttile: Simplify code for handing macros and glyphs. --- src/calibre/ebooks/textile/functions.py | 408 +++++++----------------- 1 file changed, 114 insertions(+), 294 deletions(-) diff --git a/src/calibre/ebooks/textile/functions.py b/src/calibre/ebooks/textile/functions.py index 891211de30..b37cd4aab8 100755 --- a/src/calibre/ebooks/textile/functions.py +++ b/src/calibre/ebooks/textile/functions.py @@ -121,97 +121,113 @@ class Textile(object): btag = ('bq', 'bc', 'notextile', 'pre', 'h[1-6]', 'fn\d+', 'p') btag_lite = ('bq', 'bc', 'p') - glyph_defaults = ( - ('mac_cent', '¢'), - ('mac_pound', '£'), - ('mac_yen', '¥'), - ('mac_quarter', '¼'), - ('mac_half', '½'), - ('mac_three-quarter', '¾'), - ('mac_cA-grave', 'À'), - ('mac_cA-acute', 'Á'), - ('mac_cA-circumflex', 'Â'), - ('mac_cA-tilde', 'Ã'), - ('mac_cA-diaeresis', 'Ä'), - ('mac_cA-ring', 'Å'), - ('mac_cAE', 'Æ'), - ('mac_cC-cedilla', 'Ç'), - ('mac_cE-grave', 'È'), - ('mac_cE-acute', 'É'), - ('mac_cE-circumflex', 'Ê'), - ('mac_cE-diaeresis', 'Ë'), - ('mac_cI-grave', 'Ì'), - ('mac_cI-acute', 'Í'), - ('mac_cI-circumflex', 'Î'), - ('mac_cI-diaeresis', 'Ï'), - ('mac_cEth', 'Ð'), - ('mac_cN-tilde', 'Ñ'), - ('mac_cO-grave', 'Ò'), - ('mac_cO-acute', 'Ó'), - ('mac_cO-circumflex', 'Ô'), - ('mac_cO-tilde', 'Õ'), - ('mac_cO-diaeresis', 'Ö'), - ('mac_cO-stroke', 'Ø'), - ('mac_cU-grave', 'Ù'), - ('mac_cU-acute', 'Ú'), - ('mac_cU-circumflex', 'Û'), - ('mac_cU-diaeresis', 'Ü'), - ('mac_cY-acute', 'Ý'), - ('mac_sa-grave', 'à'), - ('mac_sa-acute', 'á'), - ('mac_sa-circumflex', 'â'), - ('mac_sa-tilde', 'ã'), - ('mac_sa-diaeresis', 'ä'), - ('mac_sa-ring', 'å'), - ('mac_sae', 'æ'), - ('mac_sc-cedilla', 'ç'), - ('mac_se-grave', 'è'), - ('mac_se-acute', 'é'), - ('mac_se-circumflex', 'ê'), - ('mac_se-diaeresis', 'ë'), - ('mac_si-grave', 'ì'), - ('mac_si-acute', 'í'), - ('mac_si-circumflex', 'î'), - ('mac_si-diaeresis', 'ï'), - ('mac_sn-tilde', 'ñ'), - ('mac_so-grave', 'ò'), - ('mac_so-acute', 'ó'), - ('mac_so-circumflex', 'ô'), - ('mac_so-tilde', 'õ'), - ('mac_so-diaeresis', 'ö'), - ('mac_so-stroke', 'ø'), - ('mac_su-grave', 'ù'), - ('mac_su-acute', 'ú'), - ('mac_su-circumflex', 'û'), - ('mac_su-diaeresis', 'ü'), - ('mac_sy-acute', 'ý'), - ('mac_sy-diaeresis', 'ÿ'), - ('mac_cOE', 'Œ'), - ('mac_soe', 'œ'), - ('mac_bullet', '•'), - ('mac_franc', '₣'), - ('mac_lira', '₤'), - ('mac_rupee', '₨'), - ('mac_euro', '€'), - ('mac_spade', '♠'), - ('mac_club', '♣'), - ('mac_heart', '♥'), - ('mac_diamond', '♦'), - ('txt_dimension', '×'), - ('txt_quote_single_open', '‘'), - ('txt_quote_single_close', '’'), - ('txt_quote_double_open', '“'), - ('txt_quote_double_close', '”'), - ('txt_apostrophe', '’'), - ('txt_prime', '′'), - ('txt_prime_double', '″'), - ('txt_ellipsis', '…'), - ('txt_emdash', '—'), - ('txt_endash', '–'), - ('txt_trademark', '™'), - ('txt_registered', '®'), - ('txt_copyright', '©'), - ) + macro_defaults = [ + (re.compile(r'{(c\||\|c)}'), r'¢'), # cent + (re.compile(r'{(L-|-L)}'), r'£'), # pound + (re.compile(r'{(Y=|=Y)}'), r'¥'), # yen + (re.compile(r'{\(c\)}'), r'©'), # copyright + (re.compile(r'{\(r\)}'), r'®'), # registered + (re.compile(r'{(\+_|_\+)}'), r'±'), # plus-minus + (re.compile(r'{1/4}'), r'¼'), # quarter + (re.compile(r'{1/2}'), r'½'), # half + (re.compile(r'{3/4}'), r'¾'), # three-quarter + (re.compile(r'{(A`|`A)}'), r'À'), # A-acute + (re.compile(r'{(A\'|\'A)}'), r'Á'), # A-grave + (re.compile(r'{(A\^|\^A)}'), r'Â'), # A-circumflex + (re.compile(r'{(A~|~A)}'), r'Ã'), # A-tilde + (re.compile(r'{(A\"|\"A)}'), r'Ä'), # A-diaeresis + (re.compile(r'{(Ao|oA)}'), r'Å'), # A-ring + (re.compile(r'{(AE)}'), r'Æ'), # AE + (re.compile(r'{(C,|,C)}'), r'Ç'), # C-cedilla + (re.compile(r'{(E`|`E)}'), r'È'), # E-acute + (re.compile(r'{(E\'|\'E)}'), r'É'), # E-grave + (re.compile(r'{(E\^|\^E)}'), r'Ê'), # E-circumflex + (re.compile(r'{(E\"|\"E)}'), r'Ë'), # E-diaeresis + (re.compile(r'{(I`|`I)}'), r'Ì'), # I-acute + (re.compile(r'{(I\'|\'I)}'), r'Í'), # I-grave + (re.compile(r'{(I\^|\^I)}'), r'Î'), # I-circumflex + (re.compile(r'{(I\"|\"I)}'), r'Ï'), # I-diaeresis + (re.compile(r'{(D-|-D)}'), r'Ð'), # ETH + (re.compile(r'{(N~|~N)}'), r'Ñ'), # N-tilde + (re.compile(r'{(O`|`O)}'), r'Ò'), # O-acute + (re.compile(r'{(O\'|\'O)}'), r'Ó'), # O-grave + (re.compile(r'{(O\^|\^O)}'), r'Ô'), # O-circumflex + (re.compile(r'{(O~|~O)}'), r'Õ'), # O-tilde + (re.compile(r'{(O\"|\"O)}'), r'Ö'), # O-diaeresis + (re.compile(r'{x}'), r'×'), # dimension + (re.compile(r'{(O\/|\/O)}'), r'Ø'), # O-slash + (re.compile(r'{(U`|`U)}'), r'Ù'), # U-acute + (re.compile(r'{(U\'|\'U)}'), r'Ú'), # U-grave + (re.compile(r'{(U\^|\^U)}'), r'Û'), # U-circumflex + (re.compile(r'{(U\"|\"U)}'), r'Ü'), # U-diaeresis + (re.compile(r'{(Y\'|\'Y)}'), r'Ý'), # Y-grave + (re.compile(r'{sz}'), r'ß'), # sharp-s + (re.compile(r'{(a`|`a)}'), r'à'), # a-grave + (re.compile(r'{(a\'|\'a)}'), r'á'), # a-acute + (re.compile(r'{(a\^|\^a)}'), r'â'), # a-circumflex + (re.compile(r'{(a~|~a)}'), r'ã'), # a-tilde + (re.compile(r'{(a\"|\"a)}'), r'ä'), # a-diaeresis + (re.compile(r'{(ao|oa)}'), r'å'), # a-ring + (re.compile(r'{ae}'), r'æ'), # ae + (re.compile(r'{(c,|,c)}'), r'ç'), # c-cedilla + (re.compile(r'{(e`|`e)}'), r'è'), # e-grave + (re.compile(r'{(e\'|\'e)}'), r'é'), # e-acute + (re.compile(r'{(e\^|\^e)}'), r'ê'), # e-circumflex + (re.compile(r'{(e\"|\"e)}'), r'ë'), # e-diaeresis + (re.compile(r'{(i`|`i)}'), r'ì'), # i-grave + (re.compile(r'{(i\'|\'i)}'), r'í'), # i-acute + (re.compile(r'{(i\^|\^i)}'), r'î'), # i-circumflex + (re.compile(r'{(i\"|\"i)}'), r'ï'), # i-diaeresis + (re.compile(r'{(d-|-d)}'), r'ð'), # eth + (re.compile(r'{(n~|~n)}'), r'ñ'), # n-tilde + (re.compile(r'{(o`|`o)}'), r'ò'), # o-grave + (re.compile(r'{(o\'|\'o)}'), r'ó'), # o-acute + (re.compile(r'{(o\^|\^o)}'), r'ô'), # o-circumflex + (re.compile(r'{(o~|~o)}'), r'õ'), # o-tilde + (re.compile(r'{(o\"|\"o)}'), r'ö'), # o-diaeresis + (re.compile(r'{(o\/|\/o)}'), r'ø'), # o-stroke + (re.compile(r'{(u`|`u)}'), r'ù'), # u-grave + (re.compile(r'{(u\'|\'u)}'), r'ú'), # u-acute + (re.compile(r'{(u\^|\^u)}'), r'û'), # u-circumflex + (re.compile(r'{(u\"|\"u)}'), r'ü'), # u-diaeresis + (re.compile(r'{(y\'|\'y)}'), r'ý'), # y-acute + (re.compile(r'{(y\"|\"y)}'), r'ÿ'), # y-diaeresis + (re.compile(r'{OE}'), r'Œ'), # OE + (re.compile(r'{oe}'), r'œ'), # oe + (re.compile(r'{(S\^|\^S)}'), r'Š'), # Scaron + (re.compile(r'{(s\^|\^s)}'), r'š'), # scaron + (re.compile(r'{\*}'), r'•'), # bullet + (re.compile(r'{Fr}'), r'₣'), # Franc + (re.compile(r'{(L=|=L)}'), r'₤'), # Lira + (re.compile(r'{Rs}'), r'₨'), # Rupee + (re.compile(r'{(C=|=C)}'), r'€'), # euro + (re.compile(r'{tm}'), r'™'), # trademark + (re.compile(r'{spade}'), r'♠'), # spade + (re.compile(r'{club}'), r'♣'), # club + (re.compile(r'{heart}'), r'♥'), # heart + (re.compile(r'{diamond}'), r'♦'), # diamond + ] + glyph_defaults = [ + (re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), r'\1\2×\3'), # dimension sign + (re.compile(r'(\d+)\'', re.I), r'\1′'), # prime + (re.compile(r'(\d+)\"', re.I), r'\1″'), # prime-double + (re.compile(r"(\w)\'(\w)"), r'\1’\2'), # apostrophe's + (re.compile(r'(\s)\'(\d+\w?)\b(?!\')'), r'\1’\2'), # back in '88 + (re.compile(r'(\S)\'(?=\s|\'|<|$)'), r'\1’'), # single closing + (re.compile(r'\'/'), r'‘'), # single opening + (re.compile(r'(\")\"'), r'\1”'), # double closing - following another + (re.compile(r'(\S)\"(?=\s|\"|<|$)'), r'\1”'), # double closing + (re.compile(r'"'), r'“'), # double opening + (re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), r'\1'), # 3+ uppercase acronym + (re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'\1'), # 3+ uppercase + (re.compile(r'\b(\s{0,1})?\.{3}'), r'\1⁄'), # ellipsis + (re.compile(r'(\s?)--(\s?)'), r'\1—\2'), # em dash + (re.compile(r'\s-(?:\s|$)'), r' – '), # en dash + (re.compile(r'\b( ?)[([]TM[])]', re.I), r'\1™'), # trademark + (re.compile(r'\b( ?)[([]R[])]', re.I), r'\1®'), # registered + (re.compile(r'\b( ?)[([]C[])]', re.I), r'\1©'), # copyright + ] + def __init__(self, restricted=False, lite=False, noimage=False): """docstring for __init__""" @@ -673,211 +689,15 @@ class Textile(object): # fix: hackish text = re.sub(r'"\Z', '\" ', text) - glyph_search = ( - re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), # dimension sign - re.compile(r"(\w)\'(\w)"), # apostrophe's - re.compile(r'(\s)\'(\d+\w?)\b(?!\')'), # back in '88 - re.compile(r'(\S)\'(?=\s|'+self.pnct+'|<|$)'), # single closing - re.compile(r'\'/'), # single opening - re.compile(r'(\")\"'), # double closing - following another - re.compile(r'(\S)\"(?=\s|'+self.pnct+'|<|$)'), # double closing - re.compile(r'"'), # double opening - re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), # 3+ uppercase acronym - re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), # 3+ uppercase - re.compile(r'\b(\s{0,1})?\.{3}'), # ellipsis - re.compile(r'(\s?)--(\s?)'), # em dash - re.compile(r'\s-(?:\s|$)'), # en dash - re.compile(r'\b( ?)[([]TM[])]', re.I), # trademark - re.compile(r'\b( ?)[([]R[])]', re.I), # registered - re.compile(r'\b( ?)[([]C[])]', re.I) # copyright - ) - - glyph_replace = [x % dict(self.glyph_defaults) for x in ( - r'\1\2%(txt_dimension)s\3', # dimension sign - r'\1%(txt_apostrophe)s\2', # apostrophe's - r'\1%(txt_apostrophe)s\2', # back in '88 - r'\1%(txt_quote_single_close)s', # single closing - r'%(txt_quote_single_open)s', # single opening - r'\1%(txt_quote_double_close)s', # double closing - following another - r'\1%(txt_quote_double_close)s', # double closing - r'%(txt_quote_double_open)s', # double opening - r'\1', # 3+ uppercase acronym - r'\1', # 3+ uppercase - r'\1%(txt_ellipsis)s', # ellipsis - r'\1%(txt_emdash)s\2', # em dash - r' %(txt_endash)s ', # en dash - r'\1%(txt_trademark)s', # trademark - r'\1%(txt_registered)s', # registered - r'\1%(txt_copyright)s' # copyright - )] - - if re.search(r'{.+?}', text): - glyph_search += ( - re.compile(r'{(c\||\|c)}'), # cent - re.compile(r'{(L-|-L)}'), # pound - re.compile(r'{(Y=|=Y)}'), # yen - re.compile(r'{\(c\)}'), # copyright - re.compile(r'{\(r\)}'), # registered - re.compile(r'{1/4}'), # quarter - re.compile(r'{1/2}'), # half - re.compile(r'{3/4}'), # three-quarter - re.compile(r'{(A`|`A)}'), # 192; - re.compile(r'{(A\'|\'A)}'), # 193; - re.compile(r'{(A\^|\^A)}'), # 194; - re.compile(r'{(A~|~A)}'), # 195; - re.compile(r'{(A\"|\"A)}'), # 196; - re.compile(r'{(Ao|oA)}'), # 197; - re.compile(r'{(AE)}'), # 198; - re.compile(r'{(C,|,C)}'), # 199; - re.compile(r'{(E`|`E)}'), # 200; - re.compile(r'{(E\'|\'E)}'), # 201; - re.compile(r'{(E\^|\^E)}'), # 202; - re.compile(r'{(E\"|\"E)}'), # 203; - re.compile(r'{(I`|`I)}'), # 204; - re.compile(r'{(I\'|\'I)}'), # 205; - re.compile(r'{(I\^|\^I)}'), # 206; - re.compile(r'{(I\"|\"I)}'), # 207; - re.compile(r'{(D-|-D)}'), # 208; - re.compile(r'{(N~|~N)}'), # 209; - re.compile(r'{(O`|`O)}'), # 210; - re.compile(r'{(O\'|\'O)}'), # 211; - re.compile(r'{(O\^|\^O)}'), # 212; - re.compile(r'{(O~|~O)}'), # 213; - re.compile(r'{(O\"|\"O)}'), # 214; - re.compile(r'{(O\/|\/O)}'), # 215; - re.compile(r'{(U`|`U)}'), # 216; - re.compile(r'{(U\'|\'U)}'), # 217; - re.compile(r'{(U\^|\^U)}'), # 218; - re.compile(r'{(U\"|\"U)}'), # 219; - re.compile(r'{(Y\'|\'Y)}'), # 220; - re.compile(r'{(a`|`a)}'), # a-grace - re.compile(r'{(a\'|\'a)}'), # a-acute - re.compile(r'{(a\^|\^a)}'), # a-circumflex - re.compile(r'{(a~|~a)}'), # a-tilde - re.compile(r'{(a\"|\"a)}'), # a-diaeresis - re.compile(r'{(ao|oa)}'), # a-ring - re.compile(r'{ae}'), # ae - re.compile(r'{(c,|,c)}'), # c-cedilla - re.compile(r'{(e`|`e)}'), # e-grace - re.compile(r'{(e\'|\'e)}'), # e-acute - re.compile(r'{(e\^|\^e)}'), # e-circumflex - re.compile(r'{(e\"|\"e)}'), # e-diaeresis - re.compile(r'{(i`|`i)}'), # i-grace - re.compile(r'{(i\'|\'i)}'), # i-acute - re.compile(r'{(i\^|\^i)}'), # i-circumflex - re.compile(r'{(i\"|\"i)}'), # i-diaeresis - re.compile(r'{(n~|~n)}'), # n-tilde - re.compile(r'{(o`|`o)}'), # o-grace - re.compile(r'{(o\'|\'o)}'), # o-acute - re.compile(r'{(o\^|\^o)}'), # o-circumflex - re.compile(r'{(o~|~o)}'), # o-tilde - re.compile(r'{(o\"|\"o)}'), # o-diaeresis - re.compile(r'{(o\/|\/o)}'), # o-stroke - re.compile(r'{(u`|`u)}'), # u-grace - re.compile(r'{(u\'|\'u)}'), # u-acute - re.compile(r'{(u\^|\^u)}'), # u-circumflex - re.compile(r'{(u\"|\"u)}'), # u-diaeresis - re.compile(r'{(y\'|\'y)}'), # y-acute - re.compile(r'{(y\"|\"y)}'), # y-diaeresis - re.compile(r'{OE}'), # y-diaeresis - re.compile(r'{oe}'), # y-diaeresis - re.compile(r'{\*}'), # bullet - re.compile(r'{Fr}'), # Franc - re.compile(r'{(L=|=L)}'), # Lira - re.compile(r'{Rs}'), # Rupee - re.compile(r'{(C=|=C)}'), # euro - re.compile(r'{tm}'), # euro - re.compile(r'{spade}'), # spade - re.compile(r'{club}'), # club - re.compile(r'{heart}'), # heart - re.compile(r'{diamond}') # diamond - ) - - glyph_replace += [x % dict(self.glyph_defaults) for x in ( - r'%(mac_cent)s', # cent - r'%(mac_pound)s', # pound - r'%(mac_yen)s', # yen - r'%(txt_copyright)s', # copyright - r'%(txt_registered)s', # registered - r'%(mac_quarter)s', # quarter - r'%(mac_half)s', # half - r'%(mac_three-quarter)s', # three-quarter - r'%(mac_cA-grave)s', # 192; - r'%(mac_cA-acute)s', # 193; - r'%(mac_cA-circumflex)s', # 194; - r'%(mac_cA-tilde)s', # 195; - r'%(mac_cA-diaeresis)s', # 196; - r'%(mac_cA-ring)s', # 197; - r'%(mac_cAE)s', # 198; - r'%(mac_cC-cedilla)s', # 199; - r'%(mac_cE-grave)s', # 200; - r'%(mac_cE-acute)s', # 201; - r'%(mac_cE-circumflex)s', # 202; - r'%(mac_cE-diaeresis)s', # 203; - r'%(mac_cI-grave)s', # 204; - r'%(mac_cI-acute)s', # 205; - r'%(mac_cI-circumflex)s', # 206; - r'%(mac_cI-diaeresis)s', # 207; - r'%(mac_cEth)s', # 208; - r'%(mac_cN-tilde)s', # 209; - r'%(mac_cO-grave)s', # 210; - r'%(mac_cO-acute)s', # 211; - r'%(mac_cO-circumflex)s', # 212; - r'%(mac_cO-tilde)s', # 213; - r'%(mac_cO-diaeresis)s', # 214; - r'%(mac_cO-stroke)s', # 216; - r'%(mac_cU-grave)s', # 217; - r'%(mac_cU-acute)s', # 218; - r'%(mac_cU-circumflex)s', # 219; - r'%(mac_cU-diaeresis)s', # 220; - r'%(mac_cY-acute)s', # 221; - r'%(mac_sa-grave)s', # 224; - r'%(mac_sa-acute)s', # 225; - r'%(mac_sa-circumflex)s', # 226; - r'%(mac_sa-tilde)s', # 227; - r'%(mac_sa-diaeresis)s', # 228; - r'%(mac_sa-ring)s', # 229; - r'%(mac_sae)s', # 230; - r'%(mac_sc-cedilla)s', # 231; - r'%(mac_se-grave)s', # 232; - r'%(mac_se-acute)s', # 233; - r'%(mac_se-circumflex)s', # 234; - r'%(mac_se-diaeresis)s', # 235; - r'%(mac_si-grave)s', # 236; - r'%(mac_si-acute)s', # 237; - r'%(mac_si-circumflex)s', # 238; - r'%(mac_si-diaeresis)s', # 239; - r'%(mac_sn-tilde)s', # 241; - r'%(mac_so-grave)s', # 242; - r'%(mac_so-acute)s', # 243; - r'%(mac_so-circumflex)s', # 244; - r'%(mac_so-tilde)s', # 245; - r'%(mac_so-diaeresis)s', # 246; - r'%(mac_so-stroke)s', # 248; - r'%(mac_su-grave)s', # 249; - r'%(mac_su-acute)s', # 250; - r'%(mac_su-circumflex)s', # 251; - r'%(mac_su-diaeresis)s', # 252; - r'%(mac_sy-acute)s', # 253; - r'%(mac_sy-diaeresis)s', # 255; - r'%(mac_cOE)s', # 338; - r'%(mac_soe)s', # 339; - r'%(mac_bullet)s', # bullet - r'%(mac_franc)s', # franc - r'%(mac_lira)s', # lira - r'%(mac_rupee)s', # rupee - r'%(mac_euro)s', # euro - r'%(txt_trademark)s', # trademark - r'%(mac_spade)s', # spade - r'%(mac_club)s', # club - r'%(mac_heart)s', # heart - r'%(mac_diamond)s' # diamond - )] - result = [] for line in re.compile(r'(<.*?>)', re.U).split(text): if not re.search(r'<.*>', line): - for s, r in zip(glyph_search, glyph_replace): + rules = [] + if re.search(r'{.+?}', line): + rules = self.macro_defaults + self.glyph_defaults + else: + rules = self.glyph_defaults + for s, r in rules: line = s.sub(r, line) result.append(line) return ''.join(result) @@ -1045,7 +865,7 @@ class Textile(object): 'hello span strong and bold goodbye' """ qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^') - pnct = ".,\"'?!;:" + pnct = ".,\"'?!;:()" for qtag in qtags: pattern = re.compile(r"""