Fix #822744 (Unwrap problem - some central european chars missing)

This commit is contained in:
Kovid Goyal 2011-08-25 16:53:57 -06:00
commit ca9048cdae
4 changed files with 113 additions and 20 deletions

View File

@ -343,6 +343,7 @@ class HTMLPreProcessor(object):
(re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
(re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
(re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
# ` with letter before
(re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'),
(re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'),
@ -364,10 +365,14 @@ class HTMLPreProcessor(object):
(re.compile(u'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'É'),
(re.compile(u'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'í'),
(re.compile(u'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Í'),
(re.compile(u'´\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: u'ĺ'),
(re.compile(u'´\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: u'Ĺ'),
(re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'),
(re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'),
(re.compile(u'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ń'),
(re.compile(u'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ń'),
(re.compile(u'´\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: u'ŕ'),
(re.compile(u'´\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: u'Ŕ'),
(re.compile(u'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'ś'),
(re.compile(u'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Ś'),
(re.compile(u'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ú'),
@ -400,7 +405,31 @@ class HTMLPreProcessor(object):
# ˙
(re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
(re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
# ˇ
(re.compile(u'ˇ\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'č'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Č'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*d', re.UNICODE), lambda match: u'ď'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*D', re.UNICODE), lambda match: u'Ď'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ě'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ě'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: u'ľ'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: u'Ľ'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ň'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ň'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: u'ř'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: u'Ř'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'š'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Š'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*t', re.UNICODE), lambda match: u'ť'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*T', re.UNICODE), lambda match: u'Ť'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ž'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ž'),
# °
(re.compile(u'°\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ů'),
(re.compile(u'°\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ů'),
# If pdf printed from a browser then the header/footer has a reliable pattern
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
@ -510,7 +539,7 @@ class HTMLPreProcessor(object):
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
end_rules.append(
# Un wrap using punctuation
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:

View File

@ -315,9 +315,11 @@ class HeuristicProcessor(object):
supports a range of html markup and text files
'''
# define the pieces of the regex
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßôľščťžňďěřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
soft_hyphen = u"\xad"
dash = u"\x2d" # some ocrs doesn't convert dashes to hyphens
line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
@ -326,19 +328,23 @@ class HeuristicProcessor(object):
unwrap_regex = lookahead+line_ending+blanklines+line_opening
em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
dash_unwrap_regex = dash+line_ending+blanklines+line_opening
if format == 'txt':
unwrap_regex = lookahead+txt_line_wrap
em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
shy_unwrap_regex = soft_hyphen+txt_line_wrap
dash_unwrap_regex = dash+txt_line_wrap
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
dash_unwrap = re.compile(u"%s" % dash_unwrap_regex, re.UNICODE)
content = unwrap.sub(' ', content)
content = em_en_unwrap.sub('', content)
content = shy_unwrap.sub('', content)
content = dash_unwrap.sub('', content)
return content
def txt_process(self, match):

View File

@ -196,10 +196,40 @@ class Textile(object):
(re.compile(r'{(u\"|\"u)}'), r'&#252;'), # u-diaeresis
(re.compile(r'{(y\'|\'y)}'), r'&#253;'), # y-acute
(re.compile(r'{(y\"|\"y)}'), r'&#255;'), # y-diaeresis
(re.compile(r'{(C\ˇ|\ˇC)}'), r'&#268;'), # C-caron
(re.compile(r'{(c\ˇ|\ˇc)}'), r'&#269;'), # c-caron
(re.compile(r'{(D\ˇ|\ˇD)}'), r'&#270;'), # D-caron
(re.compile(r'{(d\ˇ|\ˇd)}'), r'&#271;'), # d-caron
(re.compile(r'{(E\ˇ|\ˇE)}'), r'&#282;'), # E-caron
(re.compile(r'{(e\ˇ|\ˇe)}'), r'&#283;'), # e-caron
(re.compile(r'{(L\'|\'L)}'), r'&#313;'), # L-acute
(re.compile(r'{(l\'|\'l)}'), r'&#314;'), # l-acute
(re.compile(r'{(L\ˇ|\ˇL)}'), r'&#317;'), # L-caron
(re.compile(r'{(l\ˇ|\ˇl)}'), r'&#318;'), # l-caron
(re.compile(r'{(N\ˇ|\ˇN)}'), r'&#327;'), # N-caron
(re.compile(r'{(n\ˇ|\ˇn)}'), r'&#328;'), # n-caron
(re.compile(r'{OE}'), r'&#338;'), # OE
(re.compile(r'{oe}'), r'&#339;'), # oe
(re.compile(r'{(S\^|\^S)}'), r'&Scaron;'), # Scaron
(re.compile(r'{(s\^|\^s)}'), r'&scaron;'), # scaron
(re.compile(r'{(R\'|\'R)}'), r'&#340;'), # R-acute
(re.compile(r'{(r\'|\'r)}'), r'&#341;'), # r-acute
(re.compile(r'{(R\ˇ|\ˇR)}'), r'&#344;'), # R-caron
(re.compile(r'{(r\ˇ|\ˇr)}'), r'&#345;'), # r-caron
(re.compile(r'{(S\^|\^S)}'), r'&#348;'), # S-circumflex
(re.compile(r'{(s\^|\^s)}'), r'&#349;'), # s-circumflex
(re.compile(r'{(S\ˇ|\ˇS)}'), r'&#352;'), # S-caron
(re.compile(r'{(s\ˇ|\ˇs)}'), r'&#353;'), # s-caron
(re.compile(r'{(T\ˇ|\ˇT)}'), r'&#356;'), # T-caron
(re.compile(r'{(t\ˇ|\ˇt)}'), r'&#357;'), # t-caron
(re.compile(r'{(U\°|\°U)}'), r'&#366;'), # U-ring
(re.compile(r'{(u\°|\°u)}'), r'&#367;'), # u-ring
(re.compile(r'{(Z\ˇ|\ˇZ)}'), r'&#381;'), # Z-caron
(re.compile(r'{(z\ˇ|\ˇz)}'), r'&#382;'), # z-caron
(re.compile(r'{\*}'), r'&#8226;'), # bullet
(re.compile(r'{Fr}'), r'&#8355;'), # Franc
(re.compile(r'{(L=|=L)}'), r'&#8356;'), # Lira
@ -219,13 +249,13 @@ class Textile(object):
]
glyph_defaults = [
(re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), r'\1\2&#215;\3'), # dimension sign
(re.compile(r'(\d+)\'(\s)', re.I), r'\1&#8242;\2'), # prime
(re.compile(r'(\d+)\"(\s)', re.I), r'\1&#8243;\2'), # prime-double
(re.compile(r'(\d+)\'(\s)', re.I), r'\1&#8242;\2'), # prime
(re.compile(r'(\d+)\"(\s)', re.I), r'\1&#8243;\2'), # prime-double
(re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), r'<acronym title="\2">\1</acronym>'), # 3+ uppercase acronym
(re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'<span class="caps">\1</span>'), # 3+ uppercase
(re.compile(r'\b(\s{0,1})?\.{3}'), r'\1&#8230;'), # ellipsis
(re.compile(r'^[\*_-]{3,}$', re.M), r'<hr />'), # <hr> scene-break
(re.compile(r'(^|[^-])--([^-]|$)'), r'\1&#8212;\2'), # em dash
(re.compile(r'(^|[^-])--([^-]|$)'), r'\1&#8212;\2'), # em dash
(re.compile(r'\s-(?:\s|$)'), r' &#8211; '), # en dash
(re.compile(r'\b( ?)[([]TM[])]', re.I), r'\1&#8482;'), # trademark
(re.compile(r'\b( ?)[([]R[])]', re.I), r'\1&#174;'), # registered

View File

@ -26,7 +26,7 @@ def unsmarten(txt):
txt = re.sub(u'&#190;|&frac34;|¾', r'{3/4}', txt) # three-quarter
txt = re.sub(u'&#192;|&Agrave;|À', r'{A`)}', txt) # A-grave
txt = re.sub(u'&#193;|&Aacute;|Á', r"{A'}", txt) # A-acute
txt = re.sub(u'&#194;|&Acirc;|Â', r'{A^}', txt) # A-circumflex
txt = re.sub(u'&#194;|&Acirc;|Â', r'{A^}', txt) # A-circumflex
txt = re.sub(u'&#195;|&Atilde;|Ã', r'{A~}', txt) # A-tilde
txt = re.sub(u'&#196;|&Auml;|Ä', r'{A"}', txt) # A-umlaut
txt = re.sub(u'&#197;|&Aring;|Å', r'{Ao}', txt) # A-ring
@ -34,30 +34,30 @@ def unsmarten(txt):
txt = re.sub(u'&#199;|&Ccedil;|Ç', r'{C,}', txt) # C-cedilla
txt = re.sub(u'&#200;|&Egrave;|È', r'{E`}', txt) # E-grave
txt = re.sub(u'&#201;|&Eacute;|É', r"{E'}", txt) # E-acute
txt = re.sub(u'&#202;|&Ecirc;|Ê', r'{E^}', txt) # E-circumflex
txt = re.sub(u'&#202;|&Ecirc;|Ê', r'{E^}', txt) # E-circumflex
txt = re.sub(u'&#203;|&Euml;|Ë', r'{E"}', txt) # E-umlaut
txt = re.sub(u'&#204;|&Igrave;|Ì', r'{I`}', txt) # I-grave
txt = re.sub(u'&#205;|&Iacute;|Í', r"{I'}", txt) # I-acute
txt = re.sub(u'&#206;|&Icirc;|Î', r'{I^}', txt) # I-circumflex
txt = re.sub(u'&#206;|&Icirc;|Î', r'{I^}', txt) # I-circumflex
txt = re.sub(u'&#207;|&Iuml;|Ï', r'{I"}', txt) # I-umlaut
txt = re.sub(u'&#208;|&ETH;|Ð', r'{D-}', txt) # ETH
txt = re.sub(u'&#209;|&Ntilde;|Ñ', r'{N~}', txt) # N-tilde
txt = re.sub(u'&#210;|&Ograve;|Ò', r'{O`}', txt) # O-grave
txt = re.sub(u'&#211;|&Oacute;|Ó', r"{O'}", txt) # O-acute
txt = re.sub(u'&#212;|&Ocirc;|Ô', r'{O^}', txt) # O-circumflex
txt = re.sub(u'&#212;|&Ocirc;|Ô', r'{O^}', txt) # O-circumflex
txt = re.sub(u'&#213;|&Otilde;|Õ', r'{O~}', txt) # O-tilde
txt = re.sub(u'&#214;|&Ouml;|Ö', r'{O"}', txt) # O-umlaut
txt = re.sub(u'&#215;|&times;|×', r'{x}', txt) # dimension
txt = re.sub(u'&#216;|&Oslash;|Ø', r'{O/}', txt) # O-slash
txt = re.sub(u'&#217;|&Ugrave;|Ù', r"{U`}", txt) # U-grave
txt = re.sub(u'&#218;|&Uacute;|Ú', r"{U'}", txt) # U-acute
txt = re.sub(u'&#219;|&Ucirc;|Û', r'{U^}', txt) # U-circumflex
txt = re.sub(u'&#219;|&Ucirc;|Û', r'{U^}', txt) # U-circumflex
txt = re.sub(u'&#220;|&Uuml;|Ü', r'{U"}', txt) # U-umlaut
txt = re.sub(u'&#221;|&Yacute;|Ý', r"{Y'}", txt) # Y-grave
txt = re.sub(u'&#223;|&szlig;|ß', r'{sz}', txt) # sharp-s
txt = re.sub(u'&#224;|&agrave;|à', r'{a`}', txt) # a-grave
txt = re.sub(u'&#225;|&aacute;|á', r"{a'}", txt) # a-acute
txt = re.sub(u'&#226;|&acirc;|â', r'{a^}', txt) # a-circumflex
txt = re.sub(u'&#226;|&acirc;|â', r'{a^}', txt) # a-circumflex
txt = re.sub(u'&#227;|&atilde;|ã', r'{a~}', txt) # a-tilde
txt = re.sub(u'&#228;|&auml;|ä', r'{a"}', txt) # a-umlaut
txt = re.sub(u'&#229;|&aring;|å', r'{ao}', txt) # a-ring
@ -65,30 +65,58 @@ def unsmarten(txt):
txt = re.sub(u'&#231;|&ccedil;|ç', r'{c,}', txt) # c-cedilla
txt = re.sub(u'&#232;|&egrave;|è', r'{e`}', txt) # e-grave
txt = re.sub(u'&#233;|&eacute;|é', r"{e'}", txt) # e-acute
txt = re.sub(u'&#234;|&ecirc;|ê', r'{e^}', txt) # e-circumflex
txt = re.sub(u'&#234;|&ecirc;|ê', r'{e^}', txt) # e-circumflex
txt = re.sub(u'&#235;|&euml;|ë', r'{e"}', txt) # e-umlaut
txt = re.sub(u'&#236;|&igrave;|ì', r'{i`}', txt) # i-grave
txt = re.sub(u'&#237;|&iacute;|í', r"{i'}", txt) # i-acute
txt = re.sub(u'&#238;|&icirc;|î', r'{i^}', txt) # i-circumflex
txt = re.sub(u'&#238;|&icirc;|î', r'{i^}', txt) # i-circumflex
txt = re.sub(u'&#239;|&iuml;|ï', r'{i"}', txt) # i-umlaut
txt = re.sub(u'&#240;|&eth;|ð', r'{d-}', txt) # eth
txt = re.sub(u'&#241;|&ntilde;|ñ', r'{n~}', txt) # n-tilde
txt = re.sub(u'&#242;|&ograve;|ò', r'{o`}', txt) # o-grave
txt = re.sub(u'&#243;|&oacute;|ó', r"{o'}", txt) # o-acute
txt = re.sub(u'&#244;|&ocirc;|ô', r'{o^}', txt) # o-circumflex
txt = re.sub(u'&#244;|&ocirc;|ô', r'{o^}', txt) # o-circumflex
txt = re.sub(u'&#245;|&otilde;|õ', r'{o~}', txt) # o-tilde
txt = re.sub(u'&#246;|&ouml;|ö', r'{o"}', txt) # o-umlaut
txt = re.sub(u'&#248;|&oslash;|ø', r'{o/}', txt) # o-stroke
txt = re.sub(u'&#249;|&ugrave;|ù', r'{u`}', txt) # u-grave
txt = re.sub(u'&#250;|&uacute;|ú', r"{u'}", txt) # u-acute
txt = re.sub(u'&#251;|&ucirc;|û', r'{u^}', txt) # u-circumflex
txt = re.sub(u'&#251;|&ucirc;|û', r'{u^}', txt) # u-circumflex
txt = re.sub(u'&#252;|&uuml;|ü', r'{u"}', txt) # u-umlaut
txt = re.sub(u'&#253;|&yacute;|ý', r"{y'}", txt) # y-acute
txt = re.sub(u'&#255;|&yuml;|ÿ', r'{y"}', txt) # y-umlaut
txt = re.sub(u'&#268;|&Ccaron;|Č', r'{Cˇ}', txt) # C-caron
txt = re.sub(u'&#269;|&ccaron;|č', r'{cˇ}', txt) # c-caron
txt = re.sub(u'&#270;|&Dcaron;|Ď', r'{Dˇ}', txt) # D-caron
txt = re.sub(u'&#271;|&dcaron;|ď', r'{dˇ}', txt) # d-caron
txt = re.sub(u'&#282;|&Ecaron;|Ě', r'{Eˇ}', txt) # E-caron
txt = re.sub(u'&#283;|&ecaron;|ě', r'{eˇ}', txt) # e-caron
txt = re.sub(u'&#313;|&Lacute;|Ĺ', r"{L'}", txt) # L-acute
txt = re.sub(u'&#314;|&lacute;|ĺ', r"{l'}", txt) # l-acute
txt = re.sub(u'&#317;|&Lcaron;|Ľ', r'{Lˇ}', txt) # L-caron
txt = re.sub(u'&#318;|&lcaron;|ľ', r'{lˇ}', txt) # l-caron
txt = re.sub(u'&#327;|&Ncaron;|Ň', r'{Nˇ}', txt) # N-caron
txt = re.sub(u'&#328;|&ncaron;|ň', r'{nˇ}', txt) # n-caron
txt = re.sub(u'&#338;|&OElig;|Œ', r'{OE}', txt) # OE
txt = re.sub(u'&#339;|&oelig;|œ', r'{oe}', txt) # oe
txt = re.sub(u'&#348;|&Scaron;|Ŝ', r'{S^}', txt) # Scaron
txt = re.sub(u'&#349;|&scaron;|ŝ', r'{s^}', txt) # scaron
txt = re.sub(u'&#340;|&Racute;|Ŕ', r"{R'}", txt) # R-acute
txt = re.sub(u'&#341;|&racute;|ŕ', r"{r'}", txt) # r-acute
txt = re.sub(u'&#344;|&Rcaron;|Ř', r'{Rˇ}', txt) # R-caron
txt = re.sub(u'&#345;|&rcaron;|ř', r'{rˇ}', txt) # r-caron
txt = re.sub(u'&#348;|Ŝ', r'{S^}', txt) # S-circumflex
txt = re.sub(u'&#349;|ŝ', r'{s^}', txt) # s-circumflex
txt = re.sub(u'&#352;|&Scaron;|Š', r'{Sˇ}', txt) # S-caron
txt = re.sub(u'&#353;|&scaron;|š', r'{sˇ}', txt) # s-caron
txt = re.sub(u'&#356;|&Tcaron;|Ť', r'{Tˇ}', txt) # T-caron
txt = re.sub(u'&#357;|&tcaron;|ť', r'{tˇ}', txt) # t-caron
txt = re.sub(u'&#366;|&Uring;|Ů', r'{U°}', txt) # U-ring
txt = re.sub(u'&#367;|&uring;|ů', r'{u°}', txt) # u-ring
txt = re.sub(u'&#381;|&Zcaron;|Ž', r'{Zˇ}', txt) # Z-caron
txt = re.sub(u'&#382;|&zcaron;|ž', r'{zˇ}', txt) # z-caron
txt = re.sub(u'&#8226;|&bull;|•', r'{*}', txt) # bullet
txt = re.sub(u'&#8355;|₣', r'{Fr}', txt) # Franc
txt = re.sub(u'&#8356;|₤', r'{L=}', txt) # Lira