Fix #822744 (Unwrap problem - some Central European chars missing)

This commit is contained in:
Kovid Goyal 2011-08-25 16:53:57 -06:00
commit ca9048cdae
4 changed files with 113 additions and 20 deletions

View File

@ -343,6 +343,7 @@ class HTMLPreProcessor(object):
(re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'), (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
(re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'), (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
(re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'), (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
# ` with letter before # ` with letter before
(re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'), (re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'),
(re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'), (re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'),
@ -364,10 +365,14 @@ class HTMLPreProcessor(object):
(re.compile(u'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'É'), (re.compile(u'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'É'),
(re.compile(u'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'í'), (re.compile(u'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'í'),
(re.compile(u'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Í'), (re.compile(u'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Í'),
(re.compile(u'´\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: u'ĺ'),
(re.compile(u'´\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: u'Ĺ'),
(re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'), (re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'),
(re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'), (re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'),
(re.compile(u'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ń'), (re.compile(u'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ń'),
(re.compile(u'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ń'), (re.compile(u'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ń'),
(re.compile(u'´\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: u'ŕ'),
(re.compile(u'´\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: u'Ŕ'),
(re.compile(u'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'ś'), (re.compile(u'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'ś'),
(re.compile(u'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Ś'), (re.compile(u'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Ś'),
(re.compile(u'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ú'), (re.compile(u'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ú'),
@ -401,6 +406,30 @@ class HTMLPreProcessor(object):
(re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'), (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
(re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'), (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
# ˇ
(re.compile(u'ˇ\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'č'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Č'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*d', re.UNICODE), lambda match: u'ď'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*D', re.UNICODE), lambda match: u'Ď'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ě'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ě'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: u'ľ'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: u'Ľ'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ň'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ň'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: u'ř'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: u'Ř'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'š'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Š'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*t', re.UNICODE), lambda match: u'ť'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*T', re.UNICODE), lambda match: u'Ť'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ž'),
(re.compile(u'ˇ\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ž'),
# °
(re.compile(u'°\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ů'),
(re.compile(u'°\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ů'),
# If pdf printed from a browser then the header/footer has a reliable pattern # If pdf printed from a browser then the header/footer has a reliable pattern
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), (re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
@ -510,7 +539,7 @@ class HTMLPreProcessor(object):
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: '')) end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
end_rules.append( end_rules.append(
# Unwrap using punctuation # Unwrap using punctuation
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines), (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
) )
for rule in self.PREPROCESS + start_rules: for rule in self.PREPROCESS + start_rules:

View File

@ -315,9 +315,11 @@ class HeuristicProcessor(object):
supports a range of html markup and text files supports a range of html markup and text files
''' '''
# define the pieces of the regex # define the pieces of the regex
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßôľščťžňďěřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])" em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
soft_hyphen = u"\xad" soft_hyphen = u"\xad"
dash = u"\x2d" # some OCR programs don't convert dashes to hyphens
line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?" line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*" blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*" line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
@ -326,19 +328,23 @@ class HeuristicProcessor(object):
unwrap_regex = lookahead+line_ending+blanklines+line_opening unwrap_regex = lookahead+line_ending+blanklines+line_opening
em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
dash_unwrap_regex = dash+line_ending+blanklines+line_opening
if format == 'txt': if format == 'txt':
unwrap_regex = lookahead+txt_line_wrap unwrap_regex = lookahead+txt_line_wrap
em_en_unwrap_regex = em_en_lookahead+txt_line_wrap em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
shy_unwrap_regex = soft_hyphen+txt_line_wrap shy_unwrap_regex = soft_hyphen+txt_line_wrap
dash_unwrap_regex = dash+txt_line_wrap
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE) unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE) em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE) shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
dash_unwrap = re.compile(u"%s" % dash_unwrap_regex, re.UNICODE)
content = unwrap.sub(' ', content) content = unwrap.sub(' ', content)
content = em_en_unwrap.sub('', content) content = em_en_unwrap.sub('', content)
content = shy_unwrap.sub('', content) content = shy_unwrap.sub('', content)
content = dash_unwrap.sub('', content)
return content return content
def txt_process(self, match): def txt_process(self, match):

View File

@ -196,10 +196,40 @@ class Textile(object):
(re.compile(r'{(u\"|\"u)}'), r'&#252;'), # u-diaeresis (re.compile(r'{(u\"|\"u)}'), r'&#252;'), # u-diaeresis
(re.compile(r'{(y\'|\'y)}'), r'&#253;'), # y-acute (re.compile(r'{(y\'|\'y)}'), r'&#253;'), # y-acute
(re.compile(r'{(y\"|\"y)}'), r'&#255;'), # y-diaeresis (re.compile(r'{(y\"|\"y)}'), r'&#255;'), # y-diaeresis
(re.compile(r'{(C\ˇ|\ˇC)}'), r'&#268;'), # C-caron
(re.compile(r'{(c\ˇ|\ˇc)}'), r'&#269;'), # c-caron
(re.compile(r'{(D\ˇ|\ˇD)}'), r'&#270;'), # D-caron
(re.compile(r'{(d\ˇ|\ˇd)}'), r'&#271;'), # d-caron
(re.compile(r'{(E\ˇ|\ˇE)}'), r'&#282;'), # E-caron
(re.compile(r'{(e\ˇ|\ˇe)}'), r'&#283;'), # e-caron
(re.compile(r'{(L\'|\'L)}'), r'&#313;'), # L-acute
(re.compile(r'{(l\'|\'l)}'), r'&#314;'), # l-acute
(re.compile(r'{(L\ˇ|\ˇL)}'), r'&#317;'), # L-caron
(re.compile(r'{(l\ˇ|\ˇl)}'), r'&#318;'), # l-caron
(re.compile(r'{(N\ˇ|\ˇN)}'), r'&#327;'), # N-caron
(re.compile(r'{(n\ˇ|\ˇn)}'), r'&#328;'), # n-caron
(re.compile(r'{OE}'), r'&#338;'), # OE (re.compile(r'{OE}'), r'&#338;'), # OE
(re.compile(r'{oe}'), r'&#339;'), # oe (re.compile(r'{oe}'), r'&#339;'), # oe
(re.compile(r'{(S\^|\^S)}'), r'&Scaron;'), # Scaron
(re.compile(r'{(s\^|\^s)}'), r'&scaron;'), # scaron (re.compile(r'{(R\'|\'R)}'), r'&#340;'), # R-acute
(re.compile(r'{(r\'|\'r)}'), r'&#341;'), # r-acute
(re.compile(r'{(R\ˇ|\ˇR)}'), r'&#344;'), # R-caron
(re.compile(r'{(r\ˇ|\ˇr)}'), r'&#345;'), # r-caron
(re.compile(r'{(S\^|\^S)}'), r'&#348;'), # S-circumflex
(re.compile(r'{(s\^|\^s)}'), r'&#349;'), # s-circumflex
(re.compile(r'{(S\ˇ|\ˇS)}'), r'&#352;'), # S-caron
(re.compile(r'{(s\ˇ|\ˇs)}'), r'&#353;'), # s-caron
(re.compile(r'{(T\ˇ|\ˇT)}'), r'&#356;'), # T-caron
(re.compile(r'{(t\ˇ|\ˇt)}'), r'&#357;'), # t-caron
(re.compile(r'{(U\°|\°U)}'), r'&#366;'), # U-ring
(re.compile(r'{(u\°|\°u)}'), r'&#367;'), # u-ring
(re.compile(r'{(Z\ˇ|\ˇZ)}'), r'&#381;'), # Z-caron
(re.compile(r'{(z\ˇ|\ˇz)}'), r'&#382;'), # z-caron
(re.compile(r'{\*}'), r'&#8226;'), # bullet (re.compile(r'{\*}'), r'&#8226;'), # bullet
(re.compile(r'{Fr}'), r'&#8355;'), # Franc (re.compile(r'{Fr}'), r'&#8355;'), # Franc
(re.compile(r'{(L=|=L)}'), r'&#8356;'), # Lira (re.compile(r'{(L=|=L)}'), r'&#8356;'), # Lira

View File

@ -85,10 +85,38 @@ def unsmarten(txt):
txt = re.sub(u'&#252;|&uuml;|ü', r'{u"}', txt) # u-umlaut txt = re.sub(u'&#252;|&uuml;|ü', r'{u"}', txt) # u-umlaut
txt = re.sub(u'&#253;|&yacute;|ý', r"{y'}", txt) # y-acute txt = re.sub(u'&#253;|&yacute;|ý', r"{y'}", txt) # y-acute
txt = re.sub(u'&#255;|&yuml;|ÿ', r'{y"}', txt) # y-umlaut txt = re.sub(u'&#255;|&yuml;|ÿ', r'{y"}', txt) # y-umlaut
txt = re.sub(u'&#268;|&Ccaron;|Č', r'{Cˇ}', txt) # C-caron
txt = re.sub(u'&#269;|&ccaron;|č', r'{cˇ}', txt) # c-caron
txt = re.sub(u'&#270;|&Dcaron;|Ď', r'{Dˇ}', txt) # D-caron
txt = re.sub(u'&#271;|&dcaron;|ď', r'{dˇ}', txt) # d-caron
txt = re.sub(u'&#282;|&Ecaron;|Ě', r'{Eˇ}', txt) # E-caron
txt = re.sub(u'&#283;|&ecaron;|ě', r'{eˇ}', txt) # e-caron
txt = re.sub(u'&#313;|&Lacute;|Ĺ', r"{L'}", txt) # L-acute
txt = re.sub(u'&#314;|&lacute;|ĺ', r"{l'}", txt) # l-acute
txt = re.sub(u'&#317;|&Lcaron;|Ľ', r'{Lˇ}', txt) # L-caron
txt = re.sub(u'&#318;|&lcaron;|ľ', r'{lˇ}', txt) # l-caron
txt = re.sub(u'&#327;|&Ncaron;|Ň', r'{Nˇ}', txt) # N-caron
txt = re.sub(u'&#328;|&ncaron;|ň', r'{nˇ}', txt) # n-caron
txt = re.sub(u'&#338;|&OElig;|Œ', r'{OE}', txt) # OE txt = re.sub(u'&#338;|&OElig;|Œ', r'{OE}', txt) # OE
txt = re.sub(u'&#339;|&oelig;|œ', r'{oe}', txt) # oe txt = re.sub(u'&#339;|&oelig;|œ', r'{oe}', txt) # oe
txt = re.sub(u'&#348;|&Scaron;|Ŝ', r'{S^}', txt) # Scaron
txt = re.sub(u'&#349;|&scaron;|ŝ', r'{s^}', txt) # scaron txt = re.sub(u'&#340;|&Racute;|Ŕ', r"{R'}", txt) # R-acute
txt = re.sub(u'&#341;|&racute;|ŕ', r"{r'}", txt) # r-acute
txt = re.sub(u'&#344;|&Rcaron;|Ř', r'{Rˇ}', txt) # R-caron
txt = re.sub(u'&#345;|&rcaron;|ř', r'{rˇ}', txt) # r-caron
txt = re.sub(u'&#348;|Ŝ', r'{S^}', txt) # S-circumflex
txt = re.sub(u'&#349;|ŝ', r'{s^}', txt) # s-circumflex
txt = re.sub(u'&#352;|&Scaron;|Š', r'{Sˇ}', txt) # S-caron
txt = re.sub(u'&#353;|&scaron;|š', r'{sˇ}', txt) # s-caron
txt = re.sub(u'&#356;|&Tcaron;|Ť', r'{Tˇ}', txt) # T-caron
txt = re.sub(u'&#357;|&tcaron;|ť', r'{tˇ}', txt) # t-caron
txt = re.sub(u'&#366;|&Uring;|Ů', r'{U°}', txt) # U-ring
txt = re.sub(u'&#367;|&uring;|ů', r'{u°}', txt) # u-ring
txt = re.sub(u'&#381;|&Zcaron;|Ž', r'{Zˇ}', txt) # Z-caron
txt = re.sub(u'&#382;|&zcaron;|ž', r'{zˇ}', txt) # z-caron
txt = re.sub(u'&#8226;|&bull;|•', r'{*}', txt) # bullet txt = re.sub(u'&#8226;|&bull;|•', r'{*}', txt) # bullet
txt = re.sub(u'&#8355;|₣', r'{Fr}', txt) # Franc txt = re.sub(u'&#8355;|₣', r'{Fr}', txt) # Franc
txt = re.sub(u'&#8356;|₤', r'{L=}', txt) # Lira txt = re.sub(u'&#8356;|₤', r'{L=}', txt) # Lira