mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix #822744 (Unwrap problem - some central european chars missing)
This commit is contained in:
commit
ca9048cdae
@ -343,6 +343,7 @@ class HTMLPreProcessor(object):
|
|||||||
(re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
|
(re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
|
||||||
(re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
|
(re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
|
||||||
(re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
|
(re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
|
||||||
|
|
||||||
# ` with letter before
|
# ` with letter before
|
||||||
(re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'),
|
(re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'),
|
||||||
(re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'),
|
(re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'),
|
||||||
@ -364,10 +365,14 @@ class HTMLPreProcessor(object):
|
|||||||
(re.compile(u'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'É'),
|
(re.compile(u'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'É'),
|
||||||
(re.compile(u'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'í'),
|
(re.compile(u'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'í'),
|
||||||
(re.compile(u'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Í'),
|
(re.compile(u'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Í'),
|
||||||
|
(re.compile(u'´\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: u'ĺ'),
|
||||||
|
(re.compile(u'´\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: u'Ĺ'),
|
||||||
(re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'),
|
(re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'),
|
||||||
(re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'),
|
(re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'),
|
||||||
(re.compile(u'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ń'),
|
(re.compile(u'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ń'),
|
||||||
(re.compile(u'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ń'),
|
(re.compile(u'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ń'),
|
||||||
|
(re.compile(u'´\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: u'ŕ'),
|
||||||
|
(re.compile(u'´\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: u'Ŕ'),
|
||||||
(re.compile(u'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'ś'),
|
(re.compile(u'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'ś'),
|
||||||
(re.compile(u'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Ś'),
|
(re.compile(u'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Ś'),
|
||||||
(re.compile(u'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ú'),
|
(re.compile(u'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ú'),
|
||||||
@ -401,6 +406,30 @@ class HTMLPreProcessor(object):
|
|||||||
(re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
|
(re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
|
||||||
(re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
|
(re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
|
||||||
|
|
||||||
|
# ˇ
|
||||||
|
(re.compile(u'ˇ\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'č'),
|
||||||
|
(re.compile(u'ˇ\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Č'),
|
||||||
|
(re.compile(u'ˇ\s*(<br.*?>)*\s*d', re.UNICODE), lambda match: u'ď'),
|
||||||
|
(re.compile(u'ˇ\s*(<br.*?>)*\s*D', re.UNICODE), lambda match: u'Ď'),
|
||||||
|
(re.compile(u'ˇ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ě'),
|
||||||
|
(re.compile(u'ˇ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ě'),
|
||||||
|
(re.compile(u'ˇ\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: u'ľ'),
|
||||||
|
(re.compile(u'ˇ\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: u'Ľ'),
|
||||||
|
(re.compile(u'ˇ\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ň'),
|
||||||
|
(re.compile(u'ˇ\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ň'),
|
||||||
|
(re.compile(u'ˇ\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: u'ř'),
|
||||||
|
(re.compile(u'ˇ\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: u'Ř'),
|
||||||
|
(re.compile(u'ˇ\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'š'),
|
||||||
|
(re.compile(u'ˇ\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Š'),
|
||||||
|
(re.compile(u'ˇ\s*(<br.*?>)*\s*t', re.UNICODE), lambda match: u'ť'),
|
||||||
|
(re.compile(u'ˇ\s*(<br.*?>)*\s*T', re.UNICODE), lambda match: u'Ť'),
|
||||||
|
(re.compile(u'ˇ\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ž'),
|
||||||
|
(re.compile(u'ˇ\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ž'),
|
||||||
|
|
||||||
|
# °
|
||||||
|
(re.compile(u'°\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ů'),
|
||||||
|
(re.compile(u'°\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ů'),
|
||||||
|
|
||||||
# If pdf printed from a browser then the header/footer has a reliable pattern
|
# If pdf printed from a browser then the header/footer has a reliable pattern
|
||||||
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
|
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
|
||||||
|
|
||||||
@ -510,7 +539,7 @@ class HTMLPreProcessor(object):
|
|||||||
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
|
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
|
||||||
end_rules.append(
|
end_rules.append(
|
||||||
# Unwrap using punctuation
|
# Unwrap using punctuation
|
||||||
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
(re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||||
)
|
)
|
||||||
|
|
||||||
for rule in self.PREPROCESS + start_rules:
|
for rule in self.PREPROCESS + start_rules:
|
||||||
|
@ -315,9 +315,11 @@ class HeuristicProcessor(object):
|
|||||||
supports a range of html markup and text files
|
supports a range of html markup and text files
|
||||||
'''
|
'''
|
||||||
# define the pieces of the regex
|
# define the pieces of the regex
|
||||||
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßôľščťžňďěřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
|
|
||||||
|
lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
|
||||||
em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
|
em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
|
||||||
soft_hyphen = u"\xad"
|
soft_hyphen = u"\xad"
|
||||||
|
dash = u"\x2d" # some OCR engines don't convert dashes to hyphens
|
||||||
line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
|
line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
|
||||||
blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
|
blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
|
||||||
line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
|
line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
|
||||||
@ -326,19 +328,23 @@ class HeuristicProcessor(object):
|
|||||||
unwrap_regex = lookahead+line_ending+blanklines+line_opening
|
unwrap_regex = lookahead+line_ending+blanklines+line_opening
|
||||||
em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
|
em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
|
||||||
shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
|
shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
|
||||||
|
dash_unwrap_regex = dash+line_ending+blanklines+line_opening
|
||||||
|
|
||||||
if format == 'txt':
|
if format == 'txt':
|
||||||
unwrap_regex = lookahead+txt_line_wrap
|
unwrap_regex = lookahead+txt_line_wrap
|
||||||
em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
|
em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
|
||||||
shy_unwrap_regex = soft_hyphen+txt_line_wrap
|
shy_unwrap_regex = soft_hyphen+txt_line_wrap
|
||||||
|
dash_unwrap_regex = dash+txt_line_wrap
|
||||||
|
|
||||||
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
|
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
|
||||||
em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
|
em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
|
||||||
shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
|
shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
|
||||||
|
dash_unwrap = re.compile(u"%s" % dash_unwrap_regex, re.UNICODE)
|
||||||
|
|
||||||
content = unwrap.sub(' ', content)
|
content = unwrap.sub(' ', content)
|
||||||
content = em_en_unwrap.sub('', content)
|
content = em_en_unwrap.sub('', content)
|
||||||
content = shy_unwrap.sub('', content)
|
content = shy_unwrap.sub('', content)
|
||||||
|
content = dash_unwrap.sub('', content)
|
||||||
return content
|
return content
|
||||||
|
|
||||||
def txt_process(self, match):
|
def txt_process(self, match):
|
||||||
|
@ -196,10 +196,40 @@ class Textile(object):
|
|||||||
(re.compile(r'{(u\"|\"u)}'), r'ü'), # u-diaeresis
|
(re.compile(r'{(u\"|\"u)}'), r'ü'), # u-diaeresis
|
||||||
(re.compile(r'{(y\'|\'y)}'), r'ý'), # y-acute
|
(re.compile(r'{(y\'|\'y)}'), r'ý'), # y-acute
|
||||||
(re.compile(r'{(y\"|\"y)}'), r'ÿ'), # y-diaeresis
|
(re.compile(r'{(y\"|\"y)}'), r'ÿ'), # y-diaeresis
|
||||||
|
|
||||||
|
(re.compile(r'{(C\ˇ|\ˇC)}'), r'Č'), # C-caron
|
||||||
|
(re.compile(r'{(c\ˇ|\ˇc)}'), r'č'), # c-caron
|
||||||
|
(re.compile(r'{(D\ˇ|\ˇD)}'), r'Ď'), # D-caron
|
||||||
|
(re.compile(r'{(d\ˇ|\ˇd)}'), r'ď'), # d-caron
|
||||||
|
(re.compile(r'{(E\ˇ|\ˇE)}'), r'Ě'), # E-caron
|
||||||
|
(re.compile(r'{(e\ˇ|\ˇe)}'), r'ě'), # e-caron
|
||||||
|
(re.compile(r'{(L\'|\'L)}'), r'Ĺ'), # L-acute
|
||||||
|
(re.compile(r'{(l\'|\'l)}'), r'ĺ'), # l-acute
|
||||||
|
(re.compile(r'{(L\ˇ|\ˇL)}'), r'Ľ'), # L-caron
|
||||||
|
(re.compile(r'{(l\ˇ|\ˇl)}'), r'ľ'), # l-caron
|
||||||
|
(re.compile(r'{(N\ˇ|\ˇN)}'), r'Ň'), # N-caron
|
||||||
|
(re.compile(r'{(n\ˇ|\ˇn)}'), r'ň'), # n-caron
|
||||||
|
|
||||||
(re.compile(r'{OE}'), r'Œ'), # OE
|
(re.compile(r'{OE}'), r'Œ'), # OE
|
||||||
(re.compile(r'{oe}'), r'œ'), # oe
|
(re.compile(r'{oe}'), r'œ'), # oe
|
||||||
(re.compile(r'{(S\^|\^S)}'), r'Š'), # Scaron
|
|
||||||
(re.compile(r'{(s\^|\^s)}'), r'š'), # scaron
|
(re.compile(r'{(R\'|\'R)}'), r'Ŕ'), # R-acute
|
||||||
|
(re.compile(r'{(r\'|\'r)}'), r'ŕ'), # r-acute
|
||||||
|
(re.compile(r'{(R\ˇ|\ˇR)}'), r'Ř'), # R-caron
|
||||||
|
(re.compile(r'{(r\ˇ|\ˇr)}'), r'ř'), # r-caron
|
||||||
|
|
||||||
|
(re.compile(r'{(S\^|\^S)}'), r'Ŝ'), # S-circumflex
|
||||||
|
(re.compile(r'{(s\^|\^s)}'), r'ŝ'), # s-circumflex
|
||||||
|
|
||||||
|
(re.compile(r'{(S\ˇ|\ˇS)}'), r'Š'), # S-caron
|
||||||
|
(re.compile(r'{(s\ˇ|\ˇs)}'), r'š'), # s-caron
|
||||||
|
(re.compile(r'{(T\ˇ|\ˇT)}'), r'Ť'), # T-caron
|
||||||
|
(re.compile(r'{(t\ˇ|\ˇt)}'), r'ť'), # t-caron
|
||||||
|
(re.compile(r'{(U\°|\°U)}'), r'Ů'), # U-ring
|
||||||
|
(re.compile(r'{(u\°|\°u)}'), r'ů'), # u-ring
|
||||||
|
(re.compile(r'{(Z\ˇ|\ˇZ)}'), r'Ž'), # Z-caron
|
||||||
|
(re.compile(r'{(z\ˇ|\ˇz)}'), r'ž'), # z-caron
|
||||||
|
|
||||||
(re.compile(r'{\*}'), r'•'), # bullet
|
(re.compile(r'{\*}'), r'•'), # bullet
|
||||||
(re.compile(r'{Fr}'), r'₣'), # Franc
|
(re.compile(r'{Fr}'), r'₣'), # Franc
|
||||||
(re.compile(r'{(L=|=L)}'), r'₤'), # Lira
|
(re.compile(r'{(L=|=L)}'), r'₤'), # Lira
|
||||||
|
@ -85,10 +85,38 @@ def unsmarten(txt):
|
|||||||
txt = re.sub(u'ü|ü|ü', r'{u"}', txt) # u-umlaut
|
txt = re.sub(u'ü|ü|ü', r'{u"}', txt) # u-umlaut
|
||||||
txt = re.sub(u'ý|ý|ý', r"{y'}", txt) # y-acute
|
txt = re.sub(u'ý|ý|ý', r"{y'}", txt) # y-acute
|
||||||
txt = re.sub(u'ÿ|ÿ|ÿ', r'{y"}', txt) # y-umlaut
|
txt = re.sub(u'ÿ|ÿ|ÿ', r'{y"}', txt) # y-umlaut
|
||||||
|
|
||||||
|
txt = re.sub(u'Č|Č|Č', r'{Cˇ}', txt) # C-caron
|
||||||
|
txt = re.sub(u'č|č|č', r'{cˇ}', txt) # c-caron
|
||||||
|
txt = re.sub(u'Ď|Ď|Ď', r'{Dˇ}', txt) # D-caron
|
||||||
|
txt = re.sub(u'ď|ď|ď', r'{dˇ}', txt) # d-caron
|
||||||
|
txt = re.sub(u'Ě|Ě|Ě', r'{Eˇ}', txt) # E-caron
|
||||||
|
txt = re.sub(u'ě|ě|ě', r'{eˇ}', txt) # e-caron
|
||||||
|
txt = re.sub(u'Ĺ|Ĺ|Ĺ', r"{L'}", txt) # L-acute
|
||||||
|
txt = re.sub(u'ĺ|ĺ|ĺ', r"{l'}", txt) # l-acute
|
||||||
|
txt = re.sub(u'Ľ|Ľ|Ľ', r'{Lˇ}', txt) # L-caron
|
||||||
|
txt = re.sub(u'ľ|ľ|ľ', r'{lˇ}', txt) # l-caron
|
||||||
|
txt = re.sub(u'Ň|Ň|Ň', r'{Nˇ}', txt) # N-caron
|
||||||
|
txt = re.sub(u'ň|ň|ň', r'{nˇ}', txt) # n-caron
|
||||||
|
|
||||||
txt = re.sub(u'Œ|Œ|Œ', r'{OE}', txt) # OE
|
txt = re.sub(u'Œ|Œ|Œ', r'{OE}', txt) # OE
|
||||||
txt = re.sub(u'œ|œ|œ', r'{oe}', txt) # oe
|
txt = re.sub(u'œ|œ|œ', r'{oe}', txt) # oe
|
||||||
txt = re.sub(u'Ŝ|Š|Ŝ', r'{S^}', txt) # Scaron
|
|
||||||
txt = re.sub(u'ŝ|š|ŝ', r'{s^}', txt) # scaron
|
txt = re.sub(u'Ŕ|Ŕ|Ŕ', r"{R'}", txt) # R-acute
|
||||||
|
txt = re.sub(u'ŕ|ŕ|ŕ', r"{r'}", txt) # r-acute
|
||||||
|
txt = re.sub(u'Ř|Ř|Ř', r'{Rˇ}', txt) # R-caron
|
||||||
|
txt = re.sub(u'ř|ř|ř', r'{rˇ}', txt) # r-caron
|
||||||
|
txt = re.sub(u'Ŝ|Ŝ', r'{S^}', txt) # S-circumflex
|
||||||
|
txt = re.sub(u'ŝ|ŝ', r'{s^}', txt) # s-circumflex
|
||||||
|
txt = re.sub(u'Š|Š|Š', r'{Sˇ}', txt) # S-caron
|
||||||
|
txt = re.sub(u'š|š|š', r'{sˇ}', txt) # s-caron
|
||||||
|
txt = re.sub(u'Ť|Ť|Ť', r'{Tˇ}', txt) # T-caron
|
||||||
|
txt = re.sub(u'ť|ť|ť', r'{tˇ}', txt) # t-caron
|
||||||
|
txt = re.sub(u'Ů|Ů|Ů', r'{U°}', txt) # U-ring
|
||||||
|
txt = re.sub(u'ů|ů|ů', r'{u°}', txt) # u-ring
|
||||||
|
txt = re.sub(u'Ž|Ž|Ž', r'{Zˇ}', txt) # Z-caron
|
||||||
|
txt = re.sub(u'ž|ž|ž', r'{zˇ}', txt) # z-caron
|
||||||
|
|
||||||
txt = re.sub(u'•|•|•', r'{*}', txt) # bullet
|
txt = re.sub(u'•|•|•', r'{*}', txt) # bullet
|
||||||
txt = re.sub(u'₣|₣', r'{Fr}', txt) # Franc
|
txt = re.sub(u'₣|₣', r'{Fr}', txt) # Franc
|
||||||
txt = re.sub(u'₤|₤', r'{L=}', txt) # Lira
|
txt = re.sub(u'₤|₤', r'{L=}', txt) # Lira
|
||||||
|
Loading…
x
Reference in New Issue
Block a user