Fix #822744 (Unwrap problem - some Central European chars missing)

2025-07-09 03:04:10 -04:00 · 2011-08-25 16:53:57 -06:00 · 2011-08-25 16:53:57 -06:00 · ca9048cdae
commit ca9048cdae
parent 405267542b 88737c88d4
4 changed files with 113 additions and 20 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -343,6 +343,7 @@ class HTMLPreProcessor(object):
                  (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
                  (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
                  (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
                  # ` with letter before
                  (re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'),
                  (re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'),
@ -364,10 +365,14 @@ class HTMLPreProcessor(object):
                  (re.compile(u'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'É'),
                  (re.compile(u'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'í'),
                  (re.compile(u'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Í'),
                  (re.compile(u'´\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: u'ĺ'),
                  (re.compile(u'´\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: u'Ĺ'),
                  (re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'),
                  (re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'),
                  (re.compile(u'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ń'),
                  (re.compile(u'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ń'),
                  (re.compile(u'´\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: u'ŕ'),
                  (re.compile(u'´\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: u'Ŕ'),
                  (re.compile(u'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'ś'),
                  (re.compile(u'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Ś'),
                  (re.compile(u'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ú'),
@ -401,6 +406,30 @@ class HTMLPreProcessor(object):
                  (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
                  (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
                  # ˇ
                  (re.compile(u'ˇ\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'č'),
                  (re.compile(u'ˇ\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Č'),
                  (re.compile(u'ˇ\s*(<br.*?>)*\s*d', re.UNICODE), lambda match: u'ď'),
                  (re.compile(u'ˇ\s*(<br.*?>)*\s*D', re.UNICODE), lambda match: u'Ď'),
                  (re.compile(u'ˇ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ě'),
                  (re.compile(u'ˇ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ě'),
                  (re.compile(u'ˇ\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: u'ľ'),
                  (re.compile(u'ˇ\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: u'Ľ'),
                  (re.compile(u'ˇ\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ň'),
                  (re.compile(u'ˇ\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ň'),
                  (re.compile(u'ˇ\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: u'ř'),
                  (re.compile(u'ˇ\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: u'Ř'),
                  (re.compile(u'ˇ\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'š'),
                  (re.compile(u'ˇ\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Š'),
                  (re.compile(u'ˇ\s*(<br.*?>)*\s*t', re.UNICODE), lambda match: u'ť'),
                  (re.compile(u'ˇ\s*(<br.*?>)*\s*T', re.UNICODE), lambda match: u'Ť'),
                  (re.compile(u'ˇ\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ž'),
                  (re.compile(u'ˇ\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ž'),
                  # °
                  (re.compile(u'°\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ů'),
                  (re.compile(u'°\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ů'),
                  # If pdf printed from a browser then the header/footer has a reliable pattern
                  (re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
@ -510,7 +539,7 @@ class HTMLPreProcessor(object):
                end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                end_rules.append(
                    # Unwrap using punctuation
-                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                )
        for rule in self.PREPROCESS + start_rules:
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -315,9 +315,11 @@ class HeuristicProcessor(object):
        supports a range of html markup and text files
        '''
        # define the pieces of the regex
-        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßôľščťžňďěřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
+                      
        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
        em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
        soft_hyphen = u"\xad"
        dash = u"\x2d" # some OCRs don't convert dashes to hyphens
        line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
        blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
        line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
@ -326,19 +328,23 @@ class HeuristicProcessor(object):
        unwrap_regex = lookahead+line_ending+blanklines+line_opening
        em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
        shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
        dash_unwrap_regex = dash+line_ending+blanklines+line_opening
        if format == 'txt':
            unwrap_regex = lookahead+txt_line_wrap
            em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
            shy_unwrap_regex = soft_hyphen+txt_line_wrap
            dash_unwrap_regex = dash+txt_line_wrap
        unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
        em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
        shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
        dash_unwrap = re.compile(u"%s" % dash_unwrap_regex, re.UNICODE)
        content = unwrap.sub(' ', content)
        content = em_en_unwrap.sub('', content)
        content = shy_unwrap.sub('', content)
        content = dash_unwrap.sub('', content)
        return content
    def txt_process(self, match):
--- a/src/calibre/ebooks/textile/functions.py
+++ b/src/calibre/ebooks/textile/functions.py
@ -196,10 +196,40 @@ class Textile(object):
        (re.compile(r'{(u\"|\"u)}'),     r'&#252;'),   #  u-diaeresis
        (re.compile(r'{(y\'|\'y)}'),     r'&#253;'),   #  y-acute
        (re.compile(r'{(y\"|\"y)}'),     r'&#255;'),   #  y-diaeresis
        (re.compile(r'{(C\ˇ|\ˇC)}'),     r'&#268;'),   #  C-caron
        (re.compile(r'{(c\ˇ|\ˇc)}'),     r'&#269;'),   #  c-caron
        (re.compile(r'{(D\ˇ|\ˇD)}'),     r'&#270;'),   #  D-caron
        (re.compile(r'{(d\ˇ|\ˇd)}'),     r'&#271;'),   #  d-caron
        (re.compile(r'{(E\ˇ|\ˇE)}'),     r'&#282;'),   #  E-caron
        (re.compile(r'{(e\ˇ|\ˇe)}'),     r'&#283;'),   #  e-caron
        (re.compile(r'{(L\'|\'L)}'),     r'&#313;'),   #  L-acute
        (re.compile(r'{(l\'|\'l)}'),     r'&#314;'),   #  l-acute
        (re.compile(r'{(L\ˇ|\ˇL)}'),     r'&#317;'),   #  L-caron
        (re.compile(r'{(l\ˇ|\ˇl)}'),     r'&#318;'),   #  l-caron
        (re.compile(r'{(N\ˇ|\ˇN)}'),     r'&#327;'),   #  N-caron
        (re.compile(r'{(n\ˇ|\ˇn)}'),     r'&#328;'),   #  n-caron
        (re.compile(r'{OE}'),            r'&#338;'),   #  OE
        (re.compile(r'{oe}'),            r'&#339;'),   #  oe
-        (re.compile(r'{(S\^|\^S)}'),     r'&Scaron;'), #  Scaron
+
-        (re.compile(r'{(s\^|\^s)}'),     r'&scaron;'), #  scaron
+        (re.compile(r'{(R\'|\'R)}'),     r'&#340;'),   #  R-acute
        (re.compile(r'{(r\'|\'r)}'),     r'&#341;'),   #  r-acute
        (re.compile(r'{(R\ˇ|\ˇR)}'),     r'&#344;'),   #  R-caron
        (re.compile(r'{(r\ˇ|\ˇr)}'),     r'&#345;'),   #  r-caron
        (re.compile(r'{(S\^|\^S)}'),     r'&#348;'),   #  S-circumflex
        (re.compile(r'{(s\^|\^s)}'),     r'&#349;'),   #  s-circumflex
        (re.compile(r'{(S\ˇ|\ˇS)}'),     r'&#352;'),   #  S-caron
        (re.compile(r'{(s\ˇ|\ˇs)}'),     r'&#353;'),   #  s-caron
        (re.compile(r'{(T\ˇ|\ˇT)}'),     r'&#356;'),   #  T-caron
        (re.compile(r'{(t\ˇ|\ˇt)}'),     r'&#357;'),   #  t-caron
        (re.compile(r'{(U\°|\°U)}'),     r'&#366;'),   #  U-ring
        (re.compile(r'{(u\°|\°u)}'),     r'&#367;'),   #  u-ring
        (re.compile(r'{(Z\ˇ|\ˇZ)}'),     r'&#381;'),   #  Z-caron
        (re.compile(r'{(z\ˇ|\ˇz)}'),     r'&#382;'),   #  z-caron
        (re.compile(r'{\*}'),            r'&#8226;'),  #  bullet
        (re.compile(r'{Fr}'),            r'&#8355;'),  #  Franc
        (re.compile(r'{(L=|=L)}'),       r'&#8356;'),  #  Lira
--- a/src/calibre/ebooks/txt/unsmarten.py
+++ b/src/calibre/ebooks/txt/unsmarten.py
@ -85,10 +85,38 @@ def unsmarten(txt):
    txt = re.sub(u'&#252;|&uuml;|ü',     r'{u"}',  txt)  # u-umlaut
    txt = re.sub(u'&#253;|&yacute;|ý',   r"{y'}",  txt)  # y-acute
    txt = re.sub(u'&#255;|&yuml;|ÿ',     r'{y"}',  txt)  # y-umlaut
    txt = re.sub(u'&#268;|&Ccaron;|Č',   r'{Cˇ}',  txt)  # C-caron
    txt = re.sub(u'&#269;|&ccaron;|č',   r'{cˇ}',  txt)  # c-caron
    txt = re.sub(u'&#270;|&Dcaron;|Ď',   r'{Dˇ}',  txt)  # D-caron
    txt = re.sub(u'&#271;|&dcaron;|ď',   r'{dˇ}',  txt)  # d-caron
    txt = re.sub(u'&#282;|&Ecaron;|Ě',   r'{Eˇ}',  txt)  # E-caron
    txt = re.sub(u'&#283;|&ecaron;|ě',   r'{eˇ}',  txt)  # e-caron
    txt = re.sub(u'&#313;|&Lacute;|Ĺ',   r"{L'}",  txt)  # L-acute
    txt = re.sub(u'&#314;|&lacute;|ĺ',   r"{l'}",  txt)  # l-acute
    txt = re.sub(u'&#317;|&Lcaron;|Ľ',   r'{Lˇ}',  txt)  # L-caron
    txt = re.sub(u'&#318;|&lcaron;|ľ',   r'{lˇ}',  txt)  # l-caron
    txt = re.sub(u'&#327;|&Ncaron;|Ň',   r'{Nˇ}',  txt)  # N-caron
    txt = re.sub(u'&#328;|&ncaron;|ň',   r'{nˇ}',  txt)  # n-caron
    txt = re.sub(u'&#338;|&OElig;|Œ',    r'{OE}',  txt)  # OE
    txt = re.sub(u'&#339;|&oelig;|œ',    r'{oe}',  txt)  # oe
-    txt = re.sub(u'&#348;|&Scaron;|Ŝ',   r'{S^}', txt)  # Scaron
+
-    txt = re.sub(u'&#349;|&scaron;|ŝ',   r'{s^}', txt)  # scaron
+    txt = re.sub(u'&#340;|&Racute;|Ŕ',   r"{R'}",  txt)  # R-acute
    txt = re.sub(u'&#341;|&racute;|ŕ',   r"{r'}",  txt)  # r-acute
    txt = re.sub(u'&#344;|&Rcaron;|Ř',   r'{Rˇ}',  txt)  # R-caron
    txt = re.sub(u'&#345;|&rcaron;|ř',   r'{rˇ}',  txt)  # r-caron
    txt = re.sub(u'&#348;|Ŝ',            r'{S^}',  txt)  # S-circumflex
    txt = re.sub(u'&#349;|ŝ',            r'{s^}',  txt)  # s-circumflex
    txt = re.sub(u'&#352;|&Scaron;|Š',   r'{Sˇ}',  txt)  # S-caron
    txt = re.sub(u'&#353;|&scaron;|š',   r'{sˇ}',  txt)  # s-caron
    txt = re.sub(u'&#356;|&Tcaron;|Ť',   r'{Tˇ}',  txt)  # T-caron
    txt = re.sub(u'&#357;|&tcaron;|ť',   r'{tˇ}',  txt)  # t-caron
    txt = re.sub(u'&#366;|&Uring;|Ů',    r'{U°}',  txt)  # U-ring
    txt = re.sub(u'&#367;|&uring;|ů',    r'{u°}',  txt)  # u-ring
    txt = re.sub(u'&#381;|&Zcaron;|Ž',   r'{Zˇ}',  txt)  # Z-caron
    txt = re.sub(u'&#382;|&zcaron;|ž',   r'{zˇ}',  txt)  # z-caron
    txt = re.sub(u'&#8226;|&bull;|•',    r'{*}',   txt)  # bullet
    txt = re.sub(u'&#8355;|₣',           r'{Fr}',  txt)  # Franc
    txt = re.sub(u'&#8356;|₤',           r'{L=}',  txt)  # Lira