Fix #822744 (Unwrap problem - some Central European chars missing)

2026-05-28 17:52:34 -04:00 · 2011-08-25 16:53:57 -06:00
parent 405267542b 88737c88d4
commit ca9048cdae
4 changed files with 113 additions and 20 deletions
@@ -343,6 +343,7 @@ class HTMLPreProcessor(object):
                  (re.compile(u'`\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ò'),
                  (re.compile(u'`\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ù'),
                  (re.compile(u'`\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ù'),
+                  
                  # ` with letter before
                  (re.compile(u'a\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'à'),
                  (re.compile(u'A\s*(<br.*?>)*\s*`', re.UNICODE), lambda match: u'À'),
@@ -364,10 +365,14 @@ class HTMLPreProcessor(object):
                  (re.compile(u'´\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'É'),
                  (re.compile(u'´\s*(<br.*?>)*\s*i', re.UNICODE), lambda match: u'í'),
                  (re.compile(u'´\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Í'),
+                  (re.compile(u'´\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: u'ĺ'),
+                  (re.compile(u'´\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: u'Ĺ'),
                  (re.compile(u'´\s*(<br.*?>)*\s*o', re.UNICODE), lambda match: u'ó'),
                  (re.compile(u'´\s*(<br.*?>)*\s*O', re.UNICODE), lambda match: u'Ó'),
                  (re.compile(u'´\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ń'),
                  (re.compile(u'´\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ń'),
+                  (re.compile(u'´\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: u'ŕ'),
+                  (re.compile(u'´\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: u'Ŕ'),
                  (re.compile(u'´\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'ś'),
                  (re.compile(u'´\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Ś'),
                  (re.compile(u'´\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ú'),
@@ -400,7 +405,31 @@ class HTMLPreProcessor(object):
                  # ˙
                  (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
                  (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
-
+                  
+                  # ˇ
+                  (re.compile(u'ˇ\s*(<br.*?>)*\s*c', re.UNICODE), lambda match: u'č'),
+                  (re.compile(u'ˇ\s*(<br.*?>)*\s*C', re.UNICODE), lambda match: u'Č'),
+                  (re.compile(u'ˇ\s*(<br.*?>)*\s*d', re.UNICODE), lambda match: u'ď'),
+                  (re.compile(u'ˇ\s*(<br.*?>)*\s*D', re.UNICODE), lambda match: u'Ď'),
+                  (re.compile(u'ˇ\s*(<br.*?>)*\s*e', re.UNICODE), lambda match: u'ě'),
+                  (re.compile(u'ˇ\s*(<br.*?>)*\s*E', re.UNICODE), lambda match: u'Ě'),
+                  (re.compile(u'ˇ\s*(<br.*?>)*\s*l', re.UNICODE), lambda match: u'ľ'),
+                  (re.compile(u'ˇ\s*(<br.*?>)*\s*L', re.UNICODE), lambda match: u'Ľ'),
+                  (re.compile(u'ˇ\s*(<br.*?>)*\s*n', re.UNICODE), lambda match: u'ň'),
+                  (re.compile(u'ˇ\s*(<br.*?>)*\s*N', re.UNICODE), lambda match: u'Ň'),
+                  (re.compile(u'ˇ\s*(<br.*?>)*\s*r', re.UNICODE), lambda match: u'ř'),
+                  (re.compile(u'ˇ\s*(<br.*?>)*\s*R', re.UNICODE), lambda match: u'Ř'),
+                  (re.compile(u'ˇ\s*(<br.*?>)*\s*s', re.UNICODE), lambda match: u'š'),
+                  (re.compile(u'ˇ\s*(<br.*?>)*\s*S', re.UNICODE), lambda match: u'Š'),
+                  (re.compile(u'ˇ\s*(<br.*?>)*\s*t', re.UNICODE), lambda match: u'ť'),
+                  (re.compile(u'ˇ\s*(<br.*?>)*\s*T', re.UNICODE), lambda match: u'Ť'),
+                  (re.compile(u'ˇ\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ž'),
+                  (re.compile(u'ˇ\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ž'),
+                  
+                  # °
+                  (re.compile(u'°\s*(<br.*?>)*\s*u', re.UNICODE), lambda match: u'ů'),
+                  (re.compile(u'°\s*(<br.*?>)*\s*U', re.UNICODE), lambda match: u'Ů'),
+                  
                  # If pdf printed from a browser then the header/footer has a reliable pattern
                  (re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),

@@ -510,7 +539,7 @@ class HTMLPreProcessor(object):
                end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                end_rules.append(
                    # Un wrap using punctuation
-                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßě,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                    (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                )

        for rule in self.PREPROCESS + start_rules:
@@ -315,9 +315,11 @@ class HeuristicProcessor(object):
        supports a range of html markup and text files
        '''
        # define the pieces of the regex
-        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðßôľščťžňďěřů,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
+                      
+        lookahead = "(?<=.{"+str(length)+u"}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:“”)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
        em_en_lookahead = "(?<=.{"+str(length)+u"}[\u2013\u2014])"
        soft_hyphen = u"\xad"
+        dash = u"\x2d" # some OCRs don't convert dashes to hyphens
        line_ending = "\s*</(span|[iubp]|div)>\s*(</(span|[iubp]|div)>)?"
        blanklines = "\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*"
        line_opening = "<(span|[iubp]|div)[^>]*>\s*(<(span|[iubp]|div)[^>]*>)?\s*"
@@ -326,19 +328,23 @@ class HeuristicProcessor(object):
        unwrap_regex = lookahead+line_ending+blanklines+line_opening
        em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
        shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
+        dash_unwrap_regex = dash+line_ending+blanklines+line_opening

        if format == 'txt':
            unwrap_regex = lookahead+txt_line_wrap
            em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
            shy_unwrap_regex = soft_hyphen+txt_line_wrap
+            dash_unwrap_regex = dash+txt_line_wrap

        unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
        em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
        shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
+        dash_unwrap = re.compile(u"%s" % dash_unwrap_regex, re.UNICODE)

        content = unwrap.sub(' ', content)
        content = em_en_unwrap.sub('', content)
        content = shy_unwrap.sub('', content)
+        content = dash_unwrap.sub('', content)
        return content

    def txt_process(self, match):
@@ -196,10 +196,40 @@ class Textile(object):
        (re.compile(r'{(u\"|\"u)}'),     r'&#252;'),   #  u-diaeresis
        (re.compile(r'{(y\'|\'y)}'),     r'&#253;'),   #  y-acute
        (re.compile(r'{(y\"|\"y)}'),     r'&#255;'),   #  y-diaeresis
+
+        (re.compile(r'{(C\ˇ|\ˇC)}'),     r'&#268;'),   #  C-caron
+        (re.compile(r'{(c\ˇ|\ˇc)}'),     r'&#269;'),   #  c-caron
+        (re.compile(r'{(D\ˇ|\ˇD)}'),     r'&#270;'),   #  D-caron
+        (re.compile(r'{(d\ˇ|\ˇd)}'),     r'&#271;'),   #  d-caron
+        (re.compile(r'{(E\ˇ|\ˇE)}'),     r'&#282;'),   #  E-caron
+        (re.compile(r'{(e\ˇ|\ˇe)}'),     r'&#283;'),   #  e-caron
+        (re.compile(r'{(L\'|\'L)}'),     r'&#313;'),   #  L-acute
+        (re.compile(r'{(l\'|\'l)}'),     r'&#314;'),   #  l-acute
+        (re.compile(r'{(L\ˇ|\ˇL)}'),     r'&#317;'),   #  L-caron
+        (re.compile(r'{(l\ˇ|\ˇl)}'),     r'&#318;'),   #  l-caron
+        (re.compile(r'{(N\ˇ|\ˇN)}'),     r'&#327;'),   #  N-caron
+        (re.compile(r'{(n\ˇ|\ˇn)}'),     r'&#328;'),   #  n-caron
+
        (re.compile(r'{OE}'),            r'&#338;'),   #  OE
        (re.compile(r'{oe}'),            r'&#339;'),   #  oe
-        (re.compile(r'{(S\^|\^S)}'),     r'&Scaron;'), #  Scaron
-        (re.compile(r'{(s\^|\^s)}'),     r'&scaron;'), #  scaron
+
+        (re.compile(r'{(R\'|\'R)}'),     r'&#340;'),   #  R-acute
+        (re.compile(r'{(r\'|\'r)}'),     r'&#341;'),   #  r-acute
+        (re.compile(r'{(R\ˇ|\ˇR)}'),     r'&#344;'),   #  R-caron
+        (re.compile(r'{(r\ˇ|\ˇr)}'),     r'&#345;'),   #  r-caron
+
+        (re.compile(r'{(S\^|\^S)}'),     r'&#348;'),   #  S-circumflex
+        (re.compile(r'{(s\^|\^s)}'),     r'&#349;'),   #  s-circumflex
+        
+        (re.compile(r'{(S\ˇ|\ˇS)}'),     r'&#352;'),   #  S-caron
+        (re.compile(r'{(s\ˇ|\ˇs)}'),     r'&#353;'),   #  s-caron
+        (re.compile(r'{(T\ˇ|\ˇT)}'),     r'&#356;'),   #  T-caron
+        (re.compile(r'{(t\ˇ|\ˇt)}'),     r'&#357;'),   #  t-caron
+        (re.compile(r'{(U\°|\°U)}'),     r'&#366;'),   #  U-ring
+        (re.compile(r'{(u\°|\°u)}'),     r'&#367;'),   #  u-ring
+        (re.compile(r'{(Z\ˇ|\ˇZ)}'),     r'&#381;'),   #  Z-caron
+        (re.compile(r'{(z\ˇ|\ˇz)}'),     r'&#382;'),   #  z-caron
+        
        (re.compile(r'{\*}'),            r'&#8226;'),  #  bullet
        (re.compile(r'{Fr}'),            r'&#8355;'),  #  Franc
        (re.compile(r'{(L=|=L)}'),       r'&#8356;'),  #  Lira
@@ -219,13 +249,13 @@ class Textile(object):
    ]
    glyph_defaults = [
        (re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'),                   r'\1\2&#215;\3'),                       #  dimension sign
-        (re.compile(r'(\d+)\'(\s)', re.I),                             r'\1&#8242;\2'),                          #  prime
-        (re.compile(r'(\d+)\"(\s)', re.I),                             r'\1&#8243;\2'),                          #  prime-double
+        (re.compile(r'(\d+)\'(\s)', re.I),                             r'\1&#8242;\2'),                        #  prime
+        (re.compile(r'(\d+)\"(\s)', re.I),                             r'\1&#8243;\2'),                        #  prime-double
        (re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'),      r'<acronym title="\2">\1</acronym>'),   #  3+ uppercase acronym
        (re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'),         r'<span class="caps">\1</span>'),       #  3+ uppercase
        (re.compile(r'\b(\s{0,1})?\.{3}'),                             r'\1&#8230;'),                          #  ellipsis
        (re.compile(r'^[\*_-]{3,}$', re.M),                            r'<hr />'),                             #  <hr> scene-break
-        (re.compile(r'(^|[^-])--([^-]|$)'),                                r'\1&#8212;\2'),                        #  em dash
+        (re.compile(r'(^|[^-])--([^-]|$)'),                            r'\1&#8212;\2'),                        #  em dash
        (re.compile(r'\s-(?:\s|$)'),                                   r' &#8211; '),                          #  en dash
        (re.compile(r'\b( ?)[([]TM[])]', re.I),                        r'\1&#8482;'),                          #  trademark
        (re.compile(r'\b( ?)[([]R[])]', re.I),                         r'\1&#174;'),                           #  registered
@@ -26,7 +26,7 @@ def unsmarten(txt):
    txt = re.sub(u'&#190;|&frac34;|¾',   r'{3/4}', txt)  # three-quarter
    txt = re.sub(u'&#192;|&Agrave;|À',   r'{A`)}', txt)  # A-grave
    txt = re.sub(u'&#193;|&Aacute;|Á',   r"{A'}",  txt)  # A-acute
-    txt = re.sub(u'&#194;|&Acirc;|Â',    r'{A^}', txt)  # A-circumflex
+    txt = re.sub(u'&#194;|&Acirc;|Â',    r'{A^}',  txt)  # A-circumflex
    txt = re.sub(u'&#195;|&Atilde;|Ã',   r'{A~}',  txt)  # A-tilde
    txt = re.sub(u'&#196;|&Auml;|Ä',     r'{A"}',  txt)  # A-umlaut
    txt = re.sub(u'&#197;|&Aring;|Å',    r'{Ao}',  txt)  # A-ring
@@ -34,30 +34,30 @@ def unsmarten(txt):
    txt = re.sub(u'&#199;|&Ccedil;|Ç',   r'{C,}',  txt)  # C-cedilla
    txt = re.sub(u'&#200;|&Egrave;|È',   r'{E`}',  txt)  # E-grave
    txt = re.sub(u'&#201;|&Eacute;|É',   r"{E'}",  txt)  # E-acute
-    txt = re.sub(u'&#202;|&Ecirc;|Ê',    r'{E^}', txt)  # E-circumflex
+    txt = re.sub(u'&#202;|&Ecirc;|Ê',    r'{E^}',  txt)  # E-circumflex
    txt = re.sub(u'&#203;|&Euml;|Ë',     r'{E"}',  txt)  # E-umlaut
    txt = re.sub(u'&#204;|&Igrave;|Ì',   r'{I`}',  txt)  # I-grave
    txt = re.sub(u'&#205;|&Iacute;|Í',   r"{I'}",  txt)  # I-acute
-    txt = re.sub(u'&#206;|&Icirc;|Î',    r'{I^}', txt)  # I-circumflex
+    txt = re.sub(u'&#206;|&Icirc;|Î',    r'{I^}',  txt)  # I-circumflex
    txt = re.sub(u'&#207;|&Iuml;|Ï',     r'{I"}',  txt)  # I-umlaut
    txt = re.sub(u'&#208;|&ETH;|Ð',      r'{D-}',  txt)  # ETH
    txt = re.sub(u'&#209;|&Ntilde;|Ñ',   r'{N~}',  txt)  # N-tilde
    txt = re.sub(u'&#210;|&Ograve;|Ò',   r'{O`}',  txt)  # O-grave
    txt = re.sub(u'&#211;|&Oacute;|Ó',   r"{O'}",  txt)  # O-acute
-    txt = re.sub(u'&#212;|&Ocirc;|Ô',    r'{O^}', txt)  # O-circumflex
+    txt = re.sub(u'&#212;|&Ocirc;|Ô',    r'{O^}',  txt)  # O-circumflex
    txt = re.sub(u'&#213;|&Otilde;|Õ',   r'{O~}',  txt)  # O-tilde
    txt = re.sub(u'&#214;|&Ouml;|Ö',     r'{O"}',  txt)  # O-umlaut
    txt = re.sub(u'&#215;|&times;|×',    r'{x}',   txt)  # dimension
    txt = re.sub(u'&#216;|&Oslash;|Ø',   r'{O/}',  txt)  # O-slash
    txt = re.sub(u'&#217;|&Ugrave;|Ù',   r"{U`}",  txt)  # U-grave
    txt = re.sub(u'&#218;|&Uacute;|Ú',   r"{U'}",  txt)  # U-acute
-    txt = re.sub(u'&#219;|&Ucirc;|Û',    r'{U^}', txt)  # U-circumflex
+    txt = re.sub(u'&#219;|&Ucirc;|Û',    r'{U^}',  txt)  # U-circumflex
    txt = re.sub(u'&#220;|&Uuml;|Ü',     r'{U"}',  txt)  # U-umlaut
    txt = re.sub(u'&#221;|&Yacute;|Ý',   r"{Y'}",  txt)  # Y-grave
    txt = re.sub(u'&#223;|&szlig;|ß',    r'{sz}',  txt)  # sharp-s
    txt = re.sub(u'&#224;|&agrave;|à',   r'{a`}',  txt)  # a-grave
    txt = re.sub(u'&#225;|&aacute;|á',   r"{a'}",  txt)  # a-acute
-    txt = re.sub(u'&#226;|&acirc;|â',    r'{a^}', txt)  # a-circumflex
+    txt = re.sub(u'&#226;|&acirc;|â',    r'{a^}',  txt)  # a-circumflex
    txt = re.sub(u'&#227;|&atilde;|ã',   r'{a~}',  txt)  # a-tilde
    txt = re.sub(u'&#228;|&auml;|ä',     r'{a"}',  txt)  # a-umlaut
    txt = re.sub(u'&#229;|&aring;|å',    r'{ao}',  txt)  # a-ring
@@ -65,30 +65,58 @@ def unsmarten(txt):
    txt = re.sub(u'&#231;|&ccedil;|ç',   r'{c,}',  txt)  # c-cedilla
    txt = re.sub(u'&#232;|&egrave;|è',   r'{e`}',  txt)  # e-grave
    txt = re.sub(u'&#233;|&eacute;|é',   r"{e'}",  txt)  # e-acute
-    txt = re.sub(u'&#234;|&ecirc;|ê',    r'{e^}', txt)  # e-circumflex
+    txt = re.sub(u'&#234;|&ecirc;|ê',    r'{e^}',  txt)  # e-circumflex
    txt = re.sub(u'&#235;|&euml;|ë',     r'{e"}',  txt)  # e-umlaut
    txt = re.sub(u'&#236;|&igrave;|ì',   r'{i`}',  txt)  # i-grave
    txt = re.sub(u'&#237;|&iacute;|í',   r"{i'}",  txt)  # i-acute
-    txt = re.sub(u'&#238;|&icirc;|î',    r'{i^}', txt)  # i-circumflex
+    txt = re.sub(u'&#238;|&icirc;|î',    r'{i^}',  txt)  # i-circumflex
    txt = re.sub(u'&#239;|&iuml;|ï',     r'{i"}',  txt)  # i-umlaut
    txt = re.sub(u'&#240;|&eth;|ð',      r'{d-}',  txt)  # eth
    txt = re.sub(u'&#241;|&ntilde;|ñ',   r'{n~}',  txt)  # n-tilde
    txt = re.sub(u'&#242;|&ograve;|ò',   r'{o`}',  txt)  # o-grave
    txt = re.sub(u'&#243;|&oacute;|ó',   r"{o'}",  txt)  # o-acute
-    txt = re.sub(u'&#244;|&ocirc;|ô',    r'{o^}', txt)  # o-circumflex
+    txt = re.sub(u'&#244;|&ocirc;|ô',    r'{o^}',  txt)  # o-circumflex
    txt = re.sub(u'&#245;|&otilde;|õ',   r'{o~}',  txt)  # o-tilde
    txt = re.sub(u'&#246;|&ouml;|ö',     r'{o"}',  txt)  # o-umlaut
    txt = re.sub(u'&#248;|&oslash;|ø',   r'{o/}',  txt)  # o-stroke
    txt = re.sub(u'&#249;|&ugrave;|ù',   r'{u`}',  txt)  # u-grave
    txt = re.sub(u'&#250;|&uacute;|ú',   r"{u'}",  txt)  # u-acute
-    txt = re.sub(u'&#251;|&ucirc;|û',    r'{u^}', txt)  # u-circumflex
+    txt = re.sub(u'&#251;|&ucirc;|û',    r'{u^}',  txt)  # u-circumflex
    txt = re.sub(u'&#252;|&uuml;|ü',     r'{u"}',  txt)  # u-umlaut
    txt = re.sub(u'&#253;|&yacute;|ý',   r"{y'}",  txt)  # y-acute
    txt = re.sub(u'&#255;|&yuml;|ÿ',     r'{y"}',  txt)  # y-umlaut
+    
+    txt = re.sub(u'&#268;|&Ccaron;|Č',   r'{Cˇ}',  txt)  # C-caron
+    txt = re.sub(u'&#269;|&ccaron;|č',   r'{cˇ}',  txt)  # c-caron
+    txt = re.sub(u'&#270;|&Dcaron;|Ď',   r'{Dˇ}',  txt)  # D-caron
+    txt = re.sub(u'&#271;|&dcaron;|ď',   r'{dˇ}',  txt)  # d-caron
+    txt = re.sub(u'&#282;|&Ecaron;|Ě',   r'{Eˇ}',  txt)  # E-caron
+    txt = re.sub(u'&#283;|&ecaron;|ě',   r'{eˇ}',  txt)  # e-caron
+    txt = re.sub(u'&#313;|&Lacute;|Ĺ',   r"{L'}",  txt)  # L-acute
+    txt = re.sub(u'&#314;|&lacute;|ĺ',   r"{l'}",  txt)  # l-acute
+    txt = re.sub(u'&#317;|&Lcaron;|Ľ',   r'{Lˇ}',  txt)  # L-caron
+    txt = re.sub(u'&#318;|&lcaron;|ľ',   r'{lˇ}',  txt)  # l-caron
+    txt = re.sub(u'&#327;|&Ncaron;|Ň',   r'{Nˇ}',  txt)  # N-caron
+    txt = re.sub(u'&#328;|&ncaron;|ň',   r'{nˇ}',  txt)  # n-caron
+   
    txt = re.sub(u'&#338;|&OElig;|Œ',    r'{OE}',  txt)  # OE
    txt = re.sub(u'&#339;|&oelig;|œ',    r'{oe}',  txt)  # oe
-    txt = re.sub(u'&#348;|&Scaron;|Ŝ',   r'{S^}', txt)  # Scaron
-    txt = re.sub(u'&#349;|&scaron;|ŝ',   r'{s^}', txt)  # scaron
+
+    txt = re.sub(u'&#340;|&Racute;|Ŕ',   r"{R'}",  txt)  # R-acute
+    txt = re.sub(u'&#341;|&racute;|ŕ',   r"{r'}",  txt)  # r-acute
+    txt = re.sub(u'&#344;|&Rcaron;|Ř',   r'{Rˇ}',  txt)  # R-caron
+    txt = re.sub(u'&#345;|&rcaron;|ř',   r'{rˇ}',  txt)  # r-caron
+    txt = re.sub(u'&#348;|Ŝ',            r'{S^}',  txt)  # S-circumflex
+    txt = re.sub(u'&#349;|ŝ',            r'{s^}',  txt)  # s-circumflex
+    txt = re.sub(u'&#352;|&Scaron;|Š',   r'{Sˇ}',  txt)  # S-caron
+    txt = re.sub(u'&#353;|&scaron;|š',   r'{sˇ}',  txt)  # s-caron
+    txt = re.sub(u'&#356;|&Tcaron;|Ť',   r'{Tˇ}',  txt)  # T-caron
+    txt = re.sub(u'&#357;|&tcaron;|ť',   r'{tˇ}',  txt)  # t-caron
+    txt = re.sub(u'&#366;|&Uring;|Ů',    r'{U°}',  txt)  # U-ring
+    txt = re.sub(u'&#367;|&uring;|ů',    r'{u°}',  txt)  # u-ring
+    txt = re.sub(u'&#381;|&Zcaron;|Ž',   r'{Zˇ}',  txt)  # Z-caron
+    txt = re.sub(u'&#382;|&zcaron;|ž',   r'{zˇ}',  txt)  # z-caron
+
    txt = re.sub(u'&#8226;|&bull;|•',    r'{*}',   txt)  # bullet
    txt = re.sub(u'&#8355;|₣',           r'{Fr}',  txt)  # Franc
    txt = re.sub(u'&#8356;|₤',           r'{L=}',  txt)  # Lira