Fixed a UTF-8 decoding error that occurs on bad input while removing hyphens; cleaned up indentation; tweaked regexes

2025-07-09 03:04:10 -04:00 · 2010-09-28 16:07:43 +08:00 · 2010-09-28 16:07:43 +08:00 · b7f6d820a7
commit b7f6d820a7
parent f5431765f4
2 changed files with 17 additions and 14 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -75,7 +75,7 @@ class DocAnalysis(object):
        if format == 'html':
            linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
        elif format == 'pdf':
-            linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
+            linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
        elif format == 'spanned_html':
            linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
        self.lines = linere.findall(raw)
@@ -191,18 +191,21 @@ class Dehyphenator(object):
        lookupword = self.removesuffixes.sub('', dehyphenated)
        if self.prefixes.match(firsthalf) is None:
           lookupword = self.removeprefix.sub('', lookupword)
-        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
-        booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
+        try:
            searchresult = self.html.find(str.lower(lookupword))
        except:
            return hyphenated                
        if self.format == 'html_cleanup':
-           if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
+            if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
-               #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
-               return dehyphenated
+                return dehyphenated
-           elif self.html.find(hyphenated) != -1:
+            elif self.html.find(hyphenated) != -1:
-               #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
-               return hyphenated
+                return hyphenated
-           else:
+            else:
-               #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
-               return firsthalf+u'\u2014'+wraptags+secondhalf
+                return firsthalf+u'\u2014'+wraptags+secondhalf
        else:
            if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -145,7 +145,7 @@ class PreProcessor(object):
        #
        # Build the Regular Expressions in pieces
        lookahead = "(?=<(p|div))"
-        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>span|[ibu])[^>]*>)?\s*(<(?P<inner2>span|[ibu])[^>]*>)?\s*(<(?P<inner3>span|[ibu])[^>]*>)?\s*"
+        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
        chapter_header_open = r"(?P<chap>"
        chapter_header_close = ")\s*"
        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
@@ -154,7 +154,7 @@ class PreProcessor(object):
        else:
            blank_lines = ""
        opt_title_open = "("
-        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>span|[ibu])[^>]*>)?\s*(<(?P<inner5>span|[ibu])[^>]*>)?\s*(<(?P<inner6>span|[ibu])[^>]*>)?\s*"
+        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
        title_header_open = "(?P<title>"
        title_header_close = ")\s*"
        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"