diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 4a2d56d957..960dbf0242 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -75,7 +75,7 @@ class DocAnalysis(object): if format == 'html': linere = re.compile('(?<=]*>\s*

).*?(?=

)', re.DOTALL) elif format == 'pdf': - linere = re.compile('(?<=
).*?(?=
)', re.DOTALL) + linere = re.compile('(?<=
)(?!\s*
).*?(?=
)', re.DOTALL) elif format == 'spanned_html': linere = re.compile('(?<=)', re.DOTALL) self.lines = linere.findall(raw) @@ -191,18 +191,21 @@ class Dehyphenator(object): lookupword = self.removesuffixes.sub('', dehyphenated) if self.prefixes.match(firsthalf) is None: lookupword = self.removeprefix.sub('', lookupword) - #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) - booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE) + print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) + try: + searchresult = self.html.find(str.lower(lookupword)) + except: + return hyphenated if self.format == 'html_cleanup': - if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1: - #print "Cleanup:returned dehyphenated word: " + str(dehyphenated) - return dehyphenated - elif self.html.find(hyphenated) != -1: - #print "Cleanup:returned hyphenated word: " + str(hyphenated) - return hyphenated - else: - #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf) - return firsthalf+u'\u2014'+wraptags+secondhalf + if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1: + #print "Cleanup:returned dehyphenated word: " + str(dehyphenated) + return dehyphenated + elif self.html.find(hyphenated) != -1: + #print "Cleanup:returned hyphenated word: " + str(hyphenated) + return hyphenated + else: + #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf) + return firsthalf+u'\u2014'+wraptags+secondhalf else: if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1: diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 96df37f631..b6969a3659 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -145,7 +145,7 @@ class PreProcessor(object): # # Build the Regular Expressions in pieces lookahead = "(?=<(p|div))" - chapter_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pspan|[ibu])[^>]*>)?\s*(<(?Pspan|[ibu])[^>]*>)?\s*(<(?Pspan|[ibu])[^>]*>)?\s*" + chapter_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*" chapter_header_open = r"(?P" chapter_header_close = ")\s*" chapter_line_close = "()?\s*()?\s*(]*>)?\s*\s*" @@ -154,7 +154,7 @@ class PreProcessor(object): else: blank_lines = "" opt_title_open = "(" - title_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pspan|[ibu])[^>]*>)?\s*(<(?Pspan|[ibu])[^>]*>)?\s*(<(?Pspan|[ibu])[^>]*>)?\s*" + title_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*" title_header_open = "(?P" title_header_close = ")\s*" title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"