diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 4a2d56d957..960dbf0242 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -75,7 +75,7 @@ class DocAnalysis(object):
if format == 'html':
linere = re.compile('(?<=
]*>\s*
).*?(?=)', re.DOTALL)
elif format == 'pdf':
- linere = re.compile('(?<=
).*?(?=
)', re.DOTALL)
+ linere = re.compile('(?<=
)(?!\s*
).*?(?=
)', re.DOTALL)
elif format == 'spanned_html':
linere = re.compile('(?<=)', re.DOTALL)
self.lines = linere.findall(raw)
@@ -191,18 +191,21 @@ class Dehyphenator(object):
lookupword = self.removesuffixes.sub('', dehyphenated)
if self.prefixes.match(firsthalf) is None:
lookupword = self.removeprefix.sub('', lookupword)
- #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
- booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
+ print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+ try:
+ searchresult = self.html.find(str.lower(lookupword))
+ except:
+ return hyphenated
if self.format == 'html_cleanup':
- if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
- #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
- return dehyphenated
- elif self.html.find(hyphenated) != -1:
- #print "Cleanup:returned hyphenated word: " + str(hyphenated)
- return hyphenated
- else:
- #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
- return firsthalf+u'\u2014'+wraptags+secondhalf
+ if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
+ #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+ return dehyphenated
+ elif self.html.find(hyphenated) != -1:
+ #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+ return hyphenated
+ else:
+ #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+ return firsthalf+u'\u2014'+wraptags+secondhalf
else:
if self.html.find(lookupword) != -1 or self.html.find(str.lower(lookupword)) != -1:
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 96df37f631..b6969a3659 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -145,7 +145,7 @@ class PreProcessor(object):
#
# Build the Regular Expressions in pieces
lookahead = "(?=<(p|div))"
- chapter_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pspan|[ibu])[^>]*>)?\s*(<(?Pspan|[ibu])[^>]*>)?\s*(<(?Pspan|[ibu])[^>]*>)?\s*"
+ chapter_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*"
chapter_header_open = r"(?P"
chapter_header_close = ")\s*"
chapter_line_close = "((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)\s[^>]*>)?\s*(?P=outer)>\s*"
@@ -154,7 +154,7 @@ class PreProcessor(object):
else:
blank_lines = ""
opt_title_open = "("
- title_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pspan|[ibu])[^>]*>)?\s*(<(?Pspan|[ibu])[^>]*>)?\s*(<(?Pspan|[ibu])[^>]*>)?\s*"
+ title_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*"
title_header_open = "(?P"
title_header_close = ")\s*"
title_line_close = "((?P=inner6)>)?\s*((?P=inner5)>)?\s*((?P=inner4)\s[^>]*>)?\s*(?P=outer2)>"