diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 683eaac6d2..786bb79bae 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -27,7 +27,7 @@ class HeuristicProcessor(object):
self.linereg = re.compile('(?<=
)', re.IGNORECASE|re.DOTALL)
self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P
)', re.IGNORECASE)
self.softbreak = re.compile(r'\s*(?P]*>)\s*(?P
)', re.IGNORECASE)
- self.multi_blank = re.compile(r'(\s*]*>\s*
){2,}', re.IGNORECASE)
+ self.multi_blank = re.compile(r'(\s*]*>\s*
){2,}(?!\s*' in src[:1000]
@@ -42,8 +42,10 @@ class HeuristicProcessor(object):
" chapters. - " + unicode(chap))
return ''+chap+'
\n'
else:
- txt_chap = html2text(chap)
- txt_title = html2text(title)
+ delete_whitespace = re.compile('^\s*(?P.*?)\s*$')
+ delete_quotes = re.compile('\'\"')
+ txt_chap = delete_quotes.sub('', delete_whitespace.sub('\g', html2text(chap)))
+ txt_title = delete_quotes.sub('', delete_whitespace.sub('\g', html2text(title)))
self.html_preprocess_sections = self.html_preprocess_sections + 1
self.log.debug("marked " + unicode(self.html_preprocess_sections) +
" chapters & titles. - " + unicode(chap) + ", " + unicode(title))
@@ -416,6 +418,12 @@ class HeuristicProcessor(object):
return True
return False
+ def detect_blank_formatting(self, html):
+ blanks_before_headings = re.compile(r'(\s*]*>\s*
){2,}(?=\s*'+'\n'+'
', html)
if getattr(self.extra_opts, 'format_scene_breaks', False):
+ html = self.detect_blank_formatting(html)
+ html = self.detect_soft_breaks(html)
# Center separator lines
html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•=✦]+\s*)+)\s*((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>', '' + '\g' + '
', html)
if not self.blanks_deleted: