diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 683eaac6d2..786bb79bae 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -27,7 +27,7 @@ class HeuristicProcessor(object): self.linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL) self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) self.softbreak = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) - self.multi_blank = re.compile(r'(\s*]*>\s*

){2,}', re.IGNORECASE) + self.multi_blank = re.compile(r'(\s*]*>\s*

){2,}(?!\s*' in src[:1000] @@ -42,8 +42,10 @@ class HeuristicProcessor(object): " chapters. - " + unicode(chap)) return '

'+chap+'

\n' else: - txt_chap = html2text(chap) - txt_title = html2text(title) + delete_whitespace = re.compile('^\s*(?P.*?)\s*$') + delete_quotes = re.compile('\'\"') + txt_chap = delete_quotes.sub('', delete_whitespace.sub('\g', html2text(chap))) + txt_title = delete_quotes.sub('', delete_whitespace.sub('\g', html2text(title))) self.html_preprocess_sections = self.html_preprocess_sections + 1 self.log.debug("marked " + unicode(self.html_preprocess_sections) + " chapters & titles. - " + unicode(chap) + ", " + unicode(title)) @@ -416,6 +418,12 @@ class HeuristicProcessor(object): return True return False + def detect_blank_formatting(self, html): + blanks_before_headings = re.compile(r'(\s*]*>\s*

){2,}(?=\s*'+'\n'+'', html) if getattr(self.extra_opts, 'format_scene_breaks', False): + html = self.detect_blank_formatting(html) + html = self.detect_soft_breaks(html) # Center separator lines html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•=✦]+\s*)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html) if not self.blanks_deleted: