diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 786bb79bae..957950ec29 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -24,9 +24,10 @@ class HeuristicProcessor(object): self.chapters_no_title = 0 self.chapters_with_title = 0 self.blanks_deleted = False + self.blanks_between_paragraphs = False self.linereg = re.compile('(?<=
)', re.IGNORECASE|re.DOTALL)
- self.blankreg = re.compile(r'\s*(?P ]*>)\s*(?P ]*>)\s*(?P ]*>)\s*(?P ]*>)\s*(?P ]*>\s*
]*>\s*
){1,}', re.IGNORECASE) + + def markup_spacers(match): + blanks = match.group(0) + blanks = self.blankreg.sub('\n', blanks) + return blanks + html = blanks_before_headings.sub(markup_spacers, html) + html = blanks_after_headings.sub(markup_spacers, html) + if self.html_preprocess_sections > self.min_chapters: + html = re.sub('(?si)^.*?(?=
', html) return html + + def __call__(self, html): self.log.debug("********* Heuristic processing HTML *********") @@ -465,25 +482,23 @@ class HeuristicProcessor(object): #html = re.sub('
\u00a0
', html) # Determine whether the document uses interleaved blank lines - blanks_between_paragraphs = self.analyze_blanks(html) + self.blanks_between_paragraphs = self.analyze_blanks(html) #self.dump(html, 'before_chapter_markup') # detect chapters/sections to match xpath or splitting logic if getattr(self.extra_opts, 'markup_chapter_headings', False): - html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs) - - self.dump(html, 'after_chapter_markup') + html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs) if getattr(self.extra_opts, 'italicize_common_cases', False): html = self.markup_italicis(html) # If more than 40% of the lines are empty paragraphs and the user has enabled delete # blank paragraphs then delete blank lines to clean up spacing - if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False): + if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False): self.log.debug("deleting blank lines") self.blanks_deleted = True - html = self.multi_blank.sub('\n', html) + html = self.multi_blank.sub('\n
', html) html = self.blankreg.sub('', html) # Determine line ending type @@ -538,13 +553,10 @@ class HeuristicProcessor(object): html = self.detect_blank_formatting(html) html = self.detect_soft_breaks(html) # Center separator lines - html = re.sub(u'<(?P
' + '\g
', html) + html = re.sub(u'<(?P
' + '\g
]*>\s*
', '