From e8153d5e6900df625125900c6bab539533acc502 Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 31 Jan 2011 01:36:08 +0800 Subject: [PATCH] merge multiple blank paragraphs --- src/calibre/ebooks/conversion/utils.py | 44 ++++++++++++++++++++------ src/calibre/ebooks/txt/txtml.py | 2 ++ 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index a115e584b6..b37cd4b869 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -29,6 +29,7 @@ class HeuristicProcessor(object): self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) self.anyblank = re.compile(r'\s*(?P]*>)\s*(?P

)', re.IGNORECASE) self.multi_blank = re.compile(r'(\s*]*>\s*

){2,}(?!\s*]*>\s*

){2,}', re.IGNORECASE) def is_pdftohtml(self, src): return '' in src[:1000] @@ -418,14 +419,32 @@ class HeuristicProcessor(object): if getattr(self.extra_opts, option, False): return True return False + + def merge_blanks(self, html, blanks_count=None): + single_blank = re.compile(r'(\s*]*>\s*

)', re.IGNORECASE) + base_em = .5 # Baseline is 1.5em per blank line, 1st line is .5 em css and 1em for the nbsp + em_per_line = 1.5 # Add another 1.5 em for each additional blank + + def merge_matches(match): + to_merge = match.group(0) + lines = float(len(single_blank.findall(to_merge))) - 1. + em = base_em + (em_per_line * lines) + if to_merge.find('whitespace'): + newline = self.any_multi_blank.sub('\n

', match.group(0)) + else: + newline = self.any_multi_blank.sub('\n

', match.group(0)) + return newline + + html = self.any_multi_blank.sub(merge_matches, html) + return html - def detect_blank_formatting(self, html): + def detect_whitespace(self, html): blanks_before_headings = re.compile(r'(\s*]*>\s*

){1,}(?=\s*)(\s*]*>\s*

){1,}', re.IGNORECASE) def markup_whitespaces(match): blanks = match.group(0) - blanks = self.blankreg.sub('\n

', blanks) + blanks = self.blankreg.sub('\n

', blanks) return blanks html = blanks_before_headings.sub(markup_whitespaces, html) html = blanks_after_headings.sub(markup_whitespaces, html) @@ -435,9 +454,9 @@ class HeuristicProcessor(object): def detect_soft_breaks(self, html): if not self.blanks_deleted and self.blanks_between_paragraphs: - html = self.multi_blank.sub('\n

', html) + html = self.multi_blank.sub('\n

', html) else: - html = self.blankreg.sub('\n

', html) + html = self.blankreg.sub('\n

', html) return html @@ -499,7 +518,7 @@ class HeuristicProcessor(object): if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False): self.log.debug("deleting blank lines") self.blanks_deleted = True - html = self.multi_blank.sub('\n

', html) + html = self.multi_blank.sub('\n

', html) html = self.blankreg.sub('', html) # Determine line ending type @@ -550,14 +569,21 @@ class HeuristicProcessor(object): doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE) html = doubleheading.sub('\g'+'\n'+'', html) + # If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks, + # style it with the 'whitespace' class. All remaining blank lines are styled as softbreaks. + # Multiple sequential blank paragraphs are merged with appropriate margins + # If non-blank scene breaks exist they are center aligned and styled with appropriate margins. if getattr(self.extra_opts, 'format_scene_breaks', False): - html = self.detect_blank_formatting(html) + html = self.detect_whitespace(html) html = self.detect_soft_breaks(html) - # Center separator lines - html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•=✦]+\s*)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html) + blanks_count = len(self.any_multi_blank.findall(html)) + if blanks_count >= 1: + html = self.merge_blanks(html, blanks_count) + # Center separator lines, use a bit larger margin in this case + html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•=✦]+\s*)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html) #html = re.sub(']*>\s*

', '

', html) if self.deleted_nbsps: - # put back non-breaking spaces in empty paragraphs to preserve original formatting + # put back non-breaking spaces in empty paragraphs so they render correctly html = self.anyblank.sub('\n'+r'\g'+u'\u00a0'+r'\g', html) return html diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py index 00992a8612..bf33e5540a 100644 --- a/src/calibre/ebooks/txt/txtml.py +++ b/src/calibre/ebooks/txt/txtml.py @@ -222,6 +222,8 @@ class TXTMLizer(object): # Scene breaks. if tag == 'hr': text.append('\n\n* * *\n\n') + elif style['margin-top']: + text.append('\n\n' + '\n' * round(style['margin-top'])) # Process tags that contain text. if hasattr(elem, 'text') and elem.text: