From e8153d5e6900df625125900c6bab539533acc502 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Mon, 31 Jan 2011 01:36:08 +0800
Subject: [PATCH] merge multiple blank paragraphs
---
src/calibre/ebooks/conversion/utils.py | 44 ++++++++++++++++++++------
src/calibre/ebooks/txt/txtml.py | 2 ++
2 files changed, 37 insertions(+), 9 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index a115e584b6..b37cd4b869 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -29,6 +29,7 @@ class HeuristicProcessor(object):
self.blankreg = re.compile(r'\s*(?P]*>)\s*(?P
)', re.IGNORECASE)
self.anyblank = re.compile(r'\s*(?P]*>)\s*(?P
)', re.IGNORECASE)
self.multi_blank = re.compile(r'(\s*]*>\s*
){2,}(?!\s*]*>\s*
){2,}', re.IGNORECASE)
def is_pdftohtml(self, src):
return '' in src[:1000]
@@ -418,14 +419,32 @@ class HeuristicProcessor(object):
if getattr(self.extra_opts, option, False):
return True
return False
+
+    def merge_blanks(self, html, blanks_count=None):  # blanks_count is accepted but never read in this method
+        """Merge each run of blank paragraphs matched by self.any_multi_blank (via merge_matches) and return the updated html."""
+        single_blank = re.compile(r'(\s*]*>\s*
)', re.IGNORECASE)
+        base_em = .5 # Baseline is 1.5em per blank line, 1st line is .5 em css and 1em for the nbsp
+        em_per_line = 1.5 # Add another 1.5 em for each additional blank
+        def merge_matches(match):  # re.sub callback: builds the replacement for one matched run
+            to_merge = match.group(0)
+            lines = float(len(single_blank.findall(to_merge))) - 1.  # blanks beyond the first
+            em = base_em + (em_per_line * lines)
+            if 'whitespace' in to_merge:  # not str.find(): it returns -1 (truthy) when the text is absent
+                newline = self.any_multi_blank.sub('\n
', match.group(0))
+            else:
+                newline = self.any_multi_blank.sub('\n
', match.group(0))
+            return newline
+
+        html = self.any_multi_blank.sub(merge_matches, html)
+        return html
- def detect_blank_formatting(self, html):
+ def detect_whitespace(self, html):
blanks_before_headings = re.compile(r'(\s*]*>\s*
){1,}(?=\s*)(\s*]*>\s*
){1,}', re.IGNORECASE)
def markup_whitespaces(match):
blanks = match.group(0)
- blanks = self.blankreg.sub('\n
', blanks)
+ blanks = self.blankreg.sub('\n
', blanks)
return blanks
html = blanks_before_headings.sub(markup_whitespaces, html)
html = blanks_after_headings.sub(markup_whitespaces, html)
@@ -435,9 +454,9 @@ class HeuristicProcessor(object):
def detect_soft_breaks(self, html):
if not self.blanks_deleted and self.blanks_between_paragraphs:
- html = self.multi_blank.sub('\n
', html)
+ html = self.multi_blank.sub('\n
', html)
else:
- html = self.blankreg.sub('\n
', html)
+ html = self.blankreg.sub('\n
', html)
return html
@@ -499,7 +518,7 @@ class HeuristicProcessor(object):
if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
self.log.debug("deleting blank lines")
self.blanks_deleted = True
- html = self.multi_blank.sub('\n
', html)
+ html = self.multi_blank.sub('\n
', html)
html = self.blankreg.sub('', html)
# Determine line ending type
@@ -550,14 +569,21 @@ class HeuristicProcessor(object):
doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE)
html = doubleheading.sub('\g'+'\n'+'
', html)
+ # If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks
+ # and style them with the 'whitespace' class. All remaining blank lines are styled as softbreaks.
+ # Multiple sequential blank paragraphs are merged with appropriate margins.
+ # If non-blank scene breaks exist, they are center aligned and styled with appropriate margins.
if getattr(self.extra_opts, 'format_scene_breaks', False):
- html = self.detect_blank_formatting(html)
+ html = self.detect_whitespace(html)
html = self.detect_soft_breaks(html)
- # Center separator lines
- html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•=✦]+\s*)+)\s*((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>', '' + '\g' + '
', html)
+ blanks_count = len(self.any_multi_blank.findall(html))
+ if blanks_count >= 1:
+ html = self.merge_blanks(html, blanks_count)
+ # Center separator lines, use a bit larger margin in this case
+ html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•=✦]+\s*)+)\s*((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>', '' + '\g' + '
', html)
#html = re.sub(']*>\s*
', '
', html)
if self.deleted_nbsps:
- # put back non-breaking spaces in empty paragraphs to preserve original formatting
+ # put back non-breaking spaces in empty paragraphs so they render correctly
html = self.anyblank.sub('\n'+r'\g'+u'\u00a0'+r'\g', html)
return html
diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py
index 00992a8612..bf33e5540a 100644
--- a/src/calibre/ebooks/txt/txtml.py
+++ b/src/calibre/ebooks/txt/txtml.py
@@ -222,6 +222,8 @@ class TXTMLizer(object):
# Scene breaks.
if tag == 'hr':
text.append('\n\n* * *\n\n')
+ elif style['margin-top']:
+ text.append('\n\n' + '\n' * round(style['margin-top']))
# Process tags that contain text.
if hasattr(elem, 'text') and elem.text: