From f46749863850242e58e92a2f337a6abb1be03486 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sat, 15 Jan 2011 00:11:21 +0800 Subject: [PATCH] preserve soft breaks when deleting blank paragraphs --- src/calibre/ebooks/conversion/utils.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 286fad1aaa..96bd303933 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -294,8 +294,8 @@ class PreProcessor(object): # If more than 40% of the lines are empty paragraphs and the user has enabled delete # blank paragraphs then delete blank lines to clean up spacing linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL) - blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE) - #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE) + blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE) + multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE) blanklines = blankreg.findall(html) lines = linereg.findall(html) blanks_between_paragraphs = False @@ -303,11 +303,8 @@ class PreProcessor(object): if len(lines) > 1: self.log("There are " + unicode(len(blanklines)) + " blank lines. " + unicode(float(len(blanklines)) / float(len(lines))) + " percent blank") - if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts, - 'delete_blank_paragraphs', False): - self.log("deleting blank lines") - html = blankreg.sub('', html) - elif float(len(blanklines)) / float(len(lines)) > 0.40: + + if float(len(blanklines)) / float(len(lines)) > 0.40: blanks_between_paragraphs = True print "blanks between paragraphs is marked True" else: @@ -319,7 +316,12 @@ class PreProcessor(object): html = self.markup_chapters(html, totalwords, blanks_between_paragraphs) - + if blanks_between_paragraphs and getattr(self.extra_opts, + 'delete_blank_paragraphs', False): + self.log("deleting blank lines") + html = multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html) + html = blankreg.sub('', html) + ###### Unwrap lines ###### # # Some OCR sourced files have line breaks in the html using a combination of span & p tags