From 070df88d209b99ce404822673492d6d34d789772 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 4 Feb 2011 18:06:13 +0800
Subject: [PATCH 1/3] modified blank line handling to better handle lrf formats
 and preserve scene breaks

---
 src/calibre/ebooks/conversion/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 16ef4c86e2..397146b415 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -28,8 +28,8 @@ class HeuristicProcessor(object):
         self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
-        self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}(?!\s*<h\d)', re.IGNORECASE)
-        self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
+        self.multi_blank = re.compile(r'(\s*(<div[^>]*>\s*)?<p[^>]*>\s*</p>(\s*</div>)?(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
+        self.any_multi_blank = re.compile(r'(\s*(<div[^>]*>\s*)?<p[^>]*>\s*</p>(\s*</div>)?(\s*<div[^>]*>\s*</div>\s*)*){2,}', re.IGNORECASE)
         self.line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
         self.line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
         self.single_blank = re.compile(r'(\s*<p[^>]*>\s*</p>)', re.IGNORECASE)

From 80cbd6d89f7836f62572e7664d605fc9a6186c09 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 4 Feb 2011 18:48:37 +0800
Subject: [PATCH 2/3] slightly better method

---
 src/calibre/ebooks/conversion/utils.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 397146b415..f541701480 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -28,8 +28,8 @@ class HeuristicProcessor(object):
         self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
-        self.multi_blank = re.compile(r'(\s*(<div[^>]*>\s*)?<p[^>]*>\s*</p>(\s*</div>)?(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
-        self.any_multi_blank = re.compile(r'(\s*(<div[^>]*>\s*)?<p[^>]*>\s*</p>(\s*</div>)?(\s*<div[^>]*>\s*</div>\s*)*){2,}', re.IGNORECASE)
+        self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
+        self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}', re.IGNORECASE)
         self.line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
         self.line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
         self.single_blank = re.compile(r'(\s*<p[^>]*>\s*</p>)', re.IGNORECASE)
@@ -379,6 +379,8 @@ class HeuristicProcessor(object):
         html = re.sub('(?i)</?st1:\w+>', '', html)
         # Re-open self closing paragraph tags
         html = re.sub('<p[^>/]*/>', '<p> </p>', html)
+        # delete surrounding divs from empty paragraphs
+        html = re.sub('<div[^>]*>\s*<p[^>]*>\s*</p>\s*</div>', '<p> </p>', html)
         # Get rid of empty span, bold, font, em, & italics tags
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
@@ -637,6 +639,7 @@ class HeuristicProcessor(object):
             blanks_count = len(self.any_multi_blank.findall(html))
             if blanks_count >= 1:
                 html = self.merge_blanks(html, blanks_count)
+                self.dump(html, 'before_after_merge_blanks')
             scene_break_regex = self.line_open+'(?![\w\'\"])(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
             scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
             # If the user has enabled scene break replacement, then either softbreaks

From 84431ab4ddf9d7cd93a8e04d1ce19a14524091a2 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Fri, 4 Feb 2011 20:34:07 +0800
Subject: [PATCH 3/3] ...

---
 src/calibre/ebooks/conversion/utils.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index f541701480..a9f733277a 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -379,13 +379,13 @@ class HeuristicProcessor(object):
         html = re.sub('(?i)</?st1:\w+>', '', html)
         # Re-open self closing paragraph tags
         html = re.sub('<p[^>/]*/>', '<p> </p>', html)
-        # delete surrounding divs from empty paragraphs
-        html = re.sub('<div[^>]*>\s*<p[^>]*>\s*</p>\s*</div>', '<p> </p>', html)
         # Get rid of empty span, bold, font, em, & italics tags
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
+        # delete surrounding divs from empty paragraphs
+        html = re.sub('<div[^>]*>\s*<p[^>]*>\s*</p>\s*</div>', '<p> </p>', html)
         # Empty heading tags
         html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
         self.deleted_nbsps = True
@@ -563,7 +563,6 @@ class HeuristicProcessor(object):
         # Determine whether the document uses interleaved blank lines
         self.blanks_between_paragraphs = self.analyze_blanks(html)
 
-        #self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
 
         if getattr(self.extra_opts, 'markup_chapter_headings', False):
@@ -639,7 +638,6 @@ class HeuristicProcessor(object):
             blanks_count = len(self.any_multi_blank.findall(html))
             if blanks_count >= 1:
                 html = self.merge_blanks(html, blanks_count)
-                self.dump(html, 'before_after_merge_blanks')
             scene_break_regex = self.line_open+'(?![\w\'\"])(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
             scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
             # If the user has enabled scene break replacement, then either softbreaks