From cdf61487ffaa8d6caf0037680bd4d7c4901ab066 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 25 Jan 2011 17:29:55 +0800
Subject: [PATCH 1/4] add strong tags to list of empty tags removed during
 cleanup

---
 src/calibre/ebooks/conversion/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index f6e259b6f9..d4b03b8d55 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -375,9 +375,9 @@ class HeuristicProcessor(object):
         html = re.sub('<p\s?/>', '', html)
         # Get rid of empty span, bold, font, em, & italics tags
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
-        html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
+        html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
-        html = re.sub(r"\s*<(font|[ibu]|em)[^>]*>\s*(<(font|[ibu]|em)[^>]*>\s*</(font|[ibu]|em)>\s*){0,2}\s*</(font|[ibu]|em)>", " ", html)
+        html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
         self.deleted_nbsps = True
         return html
 

From 5c988788a05aec3284b29268dad1a4748cd89948 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 29 Jan 2011 19:06:02 +0800
Subject: [PATCH 2/4] started scene break detection in heuristics, improved
 text conversion of Sigil compatible chapter titles

---
 src/calibre/ebooks/conversion/utils.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 683eaac6d2..786bb79bae 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -27,7 +27,7 @@ class HeuristicProcessor(object):
         self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.softbreak = re.compile(r'\s*(?P<openline><p(?=\sclass=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
-        self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
+        self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}(?!\s*<h\d)', re.IGNORECASE)
 
     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
@@ -42,8 +42,10 @@ class HeuristicProcessor(object):
                     " chapters. - " + unicode(chap))
             return '<h2>'+chap+'</h2>\n'
         else:
-            txt_chap = html2text(chap)
-            txt_title = html2text(title)
+            delete_whitespace = re.compile('^\s*(?P<c>.*?)\s*$')
+            delete_quotes = re.compile('\'\"')
+            txt_chap = delete_quotes.sub('', delete_whitespace.sub('\g<c>', html2text(chap)))
+            txt_title = delete_quotes.sub('', delete_whitespace.sub('\g<c>', html2text(title)))
             self.html_preprocess_sections = self.html_preprocess_sections + 1
             self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                     " chapters & titles. - " + unicode(chap) + ", " + unicode(title))
@@ -416,6 +418,12 @@ class HeuristicProcessor(object):
                 return True
         return False
 
+    def detect_blank_formatting(self, html):
+        blanks_before_headings = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}(?=\s*<h\d)', re.IGNORECASE)
+        return html
+
+    def detect_soft_breaks(self, html):
+        return html
 
     def __call__(self, html):
         self.log.debug("*********  Heuristic processing HTML  *********")
@@ -465,6 +473,8 @@ class HeuristicProcessor(object):
         if getattr(self.extra_opts, 'markup_chapter_headings', False):
             html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
 
+        self.dump(html, 'after_chapter_markup')
+
         if getattr(self.extra_opts, 'italicize_common_cases', False):
             html = self.markup_italicis(html)
 
@@ -525,6 +535,8 @@ class HeuristicProcessor(object):
             html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
 
         if getattr(self.extra_opts, 'format_scene_breaks', False):
+            html = self.detect_blank_formatting(html)
+            html = self.detect_soft_breaks(html)
             # Center separator lines
             html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
             if not self.blanks_deleted:

From bace283325092cb5757d6554fced9db27dfbbaca Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 29 Jan 2011 21:15:44 +0800
Subject: [PATCH 3/4] initial scene break detection

---
 src/calibre/ebooks/conversion/utils.py | 40 +++++++++++++++++---------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 786bb79bae..957950ec29 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -24,9 +24,10 @@ class HeuristicProcessor(object):
         self.chapters_no_title = 0
         self.chapters_with_title = 0
         self.blanks_deleted = False
+        self.blanks_between_paragraphs = False
         self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
-        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
-        self.softbreak = re.compile(r'\s*(?P<openline><p(?=\sclass=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|spacer)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}(?!\s*<h\d)', re.IGNORECASE)
 
     def is_pdftohtml(self, src):
@@ -419,12 +420,28 @@ class HeuristicProcessor(object):
         return False
 
     def detect_blank_formatting(self, html):
-        blanks_before_headings = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}(?=\s*<h\d)', re.IGNORECASE)
+        blanks_before_headings = re.compile(r'(\s*<p[^>]*>\s*</p>){1,}(?=\s*<h\d)', re.IGNORECASE)
+        blanks_after_headings = re.compile(r'(?<=</h\d>)(\s*<p[^>]*>\s*</p>){1,}', re.IGNORECASE)
+        
+        def markup_spacers(match):
+           blanks = match.group(0)
+           blanks = self.blankreg.sub('\n<p class="spacer"> </p>', blanks)
+           return blanks
+        html = blanks_before_headings.sub(markup_spacers, html)
+        html = blanks_after_headings.sub(markup_spacers, html)
+        if self.html_preprocess_sections > self.min_chapters:
+            html = re.sub('(?si)^.*?(?=<h\d)', markup_spacers, html)
         return html
 
     def detect_soft_breaks(self, html):
+        if not self.blanks_deleted and self.blanks_between_paragraphs:
+            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid"> </p>', html)
+        else:
+            html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid"> </p>', html)
         return html
 
+
+
     def __call__(self, html):
         self.log.debug("*********  Heuristic processing HTML  *********")
 
@@ -465,25 +482,23 @@ class HeuristicProcessor(object):
         #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
 
         # Determine whether the document uses interleaved blank lines
-        blanks_between_paragraphs = self.analyze_blanks(html)
+        self.blanks_between_paragraphs = self.analyze_blanks(html)
 
         #self.dump(html, 'before_chapter_markup')
         # detect chapters/sections to match xpath or splitting logic
 
         if getattr(self.extra_opts, 'markup_chapter_headings', False):
-            html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
-
-        self.dump(html, 'after_chapter_markup')
+            html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)
 
         if getattr(self.extra_opts, 'italicize_common_cases', False):
             html = self.markup_italicis(html)
 
         # If more than 40% of the lines are empty paragraphs and the user has enabled delete
         # blank paragraphs then delete blank lines to clean up spacing
-        if blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
+        if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
             self.log.debug("deleting blank lines")
             self.blanks_deleted = True
-            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
+            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid"> </p>', html)
             html = self.blankreg.sub('', html)
 
         # Determine line ending type
@@ -538,13 +553,10 @@ class HeuristicProcessor(object):
             html = self.detect_blank_formatting(html)
             html = self.detect_soft_breaks(html)
             # Center separator lines
-            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
-            if not self.blanks_deleted:
-                html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
+            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid">' + '\g<break>' + '</p>', html)
             html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
 
         if self.deleted_nbsps:
             # put back non-breaking spaces in empty paragraphs to preserve original formatting
-            html = self.blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
-            html = self.softbreak.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
+            html = self.anyblank.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
         return html

From c98a122539751fc4fd56d4f32ee7bd4e06313f8b Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sat, 29 Jan 2011 21:53:53 +0800
Subject: [PATCH 4/4] disable replacement of softbreak paragraphs with <hr>
 dividers

---
 src/calibre/ebooks/conversion/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 957950ec29..5beefb5bd9 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -554,7 +554,7 @@ class HeuristicProcessor(object):
             html = self.detect_soft_breaks(html)
             # Center separator lines
             html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid">' + '\g<break>' + '</p>', html)
-            html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
+            #html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
 
         if self.deleted_nbsps:
             # put back non-breaking spaces in empty paragraphs to preserve original formatting