From 6b421643705fb3e575bdda1225171485ba01965a Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Sun, 30 Jan 2011 18:11:15 +0800
Subject: [PATCH 01/11] adjusted margins for scene break heuristics

---
 src/calibre/ebooks/conversion/utils.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 5beefb5bd9..a115e584b6 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -26,7 +26,7 @@ class HeuristicProcessor(object):
         self.blanks_deleted = False
         self.blanks_between_paragraphs = False
         self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
-        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|spacer)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}(?!\s*<h\d)', re.IGNORECASE)
 
@@ -423,21 +423,21 @@ class HeuristicProcessor(object):
         blanks_before_headings = re.compile(r'(\s*<p[^>]*>\s*</p>){1,}(?=\s*<h\d)', re.IGNORECASE)
         blanks_after_headings = re.compile(r'(?<=</h\d>)(\s*<p[^>]*>\s*</p>){1,}', re.IGNORECASE)
         
-        def markup_spacers(match):
+        def markup_whitespaces(match):
            blanks = match.group(0)
-           blanks = self.blankreg.sub('\n<p class="spacer"> </p>', blanks)
+           blanks = self.blankreg.sub('\n<p class="whitespace"> </p>', blanks)
            return blanks
-        html = blanks_before_headings.sub(markup_spacers, html)
-        html = blanks_after_headings.sub(markup_spacers, html)
+        html = blanks_before_headings.sub(markup_whitespaces, html)
+        html = blanks_after_headings.sub(markup_whitespaces, html)
         if self.html_preprocess_sections > self.min_chapters:
-            html = re.sub('(?si)^.*?(?=<h\d)', markup_spacers, html)
+            html = re.sub('(?si)^.*?(?=<h\d)', markup_whitespaces, html)
         return html
 
     def detect_soft_breaks(self, html):
         if not self.blanks_deleted and self.blanks_between_paragraphs:
-            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid"> </p>', html)
+            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; margin-bottom:.5em; page-break-before:avoid"> </p>', html)
         else:
-            html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid"> </p>', html)
+            html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; margin-bottom:.5em; page-break-before:avoid"> </p>', html)
         return html
 
 
@@ -489,6 +489,7 @@ class HeuristicProcessor(object):
 
         if getattr(self.extra_opts, 'markup_chapter_headings', False):
             html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)
+        self.dump(html, 'after_chapter_markup')
 
         if getattr(self.extra_opts, 'italicize_common_cases', False):
             html = self.markup_italicis(html)
@@ -498,7 +499,7 @@ class HeuristicProcessor(object):
         if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
             self.log.debug("deleting blank lines")
             self.blanks_deleted = True
-            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid"> </p>', html)
+            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; margin-bottom:.5em; page-break-before:avoid"> </p>', html)
             html = self.blankreg.sub('', html)
 
         # Determine line ending type
@@ -553,7 +554,7 @@ class HeuristicProcessor(object):
             html = self.detect_blank_formatting(html)
             html = self.detect_soft_breaks(html)
             # Center separator lines
-            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid">' + '\g<break>' + '</p>', html)
+            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:.5em; margin-bottom:.5em; page-break-before:avoid">' + '\g<break>' + '</p>', html)
             #html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
 
         if self.deleted_nbsps:

From e8153d5e6900df625125900c6bab539533acc502 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 31 Jan 2011 01:36:08 +0800
Subject: [PATCH 02/11] merge multiple blank paragraphs

---
 src/calibre/ebooks/conversion/utils.py | 44 ++++++++++++++++++++------
 src/calibre/ebooks/txt/txtml.py        |  2 ++
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index a115e584b6..b37cd4b869 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -29,6 +29,7 @@ class HeuristicProcessor(object):
         self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}(?!\s*<h\d)', re.IGNORECASE)
+        self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
 
     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
@@ -418,14 +419,32 @@ class HeuristicProcessor(object):
             if getattr(self.extra_opts, option, False):
                 return True
         return False
+        
+    def merge_blanks(self, html, blanks_count=None):
+        single_blank = re.compile(r'(\s*<p[^>]*>\s*</p>)', re.IGNORECASE)
+        base_em = .5 # Baseline is 1.5em per blank line, 1st line is .5 em css and 1em for the nbsp
+        em_per_line = 1.5 # Add another 1.5 em for each additional blank
+        
+        def merge_matches(match):
+            to_merge = match.group(0)
+            lines = float(len(single_blank.findall(to_merge))) - 1.
+            em = base_em + (em_per_line * lines)
+            if to_merge.find('whitespace'):
+                newline = self.any_multi_blank.sub('\n<p class="whitespace'+str(int(em * 10))+'" style="text-align:center; margin-top:'+str(em)+'em"> </p>', match.group(0))
+            else:
+                newline = self.any_multi_blank.sub('\n<p class="softbreak'+str(int(em * 10))+'" style="text-align:center; margin-top:'+str(em)+'em"> </p>', match.group(0))
+            return newline
+            
+        html = self.any_multi_blank.sub(merge_matches, html)
+        return html
 
-    def detect_blank_formatting(self, html):
+    def detect_whitespace(self, html):
         blanks_before_headings = re.compile(r'(\s*<p[^>]*>\s*</p>){1,}(?=\s*<h\d)', re.IGNORECASE)
         blanks_after_headings = re.compile(r'(?<=</h\d>)(\s*<p[^>]*>\s*</p>){1,}', re.IGNORECASE)
         
         def markup_whitespaces(match):
            blanks = match.group(0)
-           blanks = self.blankreg.sub('\n<p class="whitespace"> </p>', blanks)
+           blanks = self.blankreg.sub('\n<p class="whitespace" style="text-align:center; margin-top:.5em"> </p>', blanks)
            return blanks
         html = blanks_before_headings.sub(markup_whitespaces, html)
         html = blanks_after_headings.sub(markup_whitespaces, html)
@@ -435,9 +454,9 @@ class HeuristicProcessor(object):
 
     def detect_soft_breaks(self, html):
         if not self.blanks_deleted and self.blanks_between_paragraphs:
-            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; margin-bottom:.5em; page-break-before:avoid"> </p>', html)
+            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1em; page-break-before:avoid; text-align:center"> </p>', html)
         else:
-            html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; margin-bottom:.5em; page-break-before:avoid"> </p>', html)
+            html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
         return html
 
 
@@ -499,7 +518,7 @@ class HeuristicProcessor(object):
         if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
             self.log.debug("deleting blank lines")
             self.blanks_deleted = True
-            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; margin-bottom:.5em; page-break-before:avoid"> </p>', html)
+            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
             html = self.blankreg.sub('', html)
 
         # Determine line ending type
@@ -550,14 +569,21 @@ class HeuristicProcessor(object):
             doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
             html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
 
+        # If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks,
+        # style it with the 'whitespace' class.  All remaining blank lines are styled as softbreaks.
+        # Multiple sequential blank paragraphs are merged with appropriate margins
+        # If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
         if getattr(self.extra_opts, 'format_scene_breaks', False):
-            html = self.detect_blank_formatting(html)
+            html = self.detect_whitespace(html)
             html = self.detect_soft_breaks(html)
-            # Center separator lines
-            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:.5em; margin-bottom:.5em; page-break-before:avoid">' + '\g<break>' + '</p>', html)
+            blanks_count = len(self.any_multi_blank.findall(html))
+            if blanks_count >= 1:
+                html = self.merge_blanks(html, blanks_count)
+            # Center separator lines, use a bit larger margin in this case
+            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:.65em; margin-bottom:.65em; page-break-before:avoid">' + '\g<break>' + '</p>', html)
             #html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
 
         if self.deleted_nbsps:
-            # put back non-breaking spaces in empty paragraphs to preserve original formatting
+            # put back non-breaking spaces in empty paragraphs so they render correctly
             html = self.anyblank.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
         return html
diff --git a/src/calibre/ebooks/txt/txtml.py b/src/calibre/ebooks/txt/txtml.py
index 00992a8612..bf33e5540a 100644
--- a/src/calibre/ebooks/txt/txtml.py
+++ b/src/calibre/ebooks/txt/txtml.py
@@ -222,6 +222,8 @@ class TXTMLizer(object):
         # Scene breaks.
         if tag == 'hr':
             text.append('\n\n* * *\n\n')
+        elif style['margin-top']:
+            text.append('\n\n' + '\n' * round(style['margin-top']))
 
         # Process tags that contain text.
         if hasattr(elem, 'text') and elem.text:

From 31c277880e6fce5b2d99e8fdfdede943804b6917 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 31 Jan 2011 09:39:28 +0800
Subject: [PATCH 03/11] scene break detection to detect any repeating non-word
 character

---
 src/calibre/ebooks/conversion/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index b37cd4b869..d0dc81405b 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -580,10 +580,10 @@ class HeuristicProcessor(object):
             if blanks_count >= 1:
                 html = self.merge_blanks(html, blanks_count)
             # Center separator lines, use a bit larger margin in this case
-            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:.65em; margin-bottom:.65em; page-break-before:avoid">' + '\g<break>' + '</p>', html)
+            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?![\w\'\"])(?P<break>((?P<breakchar>(?!\s)\W)\s*(?P=breakchar)?)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:.65em; margin-bottom:.65em; page-break-before:avoid">' + '\g<break>' + '</p>', html)
             #html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
 
         if self.deleted_nbsps:
             # put back non-breaking spaces in empty paragraphs so they render correctly
             html = self.anyblank.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
-        return html
+        return html
\ No newline at end of file

From a96c73480d6a014e0b446c5003d773c8c48bb022 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Mon, 31 Jan 2011 16:19:47 +0800
Subject: [PATCH 04/11] fixed overmatching/substitution issue in italicize
 function

---
 src/calibre/ebooks/conversion/utils.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index d0dc81405b..74afbe7a42 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -159,7 +159,7 @@ class HeuristicProcessor(object):
         ]
 
         for word in ITALICIZE_WORDS:
-            html = re.sub(r'(?<=\s|>)' + word + r'(?=\s|<)', '<i>%s</i>' % word, html)
+            html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)', '<i>%s</i>' % word, html)
 
         for pat in ITALICIZE_STYLE_PATS:
             html = re.sub(pat, lambda mo: '<i>%s</i>' % mo.group('words'), html)
@@ -375,8 +375,8 @@ class HeuristicProcessor(object):
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Delete microsoft 'smart' tags
         html = re.sub('(?i)</?st1:\w+>', '', html)
-        # Delete self closing paragraph tags
-        html = re.sub('<p\s?/>', '', html)
+        # Re-open self closing paragraph tags
+        html = re.sub('<p[^>/]*/>', '<p> </p>', html)
         # Get rid of empty span, bold, font, em, & italics tags
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
@@ -463,7 +463,6 @@ class HeuristicProcessor(object):
 
     def __call__(self, html):
         self.log.debug("*********  Heuristic processing HTML  *********")
-
         # Count the words in the document to estimate how many chapters to look for and whether
         # other types of processing are attempted
         try:
@@ -477,7 +476,7 @@ class HeuristicProcessor(object):
 
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = self.arrange_htm_line_endings(html)
-
+        self.dump(html, 'after_arrange_line_endings')
         if self.cleanup_required():
             ###### Check Markup ######
             #
@@ -580,7 +579,9 @@ class HeuristicProcessor(object):
             if blanks_count >= 1:
                 html = self.merge_blanks(html, blanks_count)
             # Center separator lines, use a bit larger margin in this case
-            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?![\w\'\"])(?P<break>((?P<breakchar>(?!\s)\W)\s*(?P=breakchar)?)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:.65em; margin-bottom:.65em; page-break-before:avoid">' + '\g<break>' + '</p>', html)
+            scene_break = re.compile(r'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?![\w\'\"])(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', re.IGNORECASE|re.UNICODE)
+            print "found "+str(len(scene_break.findall(html)))+" scene breaks"
+            html = scene_break.sub('<p class="scenebreak" style="text-align:center; margin-top:.65em; margin-bottom:.65em; page-break-before:avoid">' + '\g<break>' + '</p>', html)
             #html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
 
         if self.deleted_nbsps:

From 5596f506a7a511eea83f3dad86e93ac87fb9f757 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 1 Feb 2011 01:51:22 +0800
Subject: [PATCH 05/11] improved scene break/whitespace formatting

---
 src/calibre/ebooks/conversion/utils.py | 60 +++++++++++++++++++-------
 1 file changed, 44 insertions(+), 16 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 74afbe7a42..77086efd97 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -30,6 +30,9 @@ class HeuristicProcessor(object):
         self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}(?!\s*<h\d)', re.IGNORECASE)
         self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
+        self.line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        self.line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
+        self.single_blank = re.compile(r'(\s*<p[^>]*>\s*</p>)', re.IGNORECASE)
 
     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
@@ -188,19 +191,17 @@ class HeuristicProcessor(object):
 
         # Build the Regular Expressions in pieces
         init_lookahead = "(?=<(p|div))"
-        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        chapter_line_open = self.line_open
         title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
         chapter_header_open = r"(?P<chap>"
         title_header_open = r"(?P<title>"
         chapter_header_close = ")\s*"
         title_header_close = ")"
-        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
+        chapter_line_close = self.line_close
         title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
 
         is_pdftohtml = self.is_pdftohtml(html)
         if is_pdftohtml:
-            chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
-            chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
             title_line_open = "<(?P<outer2>p)[^>]*>\s*"
             title_line_close = "\s*</(?P=outer2)>"
 
@@ -382,6 +383,8 @@ class HeuristicProcessor(object):
         html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
+        # Empty heading tags
+        html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
         self.deleted_nbsps = True
         return html
 
@@ -421,13 +424,12 @@ class HeuristicProcessor(object):
         return False
         
     def merge_blanks(self, html, blanks_count=None):
-        single_blank = re.compile(r'(\s*<p[^>]*>\s*</p>)', re.IGNORECASE)
         base_em = .5 # Baseline is 1.5em per blank line, 1st line is .5 em css and 1em for the nbsp
         em_per_line = 1.5 # Add another 1.5 em for each additional blank
         
         def merge_matches(match):
             to_merge = match.group(0)
-            lines = float(len(single_blank.findall(to_merge))) - 1.
+            lines = float(len(self.single_blank.findall(to_merge))) - 1.
             em = base_em + (em_per_line * lines)
             if to_merge.find('whitespace'):
                 newline = self.any_multi_blank.sub('\n<p class="whitespace'+str(int(em * 10))+'" style="text-align:center; margin-top:'+str(em)+'em"> </p>', match.group(0))
@@ -439,17 +441,37 @@ class HeuristicProcessor(object):
         return html
 
     def detect_whitespace(self, html):
-        blanks_before_headings = re.compile(r'(\s*<p[^>]*>\s*</p>){1,}(?=\s*<h\d)', re.IGNORECASE)
-        blanks_after_headings = re.compile(r'(?<=</h\d>)(\s*<p[^>]*>\s*</p>){1,}', re.IGNORECASE)
+        blanks_around_headings = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?(?P<heading><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE)                                     
+        blanks_n_nopunct = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE)
         
+        def merge_header_whitespace(match):
+            # Replace the blank paragraphs around a heading with margins on
+            # the heading tag: one em per blank paragraph on that side.
+            initblanks = match.group('initparas')
+            endblanks = match.group('endparas')
+            heading = match.group('heading')
+            if initblanks is None and endblanks is None:
+                return heading
+            top_margin = ''
+            bottom_margin = ''
+            if initblanks is not None:
+                top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;'
+            if endblanks is not None:
+                bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(endblanks)))+'em;'
+            # The rebuilt opening tag carries only the style attribute.
+            return re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '<h'+'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', heading)
+
+        html = blanks_around_headings.sub(merge_header_whitespace, html)
+
         def markup_whitespaces(match):
-           blanks = match.group(0)
-           blanks = self.blankreg.sub('\n<p class="whitespace" style="text-align:center; margin-top:.5em"> </p>', blanks)
-           return blanks
-        html = blanks_before_headings.sub(markup_whitespaces, html)
-        html = blanks_after_headings.sub(markup_whitespaces, html)
+            blanks = match.group(0)
+            blanks = self.blankreg.sub('\n<p class="whitespace" style="text-align:center; margin-top:.5em"> </p>', blanks)
+            return blanks
+
+        html = blanks_n_nopunct.sub(markup_whitespaces, html)
         if self.html_preprocess_sections > self.min_chapters:
             html = re.sub('(?si)^.*?(?=<h\d)', markup_whitespaces, html)
+
         return html
 
     def detect_soft_breaks(self, html):
@@ -496,6 +518,11 @@ class HeuristicProcessor(object):
             # fix indents must run before this step, as it removes non-breaking spaces
             html = self.cleanup_markup(html)
 
+        is_pdftohtml = self.is_pdftohtml(html)
+        if is_pdftohtml:
+            self.line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
+            self.line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
+
         # ADE doesn't render <br />, change to empty paragraphs
         #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
 
@@ -558,7 +585,7 @@ class HeuristicProcessor(object):
         if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
             self.log.debug("Looking for more split points based on punctuation,"
                     " currently have " + unicode(self.html_preprocess_sections))
-            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
+            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
             html = chapdetect3.sub(self.chapter_break, html)
 
         if getattr(self.extra_opts, 'renumber_headings', False):
@@ -579,9 +606,10 @@ class HeuristicProcessor(object):
             if blanks_count >= 1:
                 html = self.merge_blanks(html, blanks_count)
             # Center separator lines, use a bit larger margin in this case
-            scene_break = re.compile(r'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?![\w\'\"])(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', re.IGNORECASE|re.UNICODE)
+            scene_break_regex = self.line_open+'(?![\w\'\"])(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
+            scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
             print "found "+str(len(scene_break.findall(html)))+" scene breaks"
-            html = scene_break.sub('<p class="scenebreak" style="text-align:center; margin-top:.65em; margin-bottom:.65em; page-break-before:avoid">' + '\g<break>' + '</p>', html)
+            html = scene_break.sub('<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:.65em; margin-bottom:.65em; page-break-before:avoid">' + '\g<break>' + '</p>', html)
             #html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
 
         if self.deleted_nbsps:

From 241ef0b6e1135c33e2b61c6a75d4a65a6eaeb5a4 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Mon, 31 Jan 2011 19:57:43 -0500
Subject: [PATCH 06/11] Heuristics: Add replace soft scene break option.

---
 src/calibre/ebooks/conversion/cli.py     |  3 +-
 src/calibre/ebooks/conversion/plumber.py |  4 +++
 src/calibre/gui2/convert/heuristics.py   | 45 +++++++++++++++++++++++-
 src/calibre/gui2/convert/heuristics.ui   | 36 +++++++++++++++++++
 4 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index 33ae61f16a..25179d48a7 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -45,7 +45,8 @@ For full documentation of the conversion system see
 HEURISTIC_OPTIONS = ['markup_chapter_headings',
                       'italicize_common_cases', 'fix_indents',
                       'html_unwrap_factor', 'unwrap_lines',
-                      'delete_blank_paragraphs', 'format_scene_breaks',
+                      'delete_blank_paragraphs',
+                      'format_scene_breaks', 'replace_soft_scene_breaks',
                       'dehyphenate', 'renumber_headings']
 
 def print_help(parser, log):
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 5807ba5f8f..2c37053759 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -530,6 +530,10 @@ OptionRecommendation(name='format_scene_breaks',
     help=_('Left aligned scene break markers are center aligned. '
            'Replace soft scene breaks that use multiple blank lines with'
            'horizontal rules.')),
+           
+OptionRecommendation(name='replace_soft_scene_breaks',
+    recommended_value='', level=OptionRecommendation.LOW,
+    help=_('Replace soft scene breaks with the specified text.')),
 
 OptionRecommendation(name='dehyphenate',
     recommended_value=True, level=OptionRecommendation.LOW,
diff --git a/src/calibre/gui2/convert/heuristics.py b/src/calibre/gui2/convert/heuristics.py
index e788888257..73b4622246 100644
--- a/src/calibre/gui2/convert/heuristics.py
+++ b/src/calibre/gui2/convert/heuristics.py
@@ -6,6 +6,7 @@ __docformat__ = 'restructuredtext en'
 
 from PyQt4.Qt import Qt
 
+from calibre.gui2 import gprefs
 from calibre.gui2.convert.heuristics_ui import Ui_Form
 from calibre.gui2.convert import Widget
 
@@ -21,17 +22,35 @@ class HeuristicsWidget(Widget, Ui_Form):
                 ['enable_heuristics', 'markup_chapter_headings',
                  'italicize_common_cases', 'fix_indents',
                  'html_unwrap_factor', 'unwrap_lines',
-                 'delete_blank_paragraphs', 'format_scene_breaks',
+                 'delete_blank_paragraphs',
+                 'format_scene_breaks', 'replace_soft_scene_breaks',
                  'dehyphenate', 'renumber_headings']
                 )
         self.db, self.book_id = db, book_id
+        self.rssb_defaults = ['', '<hr />', '* * *']
         self.initialize_options(get_option, get_help, db, book_id)
 
+        self.load_histories()
+
         self.opt_enable_heuristics.stateChanged.connect(self.enable_heuristics)
         self.opt_unwrap_lines.stateChanged.connect(self.enable_unwrap)
 
         self.enable_heuristics(self.opt_enable_heuristics.checkState())
 
+    def restore_defaults(self, get_option):
+        Widget.restore_defaults(self, get_option)
+        
+        rssb_hist = gprefs['replace_soft_scene_breaks_history']
+        for x in self.rssb_defaults:
+            if x in rssb_hist:
+                del rssb_hist[rssb_hist.index(x)]
+        gprefs['replace_soft_scene_breaks_history'] = self.rssb_defaults + gprefs['replace_soft_scene_breaks_history']
+
+    def commit_options(self, save_defaults=False):
+        Widget.commit_options(self, save_defaults)
+        
+        self.save_histories()
+
     def break_cycles(self):
         Widget.break_cycles(self)
 
@@ -45,6 +64,30 @@ class HeuristicsWidget(Widget, Ui_Form):
         if val is None and g is self.opt_html_unwrap_factor:
             g.setValue(0.0)
             return True
+        if not val and g is self.opt_replace_soft_scene_breaks:
+            g.lineEdit().setText('')
+            return True
+
+    def load_histories(self):
+        val = unicode(self.opt_replace_soft_scene_breaks.currentText())
+        rssb_hist = gprefs.get('replace_soft_scene_breaks_history', self.rssb_defaults)
+        if val in rssb_hist:
+            del rssb_hist[rssb_hist.index(val)]
+        rssb_hist.insert(0, val)
+        for v in rssb_hist:
+            # Ensure we don't have duplicate items.
+            if self.opt_replace_soft_scene_breaks.findText(v) == -1:
+                self.opt_replace_soft_scene_breaks.addItem(v)
+        self.opt_replace_soft_scene_breaks.setCurrentIndex(0)
+
+    def save_histories(self):
+        rssb_history = []
+        history_pats = [unicode(self.opt_replace_soft_scene_breaks.lineEdit().text())] + [unicode(self.opt_replace_soft_scene_breaks.itemText(i)) for i in xrange(self.opt_replace_soft_scene_breaks.count())]
+        for p in history_pats[:10]:
+            # Ensure we don't have duplicate items.
+            if p not in rssb_history:
+                rssb_history.append(p)
+        gprefs['replace_soft_scene_breaks_history'] = rssb_history
 
     def enable_heuristics(self, state):
         state = state == Qt.Checked
diff --git a/src/calibre/gui2/convert/heuristics.ui b/src/calibre/gui2/convert/heuristics.ui
index 6863fcf8e6..c047957d4d 100644
--- a/src/calibre/gui2/convert/heuristics.ui
+++ b/src/calibre/gui2/convert/heuristics.ui
@@ -150,6 +150,42 @@
         </property>
        </widget>
       </item>
+      <item>
+       <layout class="QHBoxLayout" name="horizontalLayout_2">
+        <property name="sizeConstraint">
+         <enum>QLayout::SetDefaultConstraint</enum>
+        </property>
+        <item>
+         <widget class="QLabel" name="label_2">
+          <property name="sizePolicy">
+           <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+            <horstretch>0</horstretch>
+            <verstretch>0</verstretch>
+           </sizepolicy>
+          </property>
+          <property name="text">
+           <string>Replace soft scene breaks:</string>
+          </property>
+         </widget>
+        </item>
+        <item>
+         <widget class="QComboBox" name="opt_replace_soft_scene_breaks">
+          <property name="sizePolicy">
+           <sizepolicy hsizetype="Expanding" vsizetype="Fixed">
+            <horstretch>0</horstretch>
+            <verstretch>0</verstretch>
+           </sizepolicy>
+          </property>
+          <property name="editable">
+           <bool>true</bool>
+          </property>
+          <property name="insertPolicy">
+           <enum>QComboBox::InsertAtTop</enum>
+          </property>
+         </widget>
+        </item>
+       </layout>
+      </item>
       <item>
        <widget class="QCheckBox" name="opt_dehyphenate">
         <property name="text">

From 971e3150f9aaf86f7b253d6d88534e5e0256dc57 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 1 Feb 2011 13:17:58 +0800
Subject: [PATCH 07/11] ...

---
 src/calibre/ebooks/conversion/utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 77086efd97..1263372ce3 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -451,21 +451,21 @@ class HeuristicProcessor(object):
             top_margin = ''
             bottom_margin = ''
             if initblanks is not None:
-                top_margin = 'margin=top:'+str(len(self.single_blank.findall(initblanks)))+'em;'
+                top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;'
             if endblanks is not None:
-                bottom_margin = 'margin=top:'+str(len(self.single_blank.findall(initblanks)))+'em;'
+                bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(initblanks)))+'em;'
 
             if initblanks == None and endblanks == None:
                 return heading
             else:
-                heading = re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '<h'+'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', heading)
+                heading = re.sub('(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n<h'+'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', heading)
             return heading
 
         html = blanks_around_headings.sub(merge_header_whitespace, html)
 
         def markup_whitespaces(match):
             blanks = match.group(0)
-            blanks = self.blankreg.sub('\n<p class="whitespace" style="text-align:center; margin-top:.5em"> </p>', blanks)
+            blanks = self.blankreg.sub('\n<p class="whitespace" style="text-align:center; margin-top:0em; margin-bottom:0em"> </p>', blanks)
             return blanks
 
         html = blanks_n_nopunct.sub(markup_whitespaces, html)

From d75e17e6b44e8ae688ade08bd30ae552ab0c48c3 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 1 Feb 2011 18:07:37 +0800
Subject: [PATCH 08/11] added scene break replacement logic

---
 src/calibre/ebooks/conversion/cli.py     |  2 +-
 src/calibre/ebooks/conversion/plumber.py |  4 +++
 src/calibre/ebooks/conversion/utils.py   | 33 ++++++++++++++++++++----
 3 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index 33ae61f16a..278d599378 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -143,7 +143,7 @@ def add_pipeline_options(parser, plumber):
                      ' patterns. Disabled by default. Use %s to enable. '
                      ' Individual actions can be disabled with the %s options.')
                   % ('--enable-heuristics', '--disable-*'),
-                  ['enable_heuristics'] + HEURISTIC_OPTIONS
+                  ['enable_heuristics', 'replace_scene_breaks'] + HEURISTIC_OPTIONS
                   ),
 
               'SEARCH AND REPLACE' : (
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 5807ba5f8f..59d7a0ed2a 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -530,6 +530,10 @@ OptionRecommendation(name='format_scene_breaks',
     help=_('Left aligned scene break markers are center aligned. '
            'Replace soft scene breaks that use multiple blank lines with'
            'horizontal rules.')),
+           
+OptionRecommendation(name='replace_scene_breaks',
+    recommended_value=None, level=OptionRecommendation.LOW,
+    help=_('Replace scene breaks with the specified text.')),
 
 OptionRecommendation(name='dehyphenate',
     recommended_value=True, level=OptionRecommendation.LOW,
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 1263372ce3..cf305f1022 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -33,6 +33,7 @@ class HeuristicProcessor(object):
         self.line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
         self.line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
         self.single_blank = re.compile(r'(\s*<p[^>]*>\s*</p>)', re.IGNORECASE)
+        self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
 
     def is_pdftohtml(self, src):
         return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
@@ -481,6 +482,22 @@ class HeuristicProcessor(object):
             html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
         return html
 
+    def markup_user_break(self, replacement_break):
+        hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em">'
+        if re.findall('(<|>)', replacement_break):
+            if re.match('^<hr', replacement_break):
+                scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
+            elif re.match('^<img', replacement_break):
+                scene_break = self.scene_break_open+replacement_break+'</p>'
+            else:
+                replacement_break = html2text(replacement_break)
+                replacement_break = re.sub('\s', '&nbsp;', replacement_break)
+                scene_break = self.scene_break_open+replacement_break+'</p>'
+        else:
+            replacement_break = re.sub('\s', '&nbsp;', replacement_break)
+            scene_break = self.scene_break_open+replacement_break+'</p>'
+
+        return scene_break
 
 
     def __call__(self, html):
@@ -498,7 +515,7 @@ class HeuristicProcessor(object):
 
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
         html = self.arrange_htm_line_endings(html)
-        self.dump(html, 'after_arrange_line_endings')
+        #self.dump(html, 'after_arrange_line_endings')
         if self.cleanup_required():
             ###### Check Markup ######
             #
@@ -534,7 +551,7 @@ class HeuristicProcessor(object):
 
         if getattr(self.extra_opts, 'markup_chapter_headings', False):
             html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)
-        self.dump(html, 'after_chapter_markup')
+        #self.dump(html, 'after_chapter_markup')
 
         if getattr(self.extra_opts, 'italicize_common_cases', False):
             html = self.markup_italicis(html)
@@ -608,9 +625,15 @@ class HeuristicProcessor(object):
             # Center separator lines, use a bit larger margin in this case
             scene_break_regex = self.line_open+'(?![\w\'\"])(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
             scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
-            print "found "+str(len(scene_break.findall(html)))+" scene breaks"
-            html = scene_break.sub('<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:.65em; margin-bottom:.65em; page-break-before:avoid">' + '\g<break>' + '</p>', html)
-            #html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
+            replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
+            if replacement_break is not None:
+                replacement_break = self.markup_user_break(replacement_break)
+                if len(scene_break.findall(html)) >= 1:
+                    html = scene_break.sub(replacement_break, html)
+                else:
+                    html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html) 
+            else:
+                html = scene_break.sub(self.scene_break_open+'\g<break>'+'</p>', html)
 
         if self.deleted_nbsps:
             # put back non-breaking spaces in empty paragraphs so they render correctly

From c4f74eb182eab41b749ddb814791c55dc260f1bd Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Tue, 1 Feb 2011 07:29:01 -0500
Subject: [PATCH 09/11] Heuristics: Rename scene break option. Fix bug
 preventing saving settings.

---
 src/calibre/ebooks/conversion/cli.py     |  5 ++---
 src/calibre/ebooks/conversion/plumber.py |  4 ++--
 src/calibre/gui2/convert/heuristics.py   | 26 ++++++++++++------------
 src/calibre/gui2/convert/heuristics.ui   |  2 +-
 src/calibre/gui2/convert/single.py       |  1 +
 5 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py
index 25179d48a7..b3d2f8cac5 100644
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -45,8 +45,7 @@ For full documentation of the conversion system see
 HEURISTIC_OPTIONS = ['markup_chapter_headings',
                       'italicize_common_cases', 'fix_indents',
                       'html_unwrap_factor', 'unwrap_lines',
-                      'delete_blank_paragraphs',
-                      'format_scene_breaks', 'replace_soft_scene_breaks',
+                      'delete_blank_paragraphs', 'format_scene_breaks',
                       'dehyphenate', 'renumber_headings']
 
 def print_help(parser, log):
@@ -144,7 +143,7 @@ def add_pipeline_options(parser, plumber):
                      ' patterns. Disabled by default. Use %s to enable. '
                      ' Individual actions can be disabled with the %s options.')
                   % ('--enable-heuristics', '--disable-*'),
-                  ['enable_heuristics'] + HEURISTIC_OPTIONS
+                  ['enable_heuristics',  'replace_scene_breaks'] + HEURISTIC_OPTIONS
                   ),
 
               'SEARCH AND REPLACE' : (
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 2c37053759..a4708d398c 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -531,9 +531,9 @@ OptionRecommendation(name='format_scene_breaks',
            'Replace soft scene breaks that use multiple blank lines with'
            'horizontal rules.')),
            
-OptionRecommendation(name='replace_soft_scene_breaks',
+OptionRecommendation(name='replace_scene_breaks',
     recommended_value='', level=OptionRecommendation.LOW,
-    help=_('Replace soft scene breaks with the specified text.')),
+    help=_('Replace scene breaks with the specified text.')),
 
 OptionRecommendation(name='dehyphenate',
     recommended_value=True, level=OptionRecommendation.LOW,
diff --git a/src/calibre/gui2/convert/heuristics.py b/src/calibre/gui2/convert/heuristics.py
index 73b4622246..8ca4cab455 100644
--- a/src/calibre/gui2/convert/heuristics.py
+++ b/src/calibre/gui2/convert/heuristics.py
@@ -23,7 +23,7 @@ class HeuristicsWidget(Widget, Ui_Form):
                  'italicize_common_cases', 'fix_indents',
                  'html_unwrap_factor', 'unwrap_lines',
                  'delete_blank_paragraphs',
-                 'format_scene_breaks', 'replace_soft_scene_breaks',
+                 'format_scene_breaks', 'replace_scene_breaks',
                  'dehyphenate', 'renumber_headings']
                 )
         self.db, self.book_id = db, book_id
@@ -40,16 +40,16 @@ class HeuristicsWidget(Widget, Ui_Form):
     def restore_defaults(self, get_option):
         Widget.restore_defaults(self, get_option)
         
-        rssb_hist = gprefs['replace_soft_scene_breaks_history']
+        rssb_hist = gprefs['replace_scene_breaks_history']
         for x in self.rssb_defaults:
             if x in rssb_hist:
                 del rssb_hist[rssb_hist.index(x)]
-        gprefs['replace_soft_scene_breaks_history'] = self.rssb_defaults + gprefs['replace_soft_scene_breaks_history']
+        gprefs['replace_scene_breaks_history'] = self.rssb_defaults + gprefs['replace_scene_breaks_history']
 
     def commit_options(self, save_defaults=False):
-        Widget.commit_options(self, save_defaults)
-        
         self.save_histories()
+        
+        return Widget.commit_options(self, save_defaults)
 
     def break_cycles(self):
         Widget.break_cycles(self)
@@ -64,30 +64,30 @@ class HeuristicsWidget(Widget, Ui_Form):
         if val is None and g is self.opt_html_unwrap_factor:
             g.setValue(0.0)
             return True
-        if not val and g is self.opt_replace_soft_scene_breaks:
+        if not val and g is self.opt_replace_scene_breaks:
             g.lineEdit().setText('')
             return True
 
     def load_histories(self):
-        val = unicode(self.opt_replace_soft_scene_breaks.currentText())
-        rssb_hist = gprefs.get('replace_soft_scene_breaks_history', self.rssb_defaults)
+        val = unicode(self.opt_replace_scene_breaks.currentText())
+        rssb_hist = gprefs.get('replace_scene_breaks_history', self.rssb_defaults)
         if val in rssb_hist:
             del rssb_hist[rssb_hist.index(val)]
         rssb_hist.insert(0, val)
         for v in rssb_hist:
             # Ensure we don't have duplicate items.
-            if self.opt_replace_soft_scene_breaks.findText(v) == -1:
-                self.opt_replace_soft_scene_breaks.addItem(v)
-        self.opt_replace_soft_scene_breaks.setCurrentIndex(0)
+            if self.opt_replace_scene_breaks.findText(v) == -1:
+                self.opt_replace_scene_breaks.addItem(v)
+        self.opt_replace_scene_breaks.setCurrentIndex(0)
 
     def save_histories(self):
         rssb_history = []
-        history_pats = [unicode(self.opt_replace_soft_scene_breaks.lineEdit().text())] + [unicode(self.opt_replace_soft_scene_breaks.itemText(i)) for i in xrange(self.opt_replace_soft_scene_breaks.count())]
+        history_pats = [unicode(self.opt_replace_scene_breaks.lineEdit().text())] + [unicode(self.opt_replace_scene_breaks.itemText(i)) for i in xrange(self.opt_replace_scene_breaks.count())]
         for p in history_pats[:10]:
             # Ensure we don't have duplicate items.
             if p not in rssb_history:
                 rssb_history.append(p)
-        gprefs['replace_soft_scene_breaks_history'] = rssb_history
+        gprefs['replace_scene_breaks_history'] = rssb_history
 
     def enable_heuristics(self, state):
         state = state == Qt.Checked
diff --git a/src/calibre/gui2/convert/heuristics.ui b/src/calibre/gui2/convert/heuristics.ui
index c047957d4d..4f7cf5ea6e 100644
--- a/src/calibre/gui2/convert/heuristics.ui
+++ b/src/calibre/gui2/convert/heuristics.ui
@@ -169,7 +169,7 @@
          </widget>
         </item>
         <item>
-         <widget class="QComboBox" name="opt_replace_soft_scene_breaks">
+         <widget class="QComboBox" name="opt_replace_scene_breaks">
           <property name="sizePolicy">
            <sizepolicy hsizetype="Expanding" vsizetype="Fixed">
             <horstretch>0</horstretch>
diff --git a/src/calibre/gui2/convert/single.py b/src/calibre/gui2/convert/single.py
index 59fcbb65ad..6540383229 100644
--- a/src/calibre/gui2/convert/single.py
+++ b/src/calibre/gui2/convert/single.py
@@ -258,6 +258,7 @@ class Config(ResizableDialog, Ui_Dialog):
             if not w.pre_commit_check():
                 return
             x = w.commit(save_defaults=False)
+            print x
             recs.update(x)
         self.opf_file, self.cover_file = self.mw.opf_file, self.mw.cover_file
         self._recommendations = recs

From 48f202c7fd875bb4ccabeed6d7078e56607da142 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 1 Feb 2011 21:21:36 +0800
Subject: [PATCH 10/11] allow user applied styles to <hr> tags, updated
 comments/docs

---
 src/calibre/ebooks/conversion/utils.py | 19 +++++++++++++++++--
 src/calibre/manual/conversion.rst      | 13 +++++++++----
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index cf305f1022..21c6063f63 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -483,10 +483,23 @@ class HeuristicProcessor(object):
         return html
 
     def markup_user_break(self, replacement_break):
+        '''
+        Takes the string a user supplies and wraps it in markup that will be centered with
+        appropriate margins.  <hr> and <img> tags are allowed.  If the user specifies
+        a style with a width attribute in the <hr> tag then the appropriate margins are
+        applied to a wrapping div.  This is because many ebook devices don't support margin:auto.
+        All other html is converted to text.
+        '''
         hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em">'
         if re.findall('(<|>)', replacement_break):
             if re.match('^<hr', replacement_break):
-                scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
+                if replacement_break.find('width') != -1:
+                   width = int(re.sub('.*?width(:|=)(?P<wnum>\d+).*', '\g<wnum>', replacement_break))
+                   divpercent = (100 - width) / 2
+                   hr_open = re.sub('45', str(divpercent), hr_open)
+                   scene_break = hr_open+replacement_break+'</div>'
+                else:
+                    scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
             elif re.match('^<img', replacement_break):
                 scene_break = self.scene_break_open+replacement_break+'</p>'
             else:
@@ -622,9 +635,11 @@ class HeuristicProcessor(object):
             blanks_count = len(self.any_multi_blank.findall(html))
             if blanks_count >= 1:
                 html = self.merge_blanks(html, blanks_count)
-            # Center separator lines, use a bit larger margin in this case
             scene_break_regex = self.line_open+'(?![\w\'\"])(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
             scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
+            # If the user has enabled scene break replacement, then either softbreaks
+            # or 'hard' scene breaks are replaced, depending on which is in use.
+            # Otherwise separator lines are centered, with a slightly larger margin.
             replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
             if replacement_break is not None:
                 replacement_break = self.markup_user_break(replacement_break)
diff --git a/src/calibre/manual/conversion.rst b/src/calibre/manual/conversion.rst
index 7f3ff21fe0..ecd8609ecc 100644
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@@ -311,10 +311,15 @@ remove all non-breaking-space entities, or may include false positive matches re
 
 :guilabel:`Ensure scene breaks are consistently formatted`
     With this option |app| will attempt to detect common scene-break markers and ensure that they are center aligned.  
-    It also attempts to detect scene breaks defined by white space and replace them with a horizontal rule 15% of the
-    page width.  Some readers may find this desirable as these 'soft' scene breaks often become page breaks on readers, and 
-    thus become difficult to distinguish.
+    'Soft' scene break markers, i.e. scene breaks only defined by extra white space, are styled to ensure that they 
+    will not be displayed in conjunction with page breaks.
 
+:guilabel:`Replace scene breaks`
+    If this option is configured then |app| will replace scene break markers it finds with the replacement text specified by the
+    user. In general you should avoid using html tags; |app| will discard any tags and use pre-defined markup.  <hr />
+    tags, i.e. horizontal rules, are an exception.  These can optionally be specified with styles. If you choose to add your own
+    style, be sure to include the 'width' setting; otherwise the style information will be discarded.
+ 
 :guilabel:`Remove unnecessary hyphens`
     |app| will analyze all hyphenated content in the document when this option is enabled.  The document itself is used
     as a dictionary for analysis.  This allows |app| to accurately remove hyphens for any words in the document in any language, 
@@ -628,7 +633,7 @@ between 0 and 1. The default is 0.45, just under the median line length. Lower t
 text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under :guilabel:`PDF Input`.
 
 Also, they often have headers and footers as part of the document that will become included with the text.
-Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not
+Use the Search and Replace panel to remove headers and footers to mitigate this issue. If the headers and footers are not
 removed from the text it can throw off the paragraph unwrapping. To learn how to use the header and footer removal options, read 
 :ref:`regexptutorial`.
 

From 72fe944b95bd3a6066b43b77a5b0ba9abb1685e8 Mon Sep 17 00:00:00 2001
From: ldolse <ldolse@yahoo.com>
Date: Tue, 1 Feb 2011 22:05:54 +0800
Subject: [PATCH 11/11] ...

---
 src/calibre/gui2/convert/single.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/calibre/gui2/convert/single.py b/src/calibre/gui2/convert/single.py
index 6540383229..59fcbb65ad 100644
--- a/src/calibre/gui2/convert/single.py
+++ b/src/calibre/gui2/convert/single.py
@@ -258,7 +258,6 @@ class Config(ResizableDialog, Ui_Dialog):
             if not w.pre_commit_check():
                 return
             x = w.commit(save_defaults=False)
-            print x
             recs.update(x)
         self.opf_file, self.cover_file = self.mw.opf_file, self.mw.cover_file
         self._recommendations = recs