Heuristics: Improve scene break detection and add an option to control what scene breaks are replaced with.

2025-08-30 23:00:21 -04:00 · 2011-02-01 17:36:54 -07:00 · 2011-02-01 17:36:54 -07:00 · 1fe6530b8f
commit 1fe6530b8f
parent 3e895675e2 72fe944b95
6 changed files with 223 additions and 35 deletions
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@ -46,7 +46,8 @@ HEURISTIC_OPTIONS = ['markup_chapter_headings',
                      'italicize_common_cases', 'fix_indents',
                      'html_unwrap_factor', 'unwrap_lines',
                      'delete_blank_paragraphs', 'format_scene_breaks',
-                      'dehyphenate', 'renumber_headings']
+                      'dehyphenate', 'renumber_headings',
                      'replace_scene_breaks']
 def print_help(parser, log):
    help = parser.format_help().encode(preferred_encoding, 'replace')
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -531,6 +531,11 @@ OptionRecommendation(name='format_scene_breaks',
           'Replace soft scene breaks that use multiple blank lines with'
           'horizontal rules.')),
 OptionRecommendation(name='replace_scene_breaks',
    recommended_value=None, level=OptionRecommendation.LOW,
    help=_('Replace scene breaks with the specified text. By default, the '
        'text from the input document is used.')),
 OptionRecommendation(name='dehyphenate',
    recommended_value=True, level=OptionRecommendation.LOW,
    help=_('Analyze hyphenated words throughout the document.  The '
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -26,9 +26,14 @@ class HeuristicProcessor(object):
        self.blanks_deleted = False
        self.blanks_between_paragraphs = False
        self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
-        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|spacer)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
+        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
        self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
        self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}(?!\s*<h\d)', re.IGNORECASE)
        self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
        self.line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
        self.line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
        self.single_blank = re.compile(r'(\s*<p[^>]*>\s*</p>)', re.IGNORECASE)
        self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
    def is_pdftohtml(self, src):
        '''
        Report whether calibre's pdftohtml marker comment occurs within the
        first 1000 characters of ``src``.
        '''
        marker = '<!-- created by calibre\'s pdftohtml -->'
        head = src[:1000]
        return marker in head
@ -187,19 +192,17 @@ class HeuristicProcessor(object):
        # Build the Regular Expressions in pieces
        init_lookahead = "(?=<(p|div))"
-        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        chapter_line_open = self.line_open
        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
        chapter_header_open = r"(?P<chap>"
        title_header_open = r"(?P<title>"
        chapter_header_close = ")\s*"
        title_header_close = ")"
-        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
+        chapter_line_close = self.line_close
        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)>)?\s*</(?P=outer2)>"
        is_pdftohtml = self.is_pdftohtml(html)
        if is_pdftohtml:
            chapter_line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
            chapter_line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
            title_line_open = "<(?P<outer2>p)[^>]*>\s*"
            title_line_close = "\s*</(?P=outer2)>"
@ -374,13 +377,15 @@ class HeuristicProcessor(object):
        html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
        # Delete microsoft 'smart' tags
        html = re.sub('(?i)</?st1:\w+>', '', html)
-        # Delete self closing paragraph tags
+        # Re-open self closing paragraph tags
-        html = re.sub('<p\s?/>', '', html)
+        html = re.sub('<p[^>/]*/>', '<p> </p>', html)
        # Get rid of empty span, bold, font, em, & italics tags
        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
        html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
        html = re.sub(r"\s*<(font|[ibu]|em|strong)[^>]*>\s*(<(font|[ibu]|em|strong)[^>]*>\s*</(font|[ibu]|em|strong)>\s*){0,2}\s*</(font|[ibu]|em|strong)>", " ", html)
        # Empty heading tags
        html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
        self.deleted_nbsps = True
        return html
@ -419,32 +424,98 @@ class HeuristicProcessor(object):
                return True
        return False
-    def detect_blank_formatting(self, html):
+    def merge_blanks(self, html, blanks_count=None):
-        blanks_before_headings = re.compile(r'(\s*<p[^>]*>\s*</p>){1,}(?=\s*<h\d)', re.IGNORECASE)
+        base_em = .5 # Baseline is 1.5em per blank line, 1st line is .5 em css and 1em for the nbsp
-        blanks_after_headings = re.compile(r'(?<=</h\d>)(\s*<p[^>]*>\s*</p>){1,}', re.IGNORECASE)
+        em_per_line = 1.5 # Add another 1.5 em for each additional blank
-        def markup_spacers(match):
+        def merge_matches(match):
            to_merge = match.group(0)
            lines = float(len(self.single_blank.findall(to_merge))) - 1.
            em = base_em + (em_per_line * lines)
            if to_merge.find('whitespace'):
                newline = self.any_multi_blank.sub('\n<p class="whitespace'+str(int(em * 10))+'" style="text-align:center; margin-top:'+str(em)+'em"> </p>', match.group(0))
            else:
                newline = self.any_multi_blank.sub('\n<p class="softbreak'+str(int(em * 10))+'" style="text-align:center; margin-top:'+str(em)+'em"> </p>', match.group(0))
            return newline
        html = self.any_multi_blank.sub(merge_matches, html)
        return html
    def detect_whitespace(self, html):
        blanks_around_headings = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?(?P<heading><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE)
        blanks_n_nopunct = re.compile(r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE)
        def merge_header_whitespace(match):
            # Replace the blank paragraphs captured before and after a heading
            # with margin-top / margin-bottom on the heading itself, one em per
            # blank paragraph.  Returns the (possibly restyled) heading markup.
            initblanks = match.group('initparas')
            # The trailing blanks live in their own group; reading 'initparas'
            # here made the bottom margin mirror the top one and ignored the
            # blanks that actually follow the heading.
            endblanks = match.group('endparas')
            heading = match.group('heading')
            if initblanks is None and endblanks is None:
                # Nothing surrounded the heading, leave it untouched
                return heading
            top_margin = ''
            bottom_margin = ''
            if initblanks is not None:
                top_margin = 'margin-top:'+str(len(self.single_blank.findall(initblanks)))+'em;'
            if endblanks is not None:
                bottom_margin = 'margin-bottom:'+str(len(self.single_blank.findall(endblanks)))+'em;'
            # Note: the opening tag is rebuilt, so any attributes it carried are dropped
            heading = re.sub(r'(?i)<h(?P<hnum>\d+)[^>]*>', '\n\n<h'+r'\g<hnum>'+' style="'+top_margin+bottom_margin+'">', heading)
            return heading
        html = blanks_around_headings.sub(merge_header_whitespace, html)
        def markup_whitespaces(match):
            blanks = match.group(0)
-           blanks = self.blankreg.sub('\n<p class="spacer"> </p>', blanks)
+            blanks = self.blankreg.sub('\n<p class="whitespace" style="text-align:center; margin-top:0em; margin-bottom:0em"> </p>', blanks)
            return blanks
-        html = blanks_before_headings.sub(markup_spacers, html)
+
-        html = blanks_after_headings.sub(markup_spacers, html)
+        html = blanks_n_nopunct.sub(markup_whitespaces, html)
        if self.html_preprocess_sections > self.min_chapters:
-            html = re.sub('(?si)^.*?(?=<h\d)', markup_spacers, html)
+            html = re.sub('(?si)^.*?(?=<h\d)', markup_whitespaces, html)
        return html
    def detect_soft_breaks(self, html):
        if not self.blanks_deleted and self.blanks_between_paragraphs:
-            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid"> </p>', html)
+            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1em; page-break-before:avoid; text-align:center"> </p>', html)
        else:
-            html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid"> </p>', html)
+            html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
        return html
    def markup_user_break(self, replacement_break):
        '''
        Takes the string a user supplies and wraps it in markup that will be
        centered with appropriate margins.  <hr> and <img> tags are allowed.
        If the user specifies a numeric width in the <hr> tag then matching
        margins are applied to a wrapping div.  This is because many ebook
        devices don't support margin:auto.  An <hr> without a width, or with a
        width that cannot be read as a number, is replaced by the default rule.
        All other html is converted to text.

        :param replacement_break: text or markup supplied by the user
        :return: markup for a single scene break
        '''
        hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em">'
        if re.findall('(<|>)', replacement_break):
            if re.match('^<hr', replacement_break):
                # Accepts width:50, width=50, width="50" and width: 50%.
                # Searching (instead of int() on a substituted string) means a
                # quoted, spaced or non-numeric width can no longer raise
                # ValueError; it simply falls back to the default rule.
                width_match = re.search(r'width\s*[:=]\s*["\']?(?P<wnum>\d+)', replacement_break)
                if width_match is None:
                    scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
                else:
                    # Cap at 100 so the computed margins never go negative
                    width = min(int(width_match.group('wnum')), 100)
                    # Floor division keeps the margin a whole percentage
                    divpercent = (100 - width) // 2
                    hr_open = hr_open.replace('45', str(divpercent))
                    scene_break = hr_open+replacement_break+'</div>'
            elif re.match('^<img', replacement_break):
                scene_break = self.scene_break_open+replacement_break+'</p>'
            else:
                # Any other markup is flattened to plain text
                from calibre.utils.html2text import html2text
                replacement_break = html2text(replacement_break)
                replacement_break = re.sub(r'\s', '&nbsp;', replacement_break)
                scene_break = self.scene_break_open+replacement_break+'</p>'
        else:
            # Plain text: keep the user's spacing by making it non-breaking
            replacement_break = re.sub(r'\s', '&nbsp;', replacement_break)
            scene_break = self.scene_break_open+replacement_break+'</p>'
        return scene_break
    def __call__(self, html):
        self.log.debug("*********  Heuristic processing HTML  *********")
        # Count the words in the document to estimate how many chapters to look for and whether
        # other types of processing are attempted
        try:
@ -458,7 +529,7 @@ class HeuristicProcessor(object):
        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
        html = self.arrange_htm_line_endings(html)
-
+        #self.dump(html, 'after_arrange_line_endings')
        if self.cleanup_required():
            ###### Check Markup ######
            #
@ -478,6 +549,11 @@ class HeuristicProcessor(object):
            # fix indents must run before this step, as it removes non-breaking spaces
            html = self.cleanup_markup(html)
        is_pdftohtml = self.is_pdftohtml(html)
        if is_pdftohtml:
            self.line_open = "<(?P<outer>p)[^>]*>(\s*<[ibu][^>]*>)?\s*"
            self.line_close = "\s*(</[ibu][^>]*>\s*)?</(?P=outer)>"
        # ADE doesn't render <br />, change to empty paragraphs
        #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
@ -489,6 +565,7 @@ class HeuristicProcessor(object):
        if getattr(self.extra_opts, 'markup_chapter_headings', False):
            html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)
        #self.dump(html, 'after_chapter_markup')
        if getattr(self.extra_opts, 'italicize_common_cases', False):
            html = self.markup_italicis(html)
@ -498,7 +575,7 @@ class HeuristicProcessor(object):
        if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
            self.log.debug("deleting blank lines")
            self.blanks_deleted = True
-            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid"> </p>', html)
+            html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
            html = self.blankreg.sub('', html)
        # Determine line ending type
@ -539,7 +616,7 @@ class HeuristicProcessor(object):
        if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
            self.log.debug("Looking for more split points based on punctuation,"
                    " currently have " + unicode(self.html_preprocess_sections))
-            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
+            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
            html = chapdetect3.sub(self.chapter_break, html)
        if getattr(self.extra_opts, 'renumber_headings', False):
@ -549,14 +626,32 @@ class HeuristicProcessor(object):
            doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
            html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
        # If scene break formatting is enabled, find all blank paragraphs that definitely aren't scene breaks
        # and style them with the 'whitespace' class.  All remaining blank lines are styled as softbreaks.
        # Multiple sequential blank paragraphs are merged with appropriate margins.
        # If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
        if getattr(self.extra_opts, 'format_scene_breaks', False):
-            html = self.detect_blank_formatting(html)
+            html = self.detect_whitespace(html)
            html = self.detect_soft_breaks(html)
-            # Center separator lines
+            blanks_count = len(self.any_multi_blank.findall(html))
-            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em; page-break-before:avoid">' + '\g<break>' + '</p>', html)
+            if blanks_count >= 1:
-            #html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', '<div id="softbreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em"><hr style="height: 3px; background:#505050" /></div>', html)
+                html = self.merge_blanks(html, blanks_count)
            scene_break_regex = self.line_open+'(?![\w\'\"])(?P<break>((?P<break_char>((?!\s)\W))\s*(?P=break_char)?)+)\s*'+self.line_close
            scene_break = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
            # If the user has enabled scene break replacement, then either softbreaks
            # or 'hard' scene breaks are replaced, depending on which is in use
            # Otherwise separator lines are centered, use a bit larger margin in this case
            replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
            if replacement_break is not None:
                replacement_break = self.markup_user_break(replacement_break)
                if len(scene_break.findall(html)) >= 1:
                    html = scene_break.sub(replacement_break, html)
                else:
                    html = re.sub('<p\s+class="softbreak"[^>]*>\s*</p>', replacement_break, html)
            else:
                html = scene_break.sub(self.scene_break_open+'\g<break>'+'</p>', html)
        if self.deleted_nbsps:
-            # put back non-breaking spaces in empty paragraphs to preserve original formatting
+            # put back non-breaking spaces in empty paragraphs so they render correctly
            html = self.anyblank.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
        return html
--- a/src/calibre/gui2/convert/heuristics.py
+++ b/src/calibre/gui2/convert/heuristics.py
@ -6,6 +6,7 @@ __docformat__ = 'restructuredtext en'
 from PyQt4.Qt import Qt
 from calibre.gui2 import gprefs
 from calibre.gui2.convert.heuristics_ui import Ui_Form
 from calibre.gui2.convert import Widget
@ -21,17 +22,35 @@ class HeuristicsWidget(Widget, Ui_Form):
                ['enable_heuristics', 'markup_chapter_headings',
                 'italicize_common_cases', 'fix_indents',
                 'html_unwrap_factor', 'unwrap_lines',
-                 'delete_blank_paragraphs', 'format_scene_breaks',
+                 'delete_blank_paragraphs',
                 'format_scene_breaks', 'replace_scene_breaks',
                 'dehyphenate', 'renumber_headings']
                )
        self.db, self.book_id = db, book_id
        self.rssb_defaults = ['', '<hr />', '* * *']
        self.initialize_options(get_option, get_help, db, book_id)
        self.load_histories()
        self.opt_enable_heuristics.stateChanged.connect(self.enable_heuristics)
        self.opt_unwrap_lines.stateChanged.connect(self.enable_unwrap)
        self.enable_heuristics(self.opt_enable_heuristics.checkState())
    def restore_defaults(self, get_option):
        '''
        Restore the widget's default option values and move the built-in
        scene break replacements back to the front of the saved history.
        '''
        Widget.restore_defaults(self, get_option)
        # Read with a fallback, as load_histories does: the key may not exist
        # yet, and indexing gprefs directly would then fail.
        saved = gprefs.get('replace_scene_breaks_history', [])
        # Build a fresh list instead of deleting from the stored one in place,
        # so the result does not depend on gprefs handing back the same object.
        custom = [x for x in saved if x not in self.rssb_defaults]
        gprefs['replace_scene_breaks_history'] = list(self.rssb_defaults) + custom
    def commit_options(self, save_defaults=False):
        '''
        Save the scene break replacement history, then hand over to
        Widget.commit_options and return its result.
        '''
        self.save_histories()
        committed = Widget.commit_options(self, save_defaults)
        return committed
    def break_cycles(self):
        Widget.break_cycles(self)
@ -45,6 +64,30 @@ class HeuristicsWidget(Widget, Ui_Form):
        if val is None and g is self.opt_html_unwrap_factor:
            g.setValue(0.0)
            return True
        if not val and g is self.opt_replace_scene_breaks:
            g.lineEdit().setText('')
            return True
    def load_histories(self):
        '''
        Fill the scene break replacement combo box from the saved history,
        keeping the value currently shown as the first (selected) entry.
        '''
        val = unicode(self.opt_replace_scene_breaks.currentText())
        # Work on a copy: when no history is stored, gprefs.get() hands back
        # self.rssb_defaults itself, and editing that list in place would
        # silently change the defaults used by restore_defaults().
        rssb_hist = list(gprefs.get('replace_scene_breaks_history', self.rssb_defaults))
        if val in rssb_hist:
            rssb_hist.remove(val)
        rssb_hist.insert(0, val)
        for v in rssb_hist:
            # Ensure we don't have duplicate items.
            if self.opt_replace_scene_breaks.findText(v) == -1:
                self.opt_replace_scene_breaks.addItem(v)
        self.opt_replace_scene_breaks.setCurrentIndex(0)
    def save_histories(self):
        '''
        Store the scene break replacement history in gprefs: the text in the
        edit field first, then the combo box items, limited to the first ten
        candidates with repeats dropped.
        '''
        combo = self.opt_replace_scene_breaks
        candidates = [unicode(combo.lineEdit().text())]
        for i in xrange(combo.count()):
            candidates.append(unicode(combo.itemText(i)))
        unique = []
        for text in candidates[:10]:
            # Ensure we don't have duplicate items.
            if text in unique:
                continue
            unique.append(text)
        gprefs['replace_scene_breaks_history'] = unique
    def enable_heuristics(self, state):
        state = state == Qt.Checked
--- a/src/calibre/gui2/convert/heuristics.ui
+++ b/src/calibre/gui2/convert/heuristics.ui
@ -150,6 +150,45 @@
        </property>
       </widget>
      </item>
      <item>
       <layout class="QHBoxLayout" name="horizontalLayout_2">
        <property name="sizeConstraint">
         <enum>QLayout::SetDefaultConstraint</enum>
        </property>
        <item>
         <widget class="QLabel" name="label_2">
          <property name="sizePolicy">
           <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
            <horstretch>0</horstretch>
            <verstretch>0</verstretch>
           </sizepolicy>
          </property>
          <property name="text">
           <string>Replace soft scene &amp;breaks:</string>
          </property>
          <property name="buddy">
           <cstring>opt_replace_scene_breaks</cstring>
          </property>
         </widget>
        </item>
        <item>
         <widget class="QComboBox" name="opt_replace_scene_breaks">
          <property name="sizePolicy">
           <sizepolicy hsizetype="Expanding" vsizetype="Fixed">
            <horstretch>0</horstretch>
            <verstretch>0</verstretch>
           </sizepolicy>
          </property>
          <property name="editable">
           <bool>true</bool>
          </property>
          <property name="insertPolicy">
           <enum>QComboBox::InsertAtTop</enum>
          </property>
         </widget>
        </item>
       </layout>
      </item>
      <item>
       <widget class="QCheckBox" name="opt_dehyphenate">
        <property name="text">
--- a/src/calibre/manual/conversion.rst
+++ b/src/calibre/manual/conversion.rst
@ -311,9 +311,14 @@ remove all non-breaking-space entities, or may include false positive matches re
 :guilabel:`Ensure scene breaks are consistently formatted`
    With this option |app| will attempt to detect common scene-break markers and ensure that they are center aligned.  
-    It also attempts to detect scene breaks defined by white space and replace them with a horizontal rule 15% of the
+    'Soft' scene break markers, i.e. scene breaks only defined by extra white space, are styled to ensure that they 
-    page width.  Some readers may find this desirable as these 'soft' scene breaks often become page breaks on readers, and 
+    will not be displayed in conjunction with page breaks.
-    thus become difficult to distinguish.
+
 :guilabel:`Replace scene breaks`
    If this option is configured then |app| will replace the scene break markers it finds with the replacement text
    specified by the user. In general you should avoid using HTML tags: |app| will discard any tags and use pre-defined
    markup.  <hr /> tags, i.e. horizontal rules, and <img> tags are exceptions.  Horizontal rules can optionally be
    specified with styles.  If you choose to add your own style, be sure to include the 'width' setting, otherwise
    the style information will be discarded.
 :guilabel:`Remove unnecessary hyphens`
    |app| will analyze all hyphenated content in the document when this option is enabled.  The document itself is used
@ -628,7 +633,7 @@ between 0 and 1. The default is 0.45, just under the median line length. Lower t
 text in the unwrapping. Increase to include less. You can adjust this value in the conversion settings under :guilabel:`PDF Input`.
 Also, they often have headers and footers as part of the document that will become included with the text.
-Use the options to remove headers and footers to mitigate this issue. If the headers and footers are not
+Use the Search and Replace panel to remove headers and footers to mitigate this issue. If the headers and footers are not
removed from the text it can throw off the paragraph unwrapping. To learn how to write regular expressions that remove headers and footers, read
 :ref:`regexptutorial`.