Sync to ldolse heuristics branch.

2025-11-22 06:23:02 -05:00 · 2011-01-15 10:05:36 -05:00 · 2011-01-15 10:05:36 -05:00 · afb72b8a59
commit afb72b8a59
parent cfaa113f95 0edf1e550e
3 changed files with 16 additions and 7 deletions
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@ -137,7 +137,7 @@ def add_pipeline_options(parser, plumber):
                      'italicize_common_cases', 'fix_indents',
                      'html_unwrap_factor', 'unwrap_lines',
                      'delete_blank_paragraphs', 'format_scene_breaks',
-                      'dehyphenate',
+                      'dehyphenate', 'renumber_headings',
                  ]
                  ),
                  
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -532,7 +532,13 @@ OptionRecommendation(name='dehyphenate',
    help=_('Analyses hyphenated words throughout the document.  The '
           'document itself is used as a dictionary to determine whether hyphens '
           'should be retained or removed.')),
-    
+
+OptionRecommendation(name='renumber_headings',
+    recommended_value=False, level=OptionRecommendation.LOW,
+    help=_('Looks for occurences of sequential <h1> or <h2> tags. '
+           'The tags are renumbered to prevent splitting in the middle '
+           'of chapter headings.')),
+
 OptionRecommendation(name='sr1_search',
    recommended_value='', level=OptionRecommendation.LOW,
    help=_('Search pattern (regular expression) to be replaced with '
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -272,9 +272,11 @@ class PreProcessor(object):

        unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
        em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
+        shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)

        content = unwrap.sub(' ', content)
        content = em_en_unwrap.sub('', content)
+        content = shy_unwrap.sub('', content)
        return content

    def txt_process(self, match):
@ -461,11 +463,12 @@ class PreProcessor(object):
            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
            html = chapdetect3.sub(self.chapter_break, html)

-        # search for places where a first or second level heading is immediately followed by another
-        # top level heading.  demote the second heading to h3 to prevent splitting between chapter
-        # headings and titles, images, etc
-        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
-        html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
+        if getattr(self.extra_opts, 'renumber_headings', False):
+            # search for places where a first or second level heading is immediately followed by another
+            # top level heading.  demote the second heading to h3 to prevent splitting between chapter
+            # headings and titles, images, etc
+            doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
+            html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)

        if getattr(self.extra_opts, 'format_scene_breaks', False):
            # Center separator lines