added option for renumbering heading tags

2026-03-06 00:43:42 -05:00 · 2011-01-15 22:07:20 +08:00 · 2011-01-15 22:07:20 +08:00 · 946f1cf6c0
commit 946f1cf6c0
parent 1301fe69d1
3 changed files with 16 additions and 9 deletions
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -137,7 +137,7 @@ def add_pipeline_options(parser, plumber):
                      'italicize_common_cases', 'fix_indents',
                      'html_unwrap_factor', 'unwrap_lines',
                      'delete_blank_paragraphs', 'format_scene_breaks',
-                      'dehyphenate',
+                      'dehyphenate', 'renumber_headings',
                  ]
                  ),
                  
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -532,7 +532,13 @@ OptionRecommendation(name='dehyphenate',
    help=_('Analyses hyphenated words throughout the document.  The '
           'document itself is used as a dictionary to determine whether hyphens '
           'should be retained or removed.')),
-    
+
+OptionRecommendation(name='renumber_headings',
+    recommended_value=False, level=OptionRecommendation.LOW,
+    help=_('Looks for occurences of sequential <h1> or <h2> tags. '
+           'The tags are renumbered to prevent splitting in the middle '
+           'of chapter headings.')),
+
 OptionRecommendation(name='sr1_search',
    recommended_value='', level=OptionRecommendation.LOW,
    help=_('Search pattern (regular expression) to be replaced with '
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -416,7 +416,7 @@ class PreProcessor(object):
                dehyphenator = Dehyphenator()
                html = dehyphenator(html,'html_cleanup', length)

-        if getattr(self.extra_opts, 'dehyphenate', True):
+        if getattr(self.extra_opts, 'dehyphenate', False):
            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
            self.log("Fixing hyphenated content")
            dehyphenator = Dehyphenator()
@@ -429,13 +429,14 @@ class PreProcessor(object):
            chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
            html = chapdetect3.sub(self.chapter_break, html)

-        # search for places where a first or second level heading is immediately followed by another
-        # top level heading.  demote the second heading to h3 to prevent splitting between chapter
-        # headings and titles, images, etc
-        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
-        html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
+        if getattr(self.extra_opts, 'renumber_headings', True):
+            # Search for places where a first- or second-level heading (h1/h2) is followed by another
+            # h1/h2 heading with only non-heading tags in between.  Demote the second heading to h3 to
+            # prevent splitting between chapter headings and titles, images, etc.
+            doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
+            html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)

-        if getattr(self.extra_opts, 'format_scene_breaks', True):
+        if getattr(self.extra_opts, 'format_scene_breaks', False):
            # Center separator lines
            html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)