diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index db1ec0857d..c9612d97b9 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -137,7 +137,7 @@ def add_pipeline_options(parser, plumber): 'italicize_common_cases', 'fix_indents', 'html_unwrap_factor', 'unwrap_lines', 'delete_blank_paragraphs', 'format_scene_breaks', - 'dehyphenate', + 'dehyphenate', 'renumber_headings', ] ), diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 48b965f624..b8c45dfa14 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -532,7 +532,13 @@ OptionRecommendation(name='dehyphenate', help=_('Analyses hyphenated words throughout the document. The ' 'document itself is used as a dictionary to determine whether hyphens ' 'should be retained or removed.')), - + +OptionRecommendation(name='renumber_headings', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Looks for occurences of sequential

<h1> or <h2>

tags. ' + 'The tags are renumbered to prevent splitting in the middle ' + 'of chapter headings.')), + OptionRecommendation(name='sr1_search', recommended_value='', level=OptionRecommendation.LOW, help=_('Search pattern (regular expression) to be replaced with ' diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 56d4339d8c..305346d496 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -272,9 +272,11 @@ class PreProcessor(object): unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE) em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE) + shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE) content = unwrap.sub(' ', content) content = em_en_unwrap.sub('', content) + content = shy_unwrap.sub('', content) return content def txt_process(self, match): @@ -461,11 +463,12 @@ class PreProcessor(object): chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P
(]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*()?(){0,2}\s*()?\s*(){0,2}\s*()?\s*)', re.IGNORECASE) html = chapdetect3.sub(self.chapter_break, html) - # search for places where a first or second level heading is immediately followed by another - # top level heading. demote the second heading to h3 to prevent splitting between chapter - # headings and titles, images, etc - doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE) - html = doubleheading.sub('\g'+'\n'+'

', html) + if getattr(self.extra_opts, 'renumber_headings', False): + # search for places where a first or second level heading is immediately followed by another + # top level heading. demote the second heading to h3 to prevent splitting between chapter + # headings and titles, images, etc + doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE) + html = doubleheading.sub('\g'+'\n'+'', html) if getattr(self.extra_opts, 'format_scene_breaks', False): # Center separator lines