From 946f1cf6c0e332898d34a7cf41680b6b2e3fce7b Mon Sep 17 00:00:00 2001 From: ldolse Date: Sat, 15 Jan 2011 22:07:20 +0800 Subject: [PATCH 1/3] added option for renumbering heading tags --- src/calibre/ebooks/conversion/cli.py | 2 +- src/calibre/ebooks/conversion/plumber.py | 8 +++++++- src/calibre/ebooks/conversion/utils.py | 15 ++++++++------- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/calibre/ebooks/conversion/cli.py b/src/calibre/ebooks/conversion/cli.py index db1ec0857d..c9612d97b9 100644 --- a/src/calibre/ebooks/conversion/cli.py +++ b/src/calibre/ebooks/conversion/cli.py @@ -137,7 +137,7 @@ def add_pipeline_options(parser, plumber): 'italicize_common_cases', 'fix_indents', 'html_unwrap_factor', 'unwrap_lines', 'delete_blank_paragraphs', 'format_scene_breaks', - 'dehyphenate', + 'dehyphenate', 'renumber_headings', ] ), diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 48b965f624..b8c45dfa14 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -532,7 +532,13 @@ OptionRecommendation(name='dehyphenate', help=_('Analyses hyphenated words throughout the document. The ' 'document itself is used as a dictionary to determine whether hyphens ' 'should be retained or removed.')), - + +OptionRecommendation(name='renumber_headings', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Looks for occurences of sequential

<h1> or <h2>

tags. ' + 'The tags are renumbered to prevent splitting in the middle ' + 'of chapter headings.')), + OptionRecommendation(name='sr1_search', recommended_value='', level=OptionRecommendation.LOW, help=_('Search pattern (regular expression) to be replaced with ' diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 2a88d371cc..4c62d2c06f 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -416,7 +416,7 @@ class PreProcessor(object): dehyphenator = Dehyphenator() html = dehyphenator(html,'html_cleanup', length) - if getattr(self.extra_opts, 'dehyphenate', True): + if getattr(self.extra_opts, 'dehyphenate', False): # dehyphenate in cleanup mode to fix anything previous conversions/editing missed self.log("Fixing hyphenated content") dehyphenator = Dehyphenator() @@ -429,13 +429,14 @@ class PreProcessor(object): chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P
(]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*()?(){0,2}\s*()?\s*(){0,2}\s*()?\s*)', re.IGNORECASE) html = chapdetect3.sub(self.chapter_break, html) - # search for places where a first or second level heading is immediately followed by another - # top level heading. demote the second heading to h3 to prevent splitting between chapter - # headings and titles, images, etc - doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE) - html = doubleheading.sub('\g'+'\n'+'

', html) + if getattr(self.extra_opts, 'renumber_headings', True): + # search for places where a first or second level heading is immediately followed by another + # top level heading. demote the second heading to h3 to prevent splitting between chapter + # headings and titles, images, etc + doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE) + html = doubleheading.sub('\g'+'\n'+'', html) - if getattr(self.extra_opts, 'format_scene_breaks', True): + if getattr(self.extra_opts, 'format_scene_breaks', False): # Center separator lines html = re.sub(u'<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(?P([*#•]+\s*)+)\s*()?\s*()?\s*()?\s*', '

' + '\g' + '

', html) From 81c365b3a9efd8b546fa096bbe4eb737e607b6ba Mon Sep 17 00:00:00 2001 From: ldolse Date: Sat, 15 Jan 2011 22:41:38 +0800 Subject: [PATCH 2/3] ... --- src/calibre/ebooks/conversion/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 96d386bf78..3693d11cee 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -461,7 +461,7 @@ class PreProcessor(object): chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P
(]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*()?(){0,2}\s*()?\s*(){0,2}\s*()?\s*)', re.IGNORECASE) html = chapdetect3.sub(self.chapter_break, html) - if getattr(self.extra_opts, 'renumber_headings', True): + if getattr(self.extra_opts, 'renumber_headings', False): # search for places where a first or second level heading is immediately followed by another # top level heading. demote the second heading to h3 to prevent splitting between chapter # headings and titles, images, etc From 0edf1e550ea35f4b63208138ecd07c3d5dcb6856 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sat, 15 Jan 2011 22:47:51 +0800 Subject: [PATCH 3/3] ... --- src/calibre/ebooks/conversion/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 3693d11cee..305346d496 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -272,9 +272,11 @@ class PreProcessor(object): unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE) em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE) + shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE) content = unwrap.sub(' ', content) content = em_en_unwrap.sub('', content) + content = shy_unwrap.sub('', content) return content def txt_process(self, match):