mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Sync to ldolse heuristics branch.
This commit is contained in:
commit
afb72b8a59
@ -137,7 +137,7 @@ def add_pipeline_options(parser, plumber):
|
||||
'italicize_common_cases', 'fix_indents',
|
||||
'html_unwrap_factor', 'unwrap_lines',
|
||||
'delete_blank_paragraphs', 'format_scene_breaks',
|
||||
'dehyphenate',
|
||||
'dehyphenate', 'renumber_headings',
|
||||
]
|
||||
),
|
||||
|
||||
|
@ -533,6 +533,12 @@ OptionRecommendation(name='dehyphenate',
|
||||
'document itself is used as a dictionary to determine whether hyphens '
|
||||
'should be retained or removed.')),
|
||||
|
||||
OptionRecommendation(name='renumber_headings',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Looks for occurences of sequential <h1> or <h2> tags. '
|
||||
'The tags are renumbered to prevent splitting in the middle '
|
||||
'of chapter headings.')),
|
||||
|
||||
OptionRecommendation(name='sr1_search',
|
||||
recommended_value='', level=OptionRecommendation.LOW,
|
||||
help=_('Search pattern (regular expression) to be replaced with '
|
||||
|
@ -272,9 +272,11 @@ class PreProcessor(object):
|
||||
|
||||
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
|
||||
em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
|
||||
shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
|
||||
|
||||
content = unwrap.sub(' ', content)
|
||||
content = em_en_unwrap.sub('', content)
|
||||
content = shy_unwrap.sub('', content)
|
||||
return content
|
||||
|
||||
def txt_process(self, match):
|
||||
@ -461,6 +463,7 @@ class PreProcessor(object):
|
||||
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
|
||||
html = chapdetect3.sub(self.chapter_break, html)
|
||||
|
||||
if getattr(self.extra_opts, 'renumber_headings', False):
|
||||
# search for places where a first or second level heading is immediately followed by another
|
||||
# top level heading. demote the second heading to h3 to prevent splitting between chapter
|
||||
# headings and titles, images, etc
|
||||
|
Loading…
x
Reference in New Issue
Block a user