mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Sync to ldolse heuristics branch.
This commit is contained in:
commit
afb72b8a59
@ -137,7 +137,7 @@ def add_pipeline_options(parser, plumber):
|
|||||||
'italicize_common_cases', 'fix_indents',
|
'italicize_common_cases', 'fix_indents',
|
||||||
'html_unwrap_factor', 'unwrap_lines',
|
'html_unwrap_factor', 'unwrap_lines',
|
||||||
'delete_blank_paragraphs', 'format_scene_breaks',
|
'delete_blank_paragraphs', 'format_scene_breaks',
|
||||||
'dehyphenate',
|
'dehyphenate', 'renumber_headings',
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
|
|
||||||
|
@ -532,7 +532,13 @@ OptionRecommendation(name='dehyphenate',
|
|||||||
help=_('Analyses hyphenated words throughout the document. The '
|
help=_('Analyses hyphenated words throughout the document. The '
|
||||||
'document itself is used as a dictionary to determine whether hyphens '
|
'document itself is used as a dictionary to determine whether hyphens '
|
||||||
'should be retained or removed.')),
|
'should be retained or removed.')),
|
||||||
|
|
||||||
|
OptionRecommendation(name='renumber_headings',
|
||||||
|
recommended_value=False, level=OptionRecommendation.LOW,
|
||||||
|
help=_('Looks for occurrences of sequential <h1> or <h2> tags. '
|
||||||
|
'The tags are renumbered to prevent splitting in the middle '
|
||||||
|
'of chapter headings.')),
|
||||||
|
|
||||||
OptionRecommendation(name='sr1_search',
|
OptionRecommendation(name='sr1_search',
|
||||||
recommended_value='', level=OptionRecommendation.LOW,
|
recommended_value='', level=OptionRecommendation.LOW,
|
||||||
help=_('Search pattern (regular expression) to be replaced with '
|
help=_('Search pattern (regular expression) to be replaced with '
|
||||||
|
@ -272,9 +272,11 @@ class PreProcessor(object):
|
|||||||
|
|
||||||
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
|
unwrap = re.compile(u"%s" % unwrap_regex, re.UNICODE)
|
||||||
em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
|
em_en_unwrap = re.compile(u"%s" % em_en_unwrap_regex, re.UNICODE)
|
||||||
|
shy_unwrap = re.compile(u"%s" % shy_unwrap_regex, re.UNICODE)
|
||||||
|
|
||||||
content = unwrap.sub(' ', content)
|
content = unwrap.sub(' ', content)
|
||||||
content = em_en_unwrap.sub('', content)
|
content = em_en_unwrap.sub('', content)
|
||||||
|
content = shy_unwrap.sub('', content)
|
||||||
return content
|
return content
|
||||||
|
|
||||||
def txt_process(self, match):
|
def txt_process(self, match):
|
||||||
@ -461,11 +463,12 @@ class PreProcessor(object):
|
|||||||
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
|
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([*#•]+\s*)+)(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
|
||||||
html = chapdetect3.sub(self.chapter_break, html)
|
html = chapdetect3.sub(self.chapter_break, html)
|
||||||
|
|
||||||
# search for places where a first or second level heading is immediately followed by another
|
if getattr(self.extra_opts, 'renumber_headings', False):
|
||||||
# top level heading. demote the second heading to h3 to prevent splitting between chapter
|
# search for places where a first or second level heading is immediately followed by another
|
||||||
# headings and titles, images, etc
|
# top level heading. demote the second heading to h3 to prevent splitting between chapter
|
||||||
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
|
# headings and titles, images, etc
|
||||||
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
|
doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
|
||||||
|
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'format_scene_breaks', False):
|
if getattr(self.extra_opts, 'format_scene_breaks', False):
|
||||||
# Center separator lines
|
# Center separator lines
|
||||||
|
Loading…
x
Reference in New Issue
Block a user