mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Started scene break detection in heuristics; improved text conversion of Sigil-compatible chapter titles
This commit is contained in:
parent
bacfc28d3a
commit
5c988788a0
@ -27,7 +27,7 @@ class HeuristicProcessor(object):
|
|||||||
self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
||||||
self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
||||||
self.softbreak = re.compile(r'\s*(?P<openline><p(?=\sclass=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
self.softbreak = re.compile(r'\s*(?P<openline><p(?=\sclass=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
|
||||||
self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
|
self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}(?!\s*<h\d)', re.IGNORECASE)
|
||||||
|
|
||||||
def is_pdftohtml(self, src):
|
def is_pdftohtml(self, src):
|
||||||
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
|
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
|
||||||
@ -42,8 +42,10 @@ class HeuristicProcessor(object):
|
|||||||
" chapters. - " + unicode(chap))
|
" chapters. - " + unicode(chap))
|
||||||
return '<h2>'+chap+'</h2>\n'
|
return '<h2>'+chap+'</h2>\n'
|
||||||
else:
|
else:
|
||||||
txt_chap = html2text(chap)
|
delete_whitespace = re.compile('^\s*(?P<c>.*?)\s*$')
|
||||||
txt_title = html2text(title)
|
delete_quotes = re.compile('\'\"')
|
||||||
|
txt_chap = delete_quotes.sub('', delete_whitespace.sub('\g<c>', html2text(chap)))
|
||||||
|
txt_title = delete_quotes.sub('', delete_whitespace.sub('\g<c>', html2text(title)))
|
||||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||||
self.log.debug("marked " + unicode(self.html_preprocess_sections) +
|
self.log.debug("marked " + unicode(self.html_preprocess_sections) +
|
||||||
" chapters & titles. - " + unicode(chap) + ", " + unicode(title))
|
" chapters & titles. - " + unicode(chap) + ", " + unicode(title))
|
||||||
@ -416,6 +418,12 @@ class HeuristicProcessor(object):
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def detect_blank_formatting(self, html):
# Stub: no blank-line formatting is applied yet; html is returned unchanged.
|
||||||
|
# Case-insensitive pattern for a run of two or more empty <p ...></p>
# paragraphs that is immediately followed (lookahead, not consumed) by
# optional whitespace and "<h" plus a digit, i.e. blank paragraphs sitting
# directly before a heading tag. Compiled on every call but not used yet.
blanks_before_headings = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}(?=\s*<h\d)', re.IGNORECASE)
|
||||||
|
return html
|
||||||
|
|
||||||
|
def detect_soft_breaks(self, html):
# Placeholder: no soft-break detection is implemented here yet; the input
# html is handed back to the caller as-is.
|
||||||
|
return html
|
||||||
|
|
||||||
def __call__(self, html):
|
def __call__(self, html):
|
||||||
self.log.debug("********* Heuristic processing HTML *********")
|
self.log.debug("********* Heuristic processing HTML *********")
|
||||||
@ -465,6 +473,8 @@ class HeuristicProcessor(object):
|
|||||||
if getattr(self.extra_opts, 'markup_chapter_headings', False):
|
if getattr(self.extra_opts, 'markup_chapter_headings', False):
|
||||||
html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
|
html = self.markup_chapters(html, self.totalwords, blanks_between_paragraphs)
|
||||||
|
|
||||||
|
self.dump(html, 'after_chapter_markup')
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'italicize_common_cases', False):
|
if getattr(self.extra_opts, 'italicize_common_cases', False):
|
||||||
html = self.markup_italicis(html)
|
html = self.markup_italicis(html)
|
||||||
|
|
||||||
@ -525,6 +535,8 @@ class HeuristicProcessor(object):
|
|||||||
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
|
html = doubleheading.sub('\g<firsthead>'+'\n<h3'+'\g<secondhead>'+'</h3>', html)
|
||||||
|
|
||||||
if getattr(self.extra_opts, 'format_scene_breaks', False):
|
if getattr(self.extra_opts, 'format_scene_breaks', False):
|
||||||
|
html = self.detect_blank_formatting(html)
|
||||||
|
html = self.detect_soft_breaks(html)
|
||||||
# Center separator lines
|
# Center separator lines
|
||||||
html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
|
html = re.sub(u'<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*(?P<break>([*#•=✦]+\s*)+)\s*(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>', '<p style="text-align:center; margin-top:1.25em; margin-bottom:1.25em">' + '\g<break>' + '</p>', html)
|
||||||
if not self.blanks_deleted:
|
if not self.blanks_deleted:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user