mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
minor tweaks to preprocessing, backed out reflow change
This commit is contained in:
parent
f6de0bef13
commit
9a06996b16
@ -319,8 +319,8 @@ class HTMLPreProcessor(object):
|
|||||||
|
|
||||||
# unwrap hyphenation - moved here so it's executed after header/footer removal
|
# unwrap hyphenation - moved here so it's executed after header/footer removal
|
||||||
if is_pdftohtml:
|
if is_pdftohtml:
|
||||||
# unwrap visible dashes and hyphens - don't delete as 50% or more of the time these
|
# unwrap visible dashes and hyphens - don't delete they are often hyphens for
|
||||||
# hyphens are for compound words, formatting, etc
|
# for compound words, formatting, etc
|
||||||
end_rules.append((re.compile(u'(?<=[-–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
|
end_rules.append((re.compile(u'(?<=[-–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
|
||||||
# unwrap/delete soft hyphens
|
# unwrap/delete soft hyphens
|
||||||
end_rules.append((re.compile(u'[](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
end_rules.append((re.compile(u'[](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||||
|
@ -29,16 +29,12 @@ class PreProcessor(object):
|
|||||||
self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
|
self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
|
||||||
return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
|
return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
|
||||||
|
|
||||||
def chapter_link(self, match):
|
def chapter_break(self, match):
|
||||||
chap = match.group('sectionlink')
|
chap = match.group('section')
|
||||||
if not chap:
|
styles = match.group('styles')
|
||||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||||
self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
|
self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap))
|
||||||
return '<br style="page-break-before:always">'
|
return '<'+styles+' style="page-break-before:always">'+chap
|
||||||
else:
|
|
||||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
|
||||||
self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
|
|
||||||
return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>'
|
|
||||||
|
|
||||||
def no_markup(self, raw, percent):
|
def no_markup(self, raw, percent):
|
||||||
'''
|
'''
|
||||||
@ -74,7 +70,7 @@ class PreProcessor(object):
|
|||||||
html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
|
html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
|
||||||
|
|
||||||
# If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
|
# If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
|
||||||
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE)
|
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
|
||||||
blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
|
blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
|
||||||
blanklines = blankreg.findall(html)
|
blanklines = blankreg.findall(html)
|
||||||
lines = linereg.findall(html)
|
lines = linereg.findall(html)
|
||||||
@ -100,8 +96,13 @@ class PreProcessor(object):
|
|||||||
chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
|
chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
|
||||||
html = chapdetect.sub(self.chapter_head, html)
|
html = chapdetect.sub(self.chapter_head, html)
|
||||||
if self.html_preprocess_sections < 10:
|
if self.html_preprocess_sections < 10:
|
||||||
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
|
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
|
||||||
chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9}|\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
|
chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
|
||||||
|
html = chapdetect2.sub(self.chapter_head, html)
|
||||||
|
|
||||||
|
if self.html_preprocess_sections < 10:
|
||||||
|
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
|
||||||
|
chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
|
||||||
html = chapdetect2.sub(self.chapter_head, html)
|
html = chapdetect2.sub(self.chapter_head, html)
|
||||||
#
|
#
|
||||||
# Unwrap lines using punctation if the median length of all lines is less than 200
|
# Unwrap lines using punctation if the median length of all lines is less than 200
|
||||||
@ -110,13 +111,14 @@ class PreProcessor(object):
|
|||||||
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
|
||||||
if length < 200:
|
if length < 200:
|
||||||
self.log("Unwrapping Lines")
|
self.log("Unwrapping Lines")
|
||||||
html = unwrap.sub(' ', html)
|
html = unwrap.sub(' ', html)
|
||||||
|
|
||||||
# If still no sections after unwrapping lines break on lines with no punctuation
|
# If still no sections after unwrapping lines break on lines with no punctuation
|
||||||
if self.html_preprocess_sections < 10:
|
if self.html_preprocess_sections < 10:
|
||||||
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
|
self.log(str(self.html_preprocess_sections) + " split points marked, matching based on punctuation")
|
||||||
#self.log(html)
|
#self.log(html)
|
||||||
chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE)
|
chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</(i|b|u)>){0,2}\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
|
||||||
html = chapdetect3.sub(self.chapter_head, html)
|
html = chapdetect3.sub(self.chapter_break, html)
|
||||||
# search for places where a first or second level heading is immediately followed by another
|
# search for places where a first or second level heading is immediately followed by another
|
||||||
# top level heading. demote the second heading to h3 to prevent splitting between chapter
|
# top level heading. demote the second heading to h3 to prevent splitting between chapter
|
||||||
# headings and titles, images, etc
|
# headings and titles, images, etc
|
||||||
|
@ -408,10 +408,6 @@ class Page(object):
|
|||||||
# Fraction of text height that two strings' bottoms can differ by
|
# Fraction of text height that two strings' bottoms can differ by
|
||||||
# for them to be considered to be part of the same text fragment
|
# for them to be considered to be part of the same text fragment
|
||||||
LINE_FACTOR = 0.4
|
LINE_FACTOR = 0.4
|
||||||
|
|
||||||
# Percentage of the page heigth which should be considered header
|
|
||||||
# or footer to be discarded from reflow considerations
|
|
||||||
HEAD_FOOTER_MARGIN
|
|
||||||
|
|
||||||
# Multiplies the average line height when determining row height
|
# Multiplies the average line height when determining row height
|
||||||
# of a particular element to detect columns.
|
# of a particular element to detect columns.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user