diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 6123577191..46308b2ea0 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -319,8 +319,8 @@ class HTMLPreProcessor(object):
# unwrap hyphenation - moved here so it's executed after header/footer removal
if is_pdftohtml:
- # unwrap visible dashes and hyphens - don't delete as 50% or more of the time these
- # hyphens are for compound words, formatting, etc
+ # unwrap visible dashes and hyphens - don't delete them, as they are often hyphens
+ # for compound words, formatting, etc
end_rules.append((re.compile(u'(?<=[-–—])\s*
\s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens
end_rules.append((re.compile(u'[](\s*
)+\s*(?=[[a-z\d])'), lambda match: ''))
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 68cebb3a11..fb683bdb12 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -29,16 +29,12 @@ class PreProcessor(object):
self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
return '
'+chap+'
\n'+title+'
\n'
- def chapter_link(self, match):
- chap = match.group('sectionlink')
- if not chap:
- self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
- return '
'
- else:
- self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
- return '
\n'+chap+'
'
+ def chapter_break(self, match):
+ chap = match.group('section')
+ styles = match.group('styles')
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap))
+ return '<'+styles+' style="page-break-before:always">'+chap
def no_markup(self, raw, percent):
'''
@@ -74,7 +70,7 @@ class PreProcessor(object):
html = re.sub(r"\s*]*>\s*", " ", html)
# If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
- linereg = re.compile('(?<=)', re.IGNORECASE)
+ linereg = re.compile('(?<=
)', re.IGNORECASE|re.DOTALL)
blankreg = re.compile(r'\s*
]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s*
', re.IGNORECASE)
blanklines = blankreg.findall(html)
lines = linereg.findall(html)
@@ -100,8 +96,13 @@ class PreProcessor(object):
chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}s*(]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*((i|b|u)>){0,2})\s*()?s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.IGNORECASE)
html = chapdetect.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
- self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
- chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9}|\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.UNICODE)
+ self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
+ chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.UNICODE)
+ html = chapdetect2.sub(self.chapter_head, html)
+
+ if self.html_preprocess_sections < 10:
+ self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
+ chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
#
# Unwrap lines using punctation if the median length of all lines is less than 200
@@ -110,13 +111,14 @@ class PreProcessor(object):
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
if length < 200:
self.log("Unwrapping Lines")
- html = unwrap.sub(' ', html)
+ html = unwrap.sub(' ', html)
+
# If still no sections after unwrapping lines break on lines with no punctuation
if self.html_preprocess_sections < 10:
- self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
+ self.log(str(self.html_preprocess_sections) + " split points marked, matching based on punctuation")
#self.log(html)
- chapdetect3 = re.compile(r'(]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*(
)(?P)?', re.IGNORECASE)
- html = chapdetect3.sub(self.chapter_head, html)
+ chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P(]*>)?\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*()?((i|b|u)>){0,2}\s*()?\s*((i|b|u)>){0,2}\s*()?\s*(p|div)>)', re.IGNORECASE)
+ html = chapdetect3.sub(self.chapter_break, html)
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc
diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index 36848ddb8b..584d631d0b 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -408,10 +408,6 @@ class Page(object):
# Fraction of text height that two strings' bottoms can differ by
# for them to be considered to be part of the same text fragment
LINE_FACTOR = 0.4
-
- # Percentage of the page heigth which should be considered header
- # or footer to be discarded from reflow considerations
- HEAD_FOOTER_MARGIN
# Multiplies the average line height when determining row height
# of a particular element to detect columns.