diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 7e85e24a83..5e3cac7714 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -113,11 +113,12 @@ class PreProcessor(object):
# Get rid of empty tags to simplify other processing
html = re.sub(ur'\s*\s*', ' ', html)
# Get rid of empty span, bold, & italics tags
- html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html)
+ html = re.sub(r"\s*]*>\s*(]*>\s*){0,2}\s*\s*", " ", html)
html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*[ibu]>\s*){0,2}\s*[ibu]>", " ", html)
html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html)
- # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+ # If more than 40% of the lines are empty paragraphs and the user has enabled remove
+ # paragraph spacing then delete blank lines to clean up spacing
linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL)
blankreg = re.compile(r'\s*(?P]*>)\s*(?P
)', re.IGNORECASE)
#multi_blank = re.compile(r'(\s*]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s*
){2,}', re.IGNORECASE)
@@ -129,26 +130,63 @@ class PreProcessor(object):
'remove_paragraph_spacing', False):
self.log("deleting blank lines")
html = blankreg.sub('', html)
+ elif float(len(blanklines)) / float(len(lines)) > 0.40:
+ blanks_between_paragraphs = True
+ print "blanks between paragraphs is marked True"
+ else:
+ blanks_between_paragraphs = False
# Arrange line feeds and
tags so the line_length and no_markup functions work correctly
html = re.sub(r"\s*
", "\n", html)
html = re.sub(r"\s*\s*", "\n
", html)
# detect chapters/sections to match xpath or splitting logic
+ #
+ # Build the Regular Expressions in pieces
+ lookahead = "(?=<(p|div))"
+ chapter_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pspan|[ibu])[^>]*>)?\s*(<(?Pspan|[ibu])[^>]*>)?\s*(<(?Pspan|[ibu])[^>]*>)?\s*"
+ chapter_header_open = r"(?P"
+ chapter_header_close = ")\s*"
+ chapter_line_close = "((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)\s[^>]*>)?\s*(?P=outer)>\s*"
+ if blanks_between_paragraphs:
+ blank_lines = "(\s*]*>\s*
){0,2}\s*"
+ else:
+ blank_lines = ""
+ opt_title_open = "("
+ title_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pspan|[ibu])[^>]*>)?\s*(<(?Pspan|[ibu])[^>]*>)?\s*(<(?Pspan|[ibu])[^>]*>)?\s*"
+ title_header_open = "(?P"
+ title_header_close = ")\s*"
+ title_line_close = "((?P=inner6)>)?\s*((?P=inner5)>)?\s*((?P=inner4)\s[^>]*>)?\s*(?P=outer2)>"
+ opt_title_close = ")?"
+
+ default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
+ typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
+ numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
+ uppercase_chapters = r"\s*.?([A-Z#\-\s]+)\s*"
+
+ chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+ print chapter_marker
+ #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
heading = re.compile(']*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
#
# Start with most typical chapter headings, get more aggressive until one works
if self.html_preprocess_sections < 10:
- chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*([ibu]>){0,2})\s*()?s*([ibu]>){0,2}\s*()?\s*((p|/?br)>)\s*\s*(\s*]*>\s*
){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((br|p)>))?', re.IGNORECASE|re.VERBOSE)
+ chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+ #chapdetect = re.compile(r"(?=<(p|div))<(?Pp|div)[^>]*>\s*(<(?Pspan|[ibu])\s[^>]*>)?\s*(<(?Pspan|[ibu])\s[^>]*>)?\s*(<(?Pspan|[ibu])\s[^>]*>)?\s*(?P.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8})\s*((?P=)>)?\s*((?P=)>)?\s*((?P=)\s[^>]*>)?\s(?P=)>(<(?Pp|div)[^>]*>\s*(<(?Pspan|[ibu])\s[^>]*>)?\s*(<(?Pspan|[ibu])\s[^>]*>)?\s*(<(?Pspan|[ibu])\s[^>]*>)?\s*(?P(\s*[\w\'\"-]+){1,5})\s*((?P=)>)?\s*((?P=)>)?\s*((?P=)\s[^>]*>)?\s(?P=)>)?", re.IGNORECASE)
+ #chapdetect = re.compile(r'(?=?(br|p))(<(?P(/?br|p))[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*([ibu]>){0,2})\s*()?s*([ibu]>){0,2}\s*()?\s*((?P=outer)>)\s*\s*(\s*]*>\s*
){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((br|p)>))?', re.IGNORECASE|re.VERBOSE)
html = chapdetect.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
- chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((br|p)>))?', re.UNICODE)
+ chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+ chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
+ #chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((br|p)>))?', re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
- chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((br|p)>))?', re.UNICODE)
+ chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+ chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
+ #chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((br|p)>))?', re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
###### Unwrap lines ######
@@ -179,7 +217,6 @@ class PreProcessor(object):
if hardbreaks or unwrap_factor < 0.4:
self.log("Unwrapping required, unwrapping Lines")
# Unwrap em/en dashes
- #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
# Dehyphenate
self.log("Unwrapping/Removing hyphens")
@@ -206,7 +243,7 @@ class PreProcessor(object):
if self.html_preprocess_sections < 10:
self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*()?([ibu]>){0,2}\s*()?\s*([ibu]>){0,2}\s*()?\s*(p|div)>)', re.IGNORECASE)
- html = chapdetect3.sub(self.chapter_break, html)
+ #html = chapdetect3.sub(self.chapter_break, html)
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc