diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 23d073cfa4..264b933047 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -356,7 +356,7 @@ class HTMLPreProcessor(object): (re.compile(r'
]+>'), lambda match : ''), # Detect Chapters to match default XPATH in GUI - (re.compile(r'\s*(?=[[a-z\d])' % length), lambda match: '')) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 555f42702b..f41f6abd08 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -77,6 +77,11 @@ class PreProcessor(object): def __call__(self, html): self.log("********* Preprocessing HTML *********") + + # Arrange line feeds and
tags so the line_length and no_markup functions work correctly + html = re.sub(r"\s*", "\n", html) + html = re.sub(r"\s*\s*", "\n
", html) + ###### Check Markup ###### # # some lit files don't have any
tags or equivalent (generally just plain text between @@ -135,9 +140,7 @@ class PreProcessor(object): #print "blanks between paragraphs is marked True" else: blanks_between_paragraphs = False - # Arrange line feeds and
tags so the line_length and no_markup functions work correctly - html = re.sub(r"\s*", "\n", html) - html = re.sub(r"\s*\s*", "\n
", html)
+ #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
# detect chapters/sections to match xpath or splitting logic
#
# Build the Regular Expressions in pieces
@@ -160,11 +163,10 @@ class PreProcessor(object):
default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
- uppercase_chapters = r"\s*.?([A-Z#\-\s]+)\s*"
+ uppercase_chapters = r"\s*.?([A-Z#\-]+\s{0,3}){1,5}\s*"
chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
- #print chapter_marker
- #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
+ #print chapter_marker
heading = re.compile('