From 105591980657bfc945e45825d33b94bb385486d6 Mon Sep 17 00:00:00 2001
From: ldolse \s*(?=[[a-z\d])' % length), lambda match: ''))
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 555f42702b..f41f6abd08 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -77,6 +77,11 @@ class PreProcessor(object):
def __call__(self, html):
self.log("********* Preprocessing HTML *********")
+
+ # Arrange line feeds and
\s*(?P
\s*){1,3}\s*(?P
)?', re.IGNORECASE), chap_head),
+ (re.compile(r'
\s*(?P
\s*){1,3}\s*(?P
)?', re.IGNORECASE), chap_head),
# Cover the case where every letter in a chapter title is separated by a space
(re.compile(r'
\s*(?P
\s*){1,3}\s*(?P
))?'), chap_head),
@@ -461,10 +461,10 @@ class HTMLPreProcessor(object):
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'), 'median')
if length:
- # print "The pdf line length returned is " + str(length)
+ print "The pdf line length returned is " + str(length)
end_rules.append(
# Un wrap using punctuation
- (re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?(i|b|u)>)?\s*(
\s*", "\n
", html) + ###### Check Markup ###### # # some lit files don't have any
tags or equivalent (generally just plain text between @@ -135,9 +140,7 @@ class PreProcessor(object): #print "blanks between paragraphs is marked True" else: blanks_between_paragraphs = False - # Arrange line feeds and
tags so the line_length and no_markup functions work correctly - html = re.sub(r"\s*", "\n", html) - html = re.sub(r"\s*\s*", "\n
", html)
+ #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
# detect chapters/sections to match xpath or splitting logic
#
# Build the Regular Expressions in pieces
@@ -160,11 +163,10 @@ class PreProcessor(object):
default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
- uppercase_chapters = r"\s*.?([A-Z#\-\s]+)\s*"
+ uppercase_chapters = r"\s*.?([A-Z#\-]+\s{0,3}){1,5}\s*"
chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
- #print chapter_marker
- #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
+ #print chapter_marker
heading = re.compile('