diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index fb55ee74fb..0421534f65 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -31,6 +31,12 @@ def chap_head(match):
else:
return '
', re.IGNORECASE), lambda match: '
'),
- # Remove page numbers
- (re.compile(r'\d+
', re.IGNORECASE), lambda match: ''),
# Replace
with
(re.compile(r'\s*', re.IGNORECASE), lambda match: ''),
- # Remove
- (re.compile(r'(.*)', re.IGNORECASE),
- lambda match: match.group() if \
- re.match('<', match.group(1).lstrip()) or \
- len(match.group(1)) < 40 else match.group(1)),
+
# Remove hyphenation
- (re.compile(r'-\n\r?'), lambda match: ''),
+ (re.compile(r'-\n\r?'), lambda match: ''),
# Remove gray background
(re.compile(r']+>'), lambda match : ''),
@@ -112,15 +112,12 @@ class HTMLPreProcessor(object):
(re.compile(ur'\u00a0'), lambda match : ' '),
# Detect Chapters to match default XPATH in GUI
- (re.compile(r'(
]*>)?(?p[^>]*>)?s*(?P(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P.*)(
]*>|?p[^>]*>)))?', re.IGNORECASE), chap_head),
- (re.compile(r'(
]*>)?(?p[^>]*>)?s*(?P([A-Z \'"!]{5,})\s*(\d+|\w+)?)(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P.*)(
]*>|?p[^>]*>)))?'), chap_head),
+ (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(||)?(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?(||)?)(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P()?\s*\w+(\s+\w+)?()?)(
]*>|?p[^>]*>)))?', re.IGNORECASE), chap_head),
+ (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P([A-Z \'"!]{5,})\s*(\d+|\w+)?)(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P.*)(
]*>|?p[^>]*>)))?'), chap_head),
# Have paragraphs show better
(re.compile(r''), lambda match : ''),
- # Un wrap lines
- (re.compile(r'(?<=[^\.^\^?^!^"^”])\s*((i|b|u)>)*\s*
\s*(<(i|b|u)>)*\s*(?=[a-z0-9I])', re.UNICODE), lambda match: ' '),
-
# Clean up spaces
(re.compile(u'(?<=[\.,:;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Add space before and after italics
@@ -162,12 +159,12 @@ class HTMLPreProcessor(object):
elif self.is_book_designer(html):
rules = self.BOOK_DESIGNER
elif self.is_pdftohtml(html):
- # Add rules that require matching line length here
- #line_length_rules = [
- # (re.compile('%i' % line_length(html, .85)), lambda match:)
- #]
+ line_length_rules = [
+ # Unwrap using punctuation: the lookbehind requires one of a-z , ; : - I A right before the break ('-' is escaped so it is a literal hyphen, not the range ':'..'I')
+ (re.compile(r'(?<=.{%i}[a-z,;:\-IA])\s*(?P(i|b|u)>)?\s*()\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .2), re.UNICODE), wrap_lines),
+ ]
- rules = self.PDFTOHTML # + line_length_rules
+ rules = self.PDFTOHTML + line_length_rules
else:
rules = []
for rule in self.PREPROCESS + rules: