diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 3385771228..310a636022 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -364,12 +364,15 @@ class HTMLPreProcessor(object): (re.compile(r'
]+>'), lambda match : ''), # Detect Chapters to match default XPATH in GUI - (re.compile(r''),
+ # Convert line breaks to paragraphs
+ (re.compile(r'
]*>\s*'), lambda match : '
'), + (re.compile(r'
]*>\s*'), lambda match : '\n'), + (re.compile(r'\s*'), lambda match : '
\n'), + # Clean up spaces (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), # Add space before and after italics @@ -455,9 +458,9 @@ class HTMLPreProcessor(object): # delete soft hyphens - moved here so it's executed after header/footer removal if is_pdftohtml: # unwrap/delete soft hyphens - end_rules.append((re.compile(u'[](\s*)+\s*(?=[[a-z\d])'), lambda match: '')) + end_rules.append((re.compile(u'[](
\s*\s*)+\s*(?=[[a-z\d])'), lambda match: '')) # unwrap/delete soft hyphens with formatting - end_rules.append((re.compile(u'[]\s*((i|u|b)>)+(\s*
)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) + end_rules.append((re.compile(u'[]\s*((i|u|b)>)+(
\s*\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) # Make the more aggressive chapter marking regex optional with the preprocess option to # reduce false positives and move after header/footer removal @@ -475,7 +478,7 @@ class HTMLPreProcessor(object): end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*
\s*(?=[[a-z\d])' % length), lambda match: '')) end_rules.append( # Un wrap using punctuation - (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?(i|b|u)>)?\s*(
\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
@@ -508,7 +511,15 @@ class HTMLPreProcessor(object):
if is_pdftohtml and length > -1:
# Dehyphenate
dehyphenator = Dehyphenator()
- html = dehyphenator(html,'pdf', length)
+ html = dehyphenator(html,'html', length)
+
+ if is_pdftohtml:
+ from calibre.ebooks.conversion.utils import PreProcessor
+ pdf_markup = PreProcessor(self.extra_opts, None)
+ totalwords = 0
+ totalwords = pdf_markup.get_word_count(html)
+ if totalwords > 7000:
+ html = pdf_markup.markup_chapters(html, totalwords, True)
#dump(html, 'post-preprocess')
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index cda9d9cbba..3fd7f88434 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -18,6 +18,9 @@ class PreProcessor(object):
self.found_indents = 0
self.extra_opts = extra_opts
+ def is_pdftohtml(self, src):
+     return "<!-- created by calibre's pdftohtml -->" in src[:1000]  # True only if the pdftohtml marker comment is in the first 1000 chars; an empty marker would match every input
+
def chapter_head(self, match):
chap = match.group('chap')
title = match.group('title')
@@ -130,6 +133,15 @@ class PreProcessor(object):
chapter_line_close = "((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>"
title_line_close = "((?P=inner6)>)?\s*((?P=inner5)>)?\s*((?P=inner4)>)?\s*(?P=outer2)>"
+ is_pdftohtml = self.is_pdftohtml(html)
+ if is_pdftohtml:
+ print "this is a pdf"
+ chapter_line_open = "<(?P ]*>\s*