From 0c2ab9e32838933e0b3731f8cca72a0e98c36730 Mon Sep 17 00:00:00 2001
From: ldolse '),
+ # Convert line breaks to paragraphs
+ (re.compile(r' '),
+ (re.compile(r' '),
+ (re.compile(r'\s*'), lambda match : ' )+\s*(?=[[a-z\d])'), lambda match: ''))
+ end_rules.append((re.compile(u'[]( \s*)+\s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens with formatting
- end_rules.append((re.compile(u'[]\s*((i|u|b)>)+(\s* )+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
+ end_rules.append((re.compile(u'[]\s*((i|u|b)>)+( \s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
# Make the more aggressive chapter marking regex optional with the preprocess option to
# reduce false positives and move after header/footer removal
@@ -475,7 +478,7 @@ class HTMLPreProcessor(object):
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s* \s*(?=[[a-z\d])' % length), lambda match: ''))
end_rules.append(
# Un wrap using punctuation
- (re.compile(u'(?<=.{%i}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężı,:)\IA\u00DF]|(?(i|b|u)>)?\s*(
\s*(?P
\s*){1,3}\s*(?P
)?', re.IGNORECASE), chap_head),
+ #(re.compile(r'
\s*(?P
\s*){1,3}\s*(?P
)?', re.IGNORECASE), chap_head),
# Cover the case where every letter in a chapter title is separated by a space
- (re.compile(r'
\s*(?P
\s*){1,3}\s*(?P
))?'), chap_head),
+ #(re.compile(r'
\s*(?P
\s*){1,3}\s*(?P
))?'), chap_head),
- # Have paragraphs show better
- (re.compile(r'
]*>\s*'), lambda match : '
\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
@@ -508,7 +511,15 @@ class HTMLPreProcessor(object):
if is_pdftohtml and length > -1:
# Dehyphenate
dehyphenator = Dehyphenator()
- html = dehyphenator(html,'pdf', length)
+ html = dehyphenator(html,'html', length)
+
+ if is_pdftohtml:
+ from calibre.ebooks.conversion.utils import PreProcessor
+ pdf_markup = PreProcessor(self.extra_opts, None)
+ totalwords = 0
+ totalwords = pdf_markup.get_word_count(html)
+ if totalwords > 7000:
+ html = pdf_markup.markup_chapters(html, totalwords, True)
#dump(html, 'post-preprocess')
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index cda9d9cbba..3fd7f88434 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -18,6 +18,9 @@ class PreProcessor(object):
self.found_indents = 0
self.extra_opts = extra_opts
+ def is_pdftohtml(self, src):
+ return '' in src[:1000]
+
def chapter_head(self, match):
chap = match.group('chap')
title = match.group('title')
@@ -130,6 +133,15 @@ class PreProcessor(object):
chapter_line_close = "((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)>)?\s*(?P=outer)>"
title_line_close = "((?P=inner6)>)?\s*((?P=inner5)>)?\s*((?P=inner4)>)?\s*(?P=outer2)>"
+ is_pdftohtml = self.is_pdftohtml(html)
+ if is_pdftohtml:
+ print "this is a pdf"
+ chapter_line_open = "<(?P ]*>\s*