diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index a0dfb5ea2b..da652c1a38 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -54,7 +54,7 @@ def chap_head(match): if not title: return '
)', re.DOTALL)
+ elif format == 'pdf':
+ linere = re.compile('(?<=
).*?(?=
)', re.DOTALL)
lines = linere.findall(raw)
lengths = []
@@ -206,7 +209,7 @@ class HTMLPreProcessor(object):
(re.compile(ur'\u00a0'), lambda match : ' '),
# Detect Chapters to match default XPATH in GUI
- (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P
]*>|?p[^>]*>)))?', re.IGNORECASE), chap_head),
+ (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P
]*>|?p[^>]*>)))?'), chap_head),
# Have paragraphs show better
@@ -289,7 +292,7 @@ class HTMLPreProcessor(object):
traceback.print_exc()
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
- length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
+ length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
if length:
end_rules.append(
# Un wrap using punctuation
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 73bc22be66..42a42a5837 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -24,6 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows
from calibre import unicode_path
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
+from calibre.ebooks.conversion.preprocess import line_length
class Link(object):
'''
@@ -489,5 +490,18 @@ class HTMLInput(InputFormatPlugin):
return (None, None)
return (None, raw)
-
-
+ def preprocess_html(self, html):
+ print "********* Preprocessing HTML *********\n"
+ # Detect Chapters to match the xpath in the GUI
+ chapdetect = re.compile(r'(?=?(br|p|span))(?(br|p|span)[^>]*>)?\s*(?P'+'\g
\n', html)
+ # Unwrap lines using punctuation if the median length of all lines is less than 150
+ #
+ # Insert extra line feeds so the line length regex functions properly
+ html = re.sub(r"
]*>(\s*<(p|span|div)>\s*(p|span|div)[^>]*>)?\s*((p|span|div)>\s*
]*>)?)?\s*<(span|div|p)[^>]*>", " ", html) - return html + # Unwrap lines using punctation if the median length of all lines is less than 150 + # + # Insert extra line feeds so the line length regex functions properly + html = re.sub(r"
", "\n", html) + length = line_length('html', html, 0.4) + print "*** Median length is " + str(length) + " ***\n" + unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P