diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index f2b19efa9b..c120f0a560 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -77,7 +77,6 @@ def line_length(format, raw, percent):
elif format == 'pdf':
linere = re.compile('(?<=
).*?(?=
)', re.DOTALL)
lines = linere.findall(raw)
- print "percent is " + str(percent)
lengths = []
for line in lines:
@@ -230,14 +229,17 @@ class HTMLPreProcessor(object):
# (re.compile(r'
\s*
', re.IGNORECASE), lambda match: '\n
'),
# unwrap hyphenation - don't delete the hyphen (often doesn't split words)
- (re.compile(u'(?<=[-–—])\s*
\s*(?=[[a-z\d])'), lambda match: ''),
+ #(re.compile(u'(?<=[-–—])\s*
\s*(?=[[a-z\d])'), lambda match: ''),
+ # unwrap/delete soft hyphens
+ #(re.compile(u'[]\s*
\s*(?=[[a-z\d])'), lambda match: ''),
# Remove gray background
(re.compile(r'
'), # Clean up spaces @@ -322,21 +324,29 @@ class HTMLPreProcessor(object): import traceback print 'Failed to parse remove_footer regexp' traceback.print_exc() + + # unwrap hyphenation - moved here so it's executed after header/footer removal + if is_pdftohtml: + # unwrap visible dashes and hyphens - don't delete as 50% or more of the time these + # hyphens are for compound words, formatting, etc + end_rules.append((re.compile(u'(?<=[-–—])\s*
\s*(?=[[a-z\d])'), lambda match: '')) + # unwrap/delete soft hyphens + end_rules.append((re.compile(u'[](\s*
)+\s*(?=[[a-z\d])'), lambda match: '')) + # unwrap/delete soft hyphens with formatting + end_rules.append((re.compile(u'[]\s*((i|u|b)>)+(\s*
)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
# Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
if getattr(self.extra_opts, 'preprocess_html', None):
if is_pdftohtml:
- end_rules.append(
- (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P
]*>|?p[^>]*>)))?'), chap_head),
- )
-
+ end_rules.append((re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P
]*>|?p[^>]*>)))?'), chap_head))
+
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
if length:
- print "The pdf line length returned is " + str(length)
+ # print "The pdf line length returned is " + str(length)
end_rules.append(
# Un wrap using punctuation
- (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P