\s*(?=[[a-z\d])' % length), lambda match: ''))
end_rules.append(
# Un wrap using punctuation
(re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?(i|b|u)>)?\s*(
\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
- # unwrap em/en dashes
- end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*\s*(?=[[a-z\d])' % length), lambda match: ''))
for rule in self.PREPROCESS + start_rules:
html = rule[0].sub(rule[1], html)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 9c57756d28..96df37f631 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal '
__docformat__ = 'restructuredtext en'
import re
-from calibre.ebooks.conversion.preprocess import line_length, Dehyphenator
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log
class PreProcessor(object):
@@ -204,11 +204,12 @@ class PreProcessor(object):
format = 'html'
# Check Line histogram to determine if the document uses hard line breaks, If 50% or
# more of the lines break in the same region of the document then unwrapping is required
- hardbreaks = line_length(format, html, .50, 'histogram')
- #print "Hard line breaks check returned "+str(hardbreaks)
+ docanalysis = DocAnalysis(format, html)
+ hardbreaks = docanalysis.line_histogram(.50)
+ self.log("Hard line breaks check returned "+str(hardbreaks))
# Calculate Length
unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
- length = line_length(format, html, unwrap_factor, 'median')
+ length = docanalysis.line_length(unwrap_factor)
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
# only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
if hardbreaks or unwrap_factor < 0.4: