diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index d6b5460552..c42b29e0e4 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -62,49 +62,97 @@ def wrap_lines(match): else: return ital+' ' -def line_length(format, raw, percent): +def line_length(format, raw, percent, test_type): ''' - raw is the raw text to find the line length to use for wrapping. + Analyses the document to see if hard line breaks exist or to find the + median line length. + format is the type of document analysis will be done against. + raw is the raw text to determine the line length to use for wrapping. percentage is a decimal number, 0 - 1 which is used to determine how far in the list of line lengths to use. The list of line lengths is ordered smallest to larged and does not include duplicates. 0.5 is the median value. + test_type sets whether to use the line length to return the median or a + do a histogram analysis to see if unwrapping is required. ''' raw = raw.replace(' ', ' ') if format == 'html': - linere = re.compile('(?<=
)', re.DOTALL) + linere = re.compile('(?<=
]*>\s*
).*?(?=)', re.DOTALL) elif format == 'pdf': linere = re.compile('(?<=|[iub]>\s*
\s*<[iub]>)\s*(?P |[iub]>\s* \s*<[iub]>)\s*(?P
\s*(?P \s*(?P )?'), chap_head),)
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
- length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
+ length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'), 'median')
if length:
# print "The pdf line length returned is " + str(length)
end_rules.append(
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index f38d02309a..7e85e24a83 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -153,7 +153,6 @@ class PreProcessor(object):
###### Unwrap lines ######
#
- self.log("Unwrapping Lines")
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
# that lines can be un-wrapped across page boundaries
@@ -168,25 +167,40 @@ class PreProcessor(object):
format = 'html'
else:
format = 'html'
-
+ # Check the line histogram to determine if the document uses hard line breaks. If 50% or
+ # more of the lines break in the same region of the document, then unwrapping is required.
+ hardbreaks = line_length(format, html, .50, 'histogram')
+ print "Hard line breaks check returned "+str(hardbreaks)
# Calculate Length
- length = line_length(format, html, getattr(self.extra_opts,
- 'html_unwrap_factor', 0.4))
+ unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+ length = line_length(format, html, unwrap_factor, 'median')
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
- max_length = length * 1.4
- min_max = str("(?<=.{"+str(length)+"})(?\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
+ # Dehyphenate
+ self.log("Unwrapping/Removing hyphens")
+ dehyphenator = Dehyphenator()
+ html = dehyphenator(html,'html', length)
+ self.log("Done dehyphenating")
+ # Unwrap lines using punctuation and line length
+ unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?\s*((p|span|div)>)?\s*(?P