diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 7f384a27bd..4a2d56d957 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -61,32 +61,35 @@ def wrap_lines(match): return ' ' else: return ital+' ' - -def line_length(format, raw, percent, test_type): + +class DocAnalysis(object): ''' - Analyses the document to see if hard line breaks exist or to find the - median line length. + Provides various text analysis functions to determine how the document is structured. format is the type of document analysis will be done against. raw is the raw text to determine the line length to use for wrapping. - percentage is a decimal number, 0 - 1 which is used to determine - how far in the list of line lengths to use. The list of line lengths is - ordered smallest to larged and does not include duplicates. 0.5 is the - median value. - test_type sets whether to use the line length to return the median or a - do a histogram analysis to see if unwrapping is required. + Blank lines are excluded from analysis ''' - raw = raw.replace(' ', ' ') - if format == 'html': - linere = re.compile('(?<=
]*>\s*
).*?(?=)', re.DOTALL) - elif format == 'pdf': - linere = re.compile('(?<=]*>\s*
).*?(?=)', re.DOTALL) + elif format == 'pdf': + linere = re.compile('(?<=\s*(?=[[a-z\d])' % length), lambda match: '')) end_rules.append( # Un wrap using punctuation (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?(i|b|u)>)?\s*(
\s*(?=[[a-z\d])' % length), lambda match: ''))
for rule in self.PREPROCESS + start_rules:
html = rule[0].sub(rule[1], html)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 9c57756d28..96df37f631 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal