moved line_length into DocAnalysis class, added line_histogram function

2025-07-09 03:04:10 -04:00 · 2010-09-28 11:50:39 +08:00 · 2010-09-28 11:50:39 +08:00 · f5431765f4
commit f5431765f4
parent 94d01f9885
2 changed files with 47 additions and 36 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -62,19 +62,15 @@ def wrap_lines(match):
    else:
               return ital+' '
-def line_length(format, raw, percent, test_type):
+class DocAnalysis(object):
    '''
-    Analyses the document to see if hard line breaks exist or to find the 
+    Provides various text analysis functions to determine how the document is structured.
    median line length.
    format is the type of document analysis will be done against.
    raw is the raw text to determine the line length to use for wrapping.
-    percentage is a decimal number, 0 - 1 which is used to determine
+    Blank lines are excluded from analysis
    how far in the list of line lengths to use. The list of line lengths is
    ordered smallest to largest and does not include duplicates. 0.5 is the
    median value.
    test_type sets whether to use the line length to return the median or to
    do a histogram analysis to see if unwrapping is required.
    '''
    def __init__(self, format='html', raw=''):
        raw = raw.replace('&nbsp;', ' ')
        if format == 'html':
            linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
@ -82,11 +78,18 @@ def line_length(format, raw, percent, test_type):
            linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
        elif format == 'spanned_html':
            linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
-    lines = linere.findall(raw)
+        self.lines = linere.findall(raw)
-    if test_type == 'median':
+    def line_length(self, percent):
        '''
        Analyses the document to find the median line length.
        percentage is a decimal number, 0 - 1 which is used to determine
        how far in the list of line lengths to use. The list of line lengths is
        ordered smallest to largest and does not include duplicates. 0.5 is the
        median value.
        '''
        lengths = []
-        for line in lines:
+        for line in self.lines:
            if len(line) > 0:
                lengths.append(len(line))
@ -112,21 +115,27 @@ def line_length(format, raw, percent, test_type):
        return lengths[index]
-    if test_type == 'histogram':
+    def line_histogram(self, percent):
        '''
        Creates a broad histogram of the document to determine whether it incorporates hard
        line breaks.  Lines are sorted into 20 'buckets' based on length.
        percent is the percentage of lines that should be in a single bucket to return true
        The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
        '''
        minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
        maxLineLength=1900 # Discard larger than this to stay in range
        buckets=20 # Each line is divided into a bucket based on length
        #print "there are "+str(len(lines))+" lines"
-        max = 0
+        #max = 0
-        for line in lines:
+        #for line in self.lines:
-            l = len(line)
+        #    l = len(line)
-            if l > max:
+        #    if l > max:
-                max = l
+        #        max = l
        #print "max line found is "+str(max)
        # Build the line length histogram
        hRaw = [ 0 for i in range(0,buckets) ]
-        for line in lines:
+        for line in self.lines:
            l = len(line)
            if l > minLineLength and l < maxLineLength:
                    l = int(l/100)
@ -134,7 +143,7 @@ def line_length(format, raw, percent, test_type):
                    hRaw[l]+=1
        # Normalize the histogram into percents
-        totalLines = len(lines)
+        totalLines = len(self.lines)
        h = [ float(count)/totalLines for count in hRaw ]
        #print "\nhRaw histogram lengths are: "+str(hRaw)
        #print "              percents are: "+str(h)+"\n"
@ -454,15 +463,16 @@ class HTMLPreProcessor(object):
        length = -1
        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
-            length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'), 'median')
+            docanalysis = DocAnalysis('pdf', html)
            length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
            if length:
                #print "The pdf line length returned is " + str(length)
                # unwrap em/en dashes
                end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                end_rules.append(
                    # Unwrap using punctuation
                    (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                )
                # unwrap em/en dashes
                end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
        for rule in self.PREPROCESS + start_rules:
            html = rule[0].sub(rule[1], html)
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 import re
-from calibre.ebooks.conversion.preprocess import line_length, Dehyphenator
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log
 class PreProcessor(object):
@ -204,11 +204,12 @@ class PreProcessor(object):
            format = 'html'
        # Check Line histogram to determine if the document uses hard line breaks. If 50% or 
        # more of the lines break in the same region of the document then unwrapping is required
-        hardbreaks = line_length(format, html, .50, 'histogram')
+        docanalysis = DocAnalysis(format, html)
-        #print "Hard line breaks check returned "+str(hardbreaks)
+        hardbreaks = docanalysis.line_histogram(.50)
        self.log("Hard line breaks check returned "+str(hardbreaks))
        # Calculate Length
        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
-        length = line_length(format, html, unwrap_factor, 'median')
+        length = docanalysis.line_length(unwrap_factor)
        self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
        # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
        if hardbreaks or unwrap_factor < 0.4: