mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
moved line_length into DocAnalysis class, added line_histogram function
This commit is contained in:
parent
94d01f9885
commit
f5431765f4
@ -62,19 +62,15 @@ def wrap_lines(match):
|
|||||||
else:
|
else:
|
||||||
return ital+' '
|
return ital+' '
|
||||||
|
|
||||||
def line_length(format, raw, percent, test_type):
|
class DocAnalysis(object):
|
||||||
'''
|
'''
|
||||||
Analyses the document to see if hard line breaks exist or to find the
|
Provides various text analysis functions to determine how the document is structured.
|
||||||
median line length.
|
|
||||||
format is the type of document the analysis will be done against.
|
format is the type of document the analysis will be done against.
|
||||||
raw is the raw text to determine the line length to use for wrapping.
|
raw is the raw text to determine the line length to use for wrapping.
|
||||||
percentage is a decimal number, 0 - 1 which is used to determine
|
Blank lines are excluded from analysis
|
||||||
how far in the list of line lengths to use. The list of line lengths is
|
|
||||||
ordered smallest to largest and does not include duplicates. 0.5 is the
|
|
||||||
median value.
|
|
||||||
test_type sets whether to use the line length to return the median or a
|
|
||||||
do a histogram analysis to see if unwrapping is required.
|
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
def __init__(self, format='html', raw=''):
|
||||||
raw = raw.replace(' ', ' ')
|
raw = raw.replace(' ', ' ')
|
||||||
if format == 'html':
|
if format == 'html':
|
||||||
linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
|
linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
|
||||||
@ -82,11 +78,18 @@ def line_length(format, raw, percent, test_type):
|
|||||||
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
|
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
|
||||||
elif format == 'spanned_html':
|
elif format == 'spanned_html':
|
||||||
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
|
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
|
||||||
lines = linere.findall(raw)
|
self.lines = linere.findall(raw)
|
||||||
|
|
||||||
if test_type == 'median':
|
def line_length(self, percent):
|
||||||
|
'''
|
||||||
|
Analyses the document to find the median line length.
|
||||||
|
percentage is a decimal number, 0 - 1 which is used to determine
|
||||||
|
how far in the list of line lengths to use. The list of line lengths is
|
||||||
|
ordered smallest to largest and does not include duplicates. 0.5 is the
|
||||||
|
median value.
|
||||||
|
'''
|
||||||
lengths = []
|
lengths = []
|
||||||
for line in lines:
|
for line in self.lines:
|
||||||
if len(line) > 0:
|
if len(line) > 0:
|
||||||
lengths.append(len(line))
|
lengths.append(len(line))
|
||||||
|
|
||||||
@ -112,21 +115,27 @@ def line_length(format, raw, percent, test_type):
|
|||||||
|
|
||||||
return lengths[index]
|
return lengths[index]
|
||||||
|
|
||||||
if test_type == 'histogram':
|
def line_histogram(self, percent):
|
||||||
|
'''
|
||||||
|
Creates a broad histogram of the document to determine whether it incorporates hard
|
||||||
|
line breaks. Lines are sorted into 20 'buckets' based on length.
|
||||||
|
percent is the percentage of lines that should be in a single bucket to return true
|
||||||
|
The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
|
||||||
|
'''
|
||||||
minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
|
minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
|
||||||
maxLineLength=1900 # Discard larger than this to stay in range
|
maxLineLength=1900 # Discard larger than this to stay in range
|
||||||
buckets=20 # Each line is divided into a bucket based on length
|
buckets=20 # Each line is divided into a bucket based on length
|
||||||
|
|
||||||
#print "there are "+str(len(lines))+" lines"
|
#print "there are "+str(len(lines))+" lines"
|
||||||
max = 0
|
#max = 0
|
||||||
for line in lines:
|
#for line in self.lines:
|
||||||
l = len(line)
|
# l = len(line)
|
||||||
if l > max:
|
# if l > max:
|
||||||
max = l
|
# max = l
|
||||||
#print "max line found is "+str(max)
|
#print "max line found is "+str(max)
|
||||||
# Build the line length histogram
|
# Build the line length histogram
|
||||||
hRaw = [ 0 for i in range(0,buckets) ]
|
hRaw = [ 0 for i in range(0,buckets) ]
|
||||||
for line in lines:
|
for line in self.lines:
|
||||||
l = len(line)
|
l = len(line)
|
||||||
if l > minLineLength and l < maxLineLength:
|
if l > minLineLength and l < maxLineLength:
|
||||||
l = int(l/100)
|
l = int(l/100)
|
||||||
@ -134,7 +143,7 @@ def line_length(format, raw, percent, test_type):
|
|||||||
hRaw[l]+=1
|
hRaw[l]+=1
|
||||||
|
|
||||||
# Normalize the histogram into percents
|
# Normalize the histogram into percents
|
||||||
totalLines = len(lines)
|
totalLines = len(self.lines)
|
||||||
h = [ float(count)/totalLines for count in hRaw ]
|
h = [ float(count)/totalLines for count in hRaw ]
|
||||||
#print "\nhRaw histogram lengths are: "+str(hRaw)
|
#print "\nhRaw histogram lengths are: "+str(hRaw)
|
||||||
#print " percents are: "+str(h)+"\n"
|
#print " percents are: "+str(h)+"\n"
|
||||||
@ -454,15 +463,16 @@ class HTMLPreProcessor(object):
|
|||||||
|
|
||||||
length = -1
|
length = -1
|
||||||
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
||||||
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'), 'median')
|
docanalysis = DocAnalysis('pdf', html)
|
||||||
|
length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
|
||||||
if length:
|
if length:
|
||||||
#print "The pdf line length returned is " + str(length)
|
#print "The pdf line length returned is " + str(length)
|
||||||
|
# unwrap em/en dashes
|
||||||
|
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
|
||||||
end_rules.append(
|
end_rules.append(
|
||||||
# Unwrap using punctuation
|
# Unwrap using punctuation
|
||||||
(re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
(re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
|
||||||
)
|
)
|
||||||
# unwrap em/en dashes
|
|
||||||
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
|
|
||||||
|
|
||||||
for rule in self.PREPROCESS + start_rules:
|
for rule in self.PREPROCESS + start_rules:
|
||||||
html = rule[0].sub(rule[1], html)
|
html = rule[0].sub(rule[1], html)
|
||||||
|
@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
|||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import re
|
import re
|
||||||
from calibre.ebooks.conversion.preprocess import line_length, Dehyphenator
|
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
||||||
from calibre.utils.logging import default_log
|
from calibre.utils.logging import default_log
|
||||||
|
|
||||||
class PreProcessor(object):
|
class PreProcessor(object):
|
||||||
@ -204,11 +204,12 @@ class PreProcessor(object):
|
|||||||
format = 'html'
|
format = 'html'
|
||||||
# Check Line histogram to determine if the document uses hard line breaks, If 50% or
|
# Check Line histogram to determine if the document uses hard line breaks, If 50% or
|
||||||
# more of the lines break in the same region of the document then unwrapping is required
|
# more of the lines break in the same region of the document then unwrapping is required
|
||||||
hardbreaks = line_length(format, html, .50, 'histogram')
|
docanalysis = DocAnalysis(format, html)
|
||||||
#print "Hard line breaks check returned "+str(hardbreaks)
|
hardbreaks = docanalysis.line_histogram(.50)
|
||||||
|
self.log("Hard line breaks check returned "+str(hardbreaks))
|
||||||
# Calculate Length
|
# Calculate Length
|
||||||
unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
|
unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
|
||||||
length = line_length(format, html, unwrap_factor, 'median')
|
length = docanalysis.line_length(unwrap_factor)
|
||||||
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
|
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
|
||||||
# only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
|
# only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
|
||||||
if hardbreaks or unwrap_factor < 0.4:
|
if hardbreaks or unwrap_factor < 0.4:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user