mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
pep8
This commit is contained in:
parent
c052186b18
commit
49b0726efa
@ -148,20 +148,20 @@ class DocAnalysis(object):
|
|||||||
maxLineLength=1900 # Discard larger than this to stay in range
|
maxLineLength=1900 # Discard larger than this to stay in range
|
||||||
buckets=20 # Each line is divided into a bucket based on length
|
buckets=20 # Each line is divided into a bucket based on length
|
||||||
|
|
||||||
#print "there are "+str(len(lines))+" lines"
|
# print "there are "+str(len(lines))+" lines"
|
||||||
#max = 0
|
# max = 0
|
||||||
#for line in self.lines:
|
# for line in self.lines:
|
||||||
# l = len(line)
|
# l = len(line)
|
||||||
# if l > max:
|
# if l > max:
|
||||||
# max = l
|
# max = l
|
||||||
#print "max line found is "+str(max)
|
# print "max line found is "+str(max)
|
||||||
# Build the line length histogram
|
# Build the line length histogram
|
||||||
hRaw = [0 for i in range(0,buckets)]
|
hRaw = [0 for i in range(0,buckets)]
|
||||||
for line in self.lines:
|
for line in self.lines:
|
||||||
l = len(line)
|
l = len(line)
|
||||||
if l > minLineLength and l < maxLineLength:
|
if l > minLineLength and l < maxLineLength:
|
||||||
l = int(l/100)
|
l = int(l/100)
|
||||||
#print "adding "+str(l)
|
# print "adding "+str(l)
|
||||||
hRaw[l]+=1
|
hRaw[l]+=1
|
||||||
|
|
||||||
# Normalize the histogram into percents
|
# Normalize the histogram into percents
|
||||||
@ -170,8 +170,8 @@ class DocAnalysis(object):
|
|||||||
h = [float(count)/totalLines for count in hRaw]
|
h = [float(count)/totalLines for count in hRaw]
|
||||||
else:
|
else:
|
||||||
h = []
|
h = []
|
||||||
#print "\nhRaw histogram lengths are: "+str(hRaw)
|
# print "\nhRaw histogram lengths are: "+str(hRaw)
|
||||||
#print " percents are: "+str(h)+"\n"
|
# print " percents are: "+str(h)+"\n"
|
||||||
|
|
||||||
# Find the biggest bucket
|
# Find the biggest bucket
|
||||||
maxValue = 0
|
maxValue = 0
|
||||||
@ -180,10 +180,10 @@ class DocAnalysis(object):
|
|||||||
maxValue = h[i]
|
maxValue = h[i]
|
||||||
|
|
||||||
if maxValue < percent:
|
if maxValue < percent:
|
||||||
#print "Line lengths are too variable. Not unwrapping."
|
# print "Line lengths are too variable. Not unwrapping."
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
#print str(maxValue)+" of the lines were in one bucket"
|
# print str(maxValue)+" of the lines were in one bucket"
|
||||||
return True
|
return True
|
||||||
|
|
||||||
class Dehyphenator(object):
|
class Dehyphenator(object):
|
||||||
@ -577,7 +577,7 @@ class HTMLPreProcessor(object):
|
|||||||
docanalysis = DocAnalysis('pdf', html)
|
docanalysis = DocAnalysis('pdf', html)
|
||||||
length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
|
length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
|
||||||
if length:
|
if length:
|
||||||
#print "The pdf line length returned is " + str(length)
|
# print "The pdf line length returned is " + str(length)
|
||||||
# unwrap em/en dashes
|
# unwrap em/en dashes
|
||||||
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
|
end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
|
||||||
end_rules.append(
|
end_rules.append(
|
||||||
@ -610,7 +610,7 @@ class HTMLPreProcessor(object):
|
|||||||
with open(os.path.join(odir, name), 'wb') as f:
|
with open(os.path.join(odir, name), 'wb') as f:
|
||||||
f.write(raw.encode('utf-8'))
|
f.write(raw.encode('utf-8'))
|
||||||
|
|
||||||
#dump(html, 'pre-preprocess')
|
# dump(html, 'pre-preprocess')
|
||||||
|
|
||||||
for rule in rules + end_rules:
|
for rule in rules + end_rules:
|
||||||
try:
|
try:
|
||||||
@ -636,7 +636,7 @@ class HTMLPreProcessor(object):
|
|||||||
if pdf_markup.get_word_count(html) > 7000:
|
if pdf_markup.get_word_count(html) > 7000:
|
||||||
html = pdf_markup.markup_chapters(html, totalwords, True)
|
html = pdf_markup.markup_chapters(html, totalwords, True)
|
||||||
|
|
||||||
#dump(html, 'post-preprocess')
|
# dump(html, 'post-preprocess')
|
||||||
|
|
||||||
# Handle broken XHTML w/ SVG (ugh)
|
# Handle broken XHTML w/ SVG (ugh)
|
||||||
if 'svg:' in html and SVG_NS not in html:
|
if 'svg:' in html and SVG_NS not in html:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user