preprocessing tweaks, fixed division by zero error in line_histogram

This commit is contained in:
ldolse 2010-10-11 00:14:00 +10:00
parent 704e266f0c
commit b45dc83783
3 changed files with 13 additions and 1 deletions

View File

@ -144,6 +144,8 @@ class DocAnalysis(object):
# Normalize the histogram into percents # Normalize the histogram into percents
totalLines = len(self.lines) totalLines = len(self.lines)
if totalLines == 0:
return False
h = [ float(count)/totalLines for count in hRaw ] h = [ float(count)/totalLines for count in hRaw ]
#print "\nhRaw histogram lengths are: "+str(hRaw) #print "\nhRaw histogram lengths are: "+str(hRaw)
#print " percents are: "+str(h)+"\n" #print " percents are: "+str(h)+"\n"

View File

@ -146,7 +146,7 @@ class PreProcessor(object):
#print "blanks between paragraphs is marked True" #print "blanks between paragraphs is marked True"
else: else:
blanks_between_paragraphs = False blanks_between_paragraphs = False
self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n") #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
# detect chapters/sections to match xpath or splitting logic # detect chapters/sections to match xpath or splitting logic
# #
# Build the Regular Expressions in pieces # Build the Regular Expressions in pieces
@ -230,6 +230,7 @@ class PreProcessor(object):
html = dehyphenator(html,'html', length) html = dehyphenator(html,'html', length)
self.log("Done dehyphenating") self.log("Done dehyphenating")
# Unwrap lines using punctuation and line length # Unwrap lines using punctuation and line length
unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE)
unwrap = re.compile(u"(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) unwrap = re.compile(u"(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
html = unwrap.sub(' ', html) html = unwrap.sub(' ', html)
#check any remaining hyphens, but only unwrap if there is a match #check any remaining hyphens, but only unwrap if there is a match
@ -259,5 +260,8 @@ class PreProcessor(object):
# put back non-breaking spaces in empty paragraphs to preserve original formatting # put back non-breaking spaces in empty paragraphs to preserve original formatting
html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html) html = blankreg.sub('\n'+r'\g<openline>'+u'\u00a0'+r'\g<closeline>', html)
# Center separator lines
html = re.sub(u'<p>\s*(?P<break>([*#•]+\s*)+)\s*</p>', '<p style="text-align:center">' + '\g<break>' + '</p>', html)
return html return html

View File

@ -9,6 +9,7 @@ import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
from calibre.ebooks.conversion.utils import PreProcessor
class PDBInput(InputFormatPlugin): class PDBInput(InputFormatPlugin):
@ -44,3 +45,8 @@ class PDBInput(InputFormatPlugin):
opf = reader.extract_content(os.getcwd()) opf = reader.extract_content(os.getcwd())
return opf return opf
def preprocess_html(self, options, html):
    """Run a PreProcessor over *html* and return its result.

    The given options are stored on ``self.options`` and handed to the
    PreProcessor. The instance's ``log`` attribute is passed as the
    logger when it exists; otherwise ``None`` is passed.
    """
    self.options = options
    # Fall back to None when no ``log`` attribute has been set on self.
    logger = getattr(self, 'log', None)
    run = PreProcessor(self.options, log=logger)
    return run(html)