From b45dc837830b0e4f61b9cc19dfcc5f214589eb83 Mon Sep 17 00:00:00 2001 From: ldolse Date: Mon, 11 Oct 2010 00:14:00 +1000 Subject: [PATCH] preprocessing tweaks, fixed division by zero error in line_histogram --- src/calibre/ebooks/conversion/preprocess.py | 2 ++ src/calibre/ebooks/conversion/utils.py | 6 +++++- src/calibre/ebooks/pdb/input.py | 6 ++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index c5ebae4bba..de01188829 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -144,6 +144,8 @@ class DocAnalysis(object): # Normalize the histogram into percents totalLines = len(self.lines) + if totalLines == 0: + return False h = [ float(count)/totalLines for count in hRaw ] #print "\nhRaw histogram lengths are: "+str(hRaw) #print " percents are: "+str(h)+"\n" diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 976ed6a8f4..a01c29f2fb 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -146,7 +146,7 @@ class PreProcessor(object): #print "blanks between paragraphs is marked True" else: blanks_between_paragraphs = False - self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n") + #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n") # detect chapters/sections to match xpath or splitting logic # # Build the Regular Expressions in pieces @@ -230,6 +230,7 @@ class PreProcessor(object): html = dehyphenator(html,'html', length) self.log("Done dehyphenating") # Unwrap lines using punctation and line length + unwrap_quotes = re.compile(u"(?<=.{%i}\"')\s*\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*(?=[a-z])" % length, re.UNICODE) unwrap = re.compile(u"(?<=.{%i}([a-z,:)\IA\u00DF]|(?\s*()?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*\s*)\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) html = unwrap.sub(' ', html) #check any remaining hyphens, but only unwrap if there is a match @@ -259,5 +260,8 @@ class PreProcessor(object): # put back non-breaking spaces in empty paragraphs to preserve original formatting html = blankreg.sub('\n'+r'\g'+u'\u00a0'+r'\g', html) + + # Center separator lines + html = re.sub(u'

\s*(?P([*#•]+\s*)+)\s*

', '

' + '\g' + '

', html) return html diff --git a/src/calibre/ebooks/pdb/input.py b/src/calibre/ebooks/pdb/input.py index dfe5b653dd..6850c48b16 100644 --- a/src/calibre/ebooks/pdb/input.py +++ b/src/calibre/ebooks/pdb/input.py @@ -9,6 +9,7 @@ import os from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.ebooks.pdb.header import PdbHeaderReader from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader +from calibre.ebooks.conversion.utils import PreProcessor class PDBInput(InputFormatPlugin): @@ -44,3 +45,8 @@ class PDBInput(InputFormatPlugin): opf = reader.extract_content(os.getcwd()) return opf + + def preprocess_html(self, options, html): + self.options = options + preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) + return preprocessor(html) \ No newline at end of file