From 8b7ef0984f4bed6acc64b6e6124352c65b22eb65 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sat, 25 Sep 2010 12:53:40 -1000
Subject: [PATCH] ...

---
 src/calibre/ebooks/conversion/preprocess.py | 18 +++++++++---------
 src/calibre/ebooks/conversion/utils.py      | 11 ++++-------
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index c42b29e0e4..a18ff07d44 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -123,7 +123,7 @@ def line_length(format, raw, percent, test_type):
             l = len(line)
             if l > max:
                 max = l
-    print "max line found is "+str(max)
+    #print "max line found is "+str(max)
     # Build the line length histogram
     hRaw = [ 0 for i in range(0,buckets) ]
     for line in lines:
@@ -136,8 +136,8 @@ def line_length(format, raw, percent, test_type):
     # Normalize the histogram into percents
     totalLines = len(lines)
     h = [ float(count)/totalLines for count in hRaw ]
-    print "\nhRaw histogram lengths are: "+str(hRaw)
-    print " percents are: "+str(h)+"\n"
+    #print "\nhRaw histogram lengths are: "+str(hRaw)
+    #print " percents are: "+str(h)+"\n"
 
     # Find the biggest bucket
     maxValue = 0
@@ -183,27 +183,27 @@ class Dehyphenator(object):
         if self.prefixes.match(firsthalf) is None:
             lookupword = self.removeprefix.sub('', lookupword)
         booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
-        print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+        #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
         if self.format == 'html_cleanup':
             match = booklookup.search(self.html)
             hyphenmatch = re.search(u'%s' % hyphenated, self.html)
             if match:
-                print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
                 return dehyphenated
             elif hyphenmatch:
-                print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
                 return hyphenated
             else:
-                print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
                 return firsthalf+u'\u2014'+wraptags+secondhalf
 
         else:
             match = booklookup.search(self.html)
             if match:
-                print "returned dehyphenated word: " + str(dehyphenated)
+                #print "returned dehyphenated word: " + str(dehyphenated)
                 return dehyphenated
             else:
-                print "returned hyphenated word: " + str(hyphenated)
+                #print "returned hyphenated word: " + str(hyphenated)
                 return hyphenated
 
     def __call__(self, html, format, length=1):
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 5e3cac7714..555f42702b 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -132,7 +132,7 @@ class PreProcessor(object):
             html = blankreg.sub('', html)
         elif float(len(blanklines)) / float(len(lines)) > 0.40:
             blanks_between_paragraphs = True
-            print "blanks between paragraphs is marked True"
+            #print "blanks between paragraphs is marked True"
         else:
             blanks_between_paragraphs = False
         # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
@@ -163,7 +163,7 @@ class PreProcessor(object):
         uppercase_chapters = r"\s*.?([A-Z#\-\s]+)\s*"
 
         chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-        print chapter_marker
+        #print chapter_marker
         #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
@@ -172,14 +172,11 @@ class PreProcessor(object):
         # Start with most typical chapter headings, get more aggressive until one works
         if self.html_preprocess_sections < 10:
             chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-            #chapdetect = re.compile(r"(?=<(p|div))<(?Pp|div)[^>]*>\s*(<(?Pspan|[ibu])\s[^>]*>)?\s*(<(?Pspan|[ibu])\s[^>]*>)?\s*(<(?Pspan|[ibu])\s[^>]*>)?\s*(?P.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8})\s*()>)?\s*()>)?\s*()\s[^>]*>)?\s)>(<(?Pp|div)[^>]*>\s*(<(?Pspan|[ibu])\s[^>]*>)?\s*(<(?Pspan|[ibu])\s[^>]*>)?\s*(<(?Pspan|[ibu])\s[^>]*>)?\s*(?P(\s*[\w\'\"-]+){1,5})\s*(</(?P=<inner_six>)>)?\s*(</(?P=<inner_five>)>)?\s*(</(?P=<inner_four>)\s[^>]*>)?\s</(?P=<outer_two>)>)?", re.IGNORECASE)
-            #chapdetect = re.compile(r'(?=</?(br|p))(<(?P<outer>(/?br|p))[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(?P=outer)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
             html = chapdetect.sub(self.chapter_head, html)
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
             chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
             chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-            #chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)
 
         if self.html_preprocess_sections < 10:
@@ -208,7 +205,7 @@ class PreProcessor(object):
         # Check Line histogram to determine if the document uses hard line breaks, If 50% or
         # more of the lines break in the same region of the document then unwrapping is required
         hardbreaks = line_length(format, html, .50, 'histogram')
-        print "Hard line breaks check returned "+str(hardbreaks)
+        #print "Hard line breaks check returned "+str(hardbreaks)
         # Calculate Length
         unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
         length = line_length(format, html, unwrap_factor, 'median')
@@ -243,7 +240,7 @@ class PreProcessor(object):
         if self.html_preprocess_sections < 10:
             self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
             chapdetect3 = re.compile(r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
-            #html = chapdetect3.sub(self.chapter_break, html)
+            html = chapdetect3.sub(self.chapter_break, html)
         # search for places where a first or second level heading is immediately followed by another
         # top level heading. demote the second heading to h3 to prevent splitting between chapter
         # headings and titles, images, etc