def dump(self, raw, where):
    """Save a snapshot of ``raw`` into the debug pipeline directory.

    Does nothing unless ``self.extra_opts.debug_pipeline`` is set and names
    an existing path. Otherwise the snapshot is written to
    ``<debug_pipeline>/preprocess/<where>/NNNN.html``, where ``NNNN`` is the
    lowest 1-based counter (zero-padded to four digits) whose file name is
    not already present in that directory, so repeated calls for the same
    ``where`` do not overwrite earlier dumps.

    :param raw: Markup to save. Text is encoded as UTF-8; a byte string is
        written unchanged.
    :param where: Name of the sub-directory identifying the dump point.
    :return: None.
    """
    import os

    debug_dir = getattr(self.extra_opts, 'debug_pipeline', None)
    if not debug_dir or not os.path.exists(debug_dir):
        # Debug output was not requested (or its directory is gone).
        return

    out_dir = os.path.join(debug_dir, 'preprocess', where)
    if not os.path.exists(out_dir):
        # makedirs also creates the intermediate 'preprocess' directory.
        os.makedirs(out_dir)

    # Probe 0001.html, 0002.html, ... until an unused name is found.
    index = 1
    path = os.path.join(out_dir, '%04d.html' % index)
    while os.path.exists(path):
        index += 1
        path = os.path.join(out_dir, '%04d.html' % index)

    # Calling .encode() on a byte string either fails outright or triggers
    # an implicit ASCII decode first, so only text is encoded here.
    data = raw if isinstance(raw, bytes) else raw.encode('utf-8')
    with open(path, 'wb') as f:
        f.write(data)

){0,2}\s*" else: @@ -169,7 +187,7 @@ class PreProcessor(object): title_header_close = ")\s*" title_line_close = "()?\s*()?\s*(]*>)?\s*" opt_title_close = ")?" - n_lookahead_open = "(?!=" + n_lookahead_open = "\s+(?!" n_lookahead_close = ")" default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)" @@ -181,10 +199,10 @@ class PreProcessor(object): full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - print "n_lookahead is " + n_lookahead - print "Chapter line is " + full_chapter_line + "\n\n" + #print "n_lookahead is:\n" + n_lookahead + "\n\n" + #print "'normal' Chapter line - no title - is:\n" + full_chapter_line + "\n\n" chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - print chapter_marker + #print "full chapter regex with lookahead is:\n" + chapter_marker + "\n\n" heading = re.compile(']*>', re.IGNORECASE) self.html_preprocess_sections = len(heading.findall(html)) self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") @@ -197,10 +215,10 @@ class PreProcessor(object): self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters") full_chapter_line = chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - print "n_lookahead is " + n_lookahead - print "Chapter line is " + full_chapter_line + "\n\n" + #print "n_lookahead is " + n_lookahead + #print "Chapter line is " + full_chapter_line + "\n\n" chapter_marker = 
lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - print chapter_marker + #print chapter_marker chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) html = chapdetect2.sub(self.chapter_head, html) @@ -208,10 +226,10 @@ class PreProcessor(object): self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying emphazised lines") full_chapter_line = chapter_line_open+chapter_header_open+emphasized_lines+chapter_header_close+chapter_line_close n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - print "n_lookahead is " + n_lookahead - print "Chapter line is " + full_chapter_line + "\n\n" + #print "n_lookahead is " + n_lookahead + #print "Chapter line is " + full_chapter_line + "\n\n" chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - print chapter_marker + #print chapter_marker chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) html = chapdetect2.sub(self.chapter_head, html) @@ -219,10 +237,10 @@ class PreProcessor(object): self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words") full_chapter_line = chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - print "n_lookahead is " + n_lookahead - print "Chapter line is " + full_chapter_line + "\n\n" + #print "n_lookahead is " + n_lookahead + #print "Chapter line is " + full_chapter_line + "\n\n" chapter_marker = 
lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - print chapter_marker + #print chapter_marker chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE) html = chapdetect2.sub(self.chapter_head, html) @@ -230,10 +248,10 @@ class PreProcessor(object): self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters with titles") full_chapter_line = chapter_line_open+chapter_header_open+numeric_titles+chapter_header_close+chapter_line_close n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) - print "n_lookahead is " + n_lookahead - print "Chapter line is " + full_chapter_line + "\n\n" + #print "n_lookahead is " + n_lookahead + #print "Chapter line is " + full_chapter_line + "\n\n" chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close - print chapter_marker + #print chapter_marker chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) html = chapdetect2.sub(self.chapter_head, html)