From e9f5cb683db7425c8fdf6c01523d69e085f221e4 Mon Sep 17 00:00:00 2001 From: ldolse Date: Fri, 10 Dec 2010 13:29:36 -0800 Subject: [PATCH] tweaked chapter markup function --- src/calibre/ebooks/conversion/utils.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index c42068cfe0..0665cccb14 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -103,7 +103,6 @@ class PreProcessor(object): f.write(raw.encode('utf-8')) def get_word_count(self, html): - totalwords = 0 word_count_text = re.sub(r'(?s)]*>.*?', '', html) word_count_text = re.sub(r'<[^>]*>', '', word_count_text) wordcount = get_wordcount_obj(word_count_text) @@ -162,15 +161,13 @@ class PreProcessor(object): chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE) else: chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close - chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE) - + chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE) html = chapdetect.sub(self.chapter_head, html) words_per_chptr = wordcount if words_per_chptr > 0 and self.html_preprocess_sections > 0: words_per_chptr = wordcount / self.html_preprocess_sections print "Total wordcount is: "+ str(wordcount)+", Average words per section is: "+str(words_per_chptr)+", Marked up "+str(self.html_preprocess_sections)+" chapters" - return html @@ -180,10 +177,11 @@ class PreProcessor(object): # Count the words in the document to estimate how many chapters to look for and whether # other types of processing are attempted + totalwords = 0 totalwords = self.get_word_count(html) - if totalwords < 10: - print "not enough text, not preprocessing" + if totalwords < 20: + self.log("not enough text, not preprocessing") 
return html # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly @@ -255,7 +253,7 @@ class PreProcessor(object): # detect chapters/sections to match xpath or splitting logic # - self.markup_chapters(html, totalwords, blanks_between_paragraphs) + html = self.markup_chapters(html, totalwords, blanks_between_paragraphs) ###### Unwrap lines ######