diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index d2bdba4928..54639df93c 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -473,12 +473,6 @@ class HTMLPreProcessor(object):
             # unwrap/delete soft hyphens with formatting
             end_rules.append((re.compile(u'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
 
-        # Make the more aggressive chapter marking regex optional with the preprocess option to
-        # reduce false positives and move after header/footer removal
-        if getattr(self.extra_opts, 'preprocess_html', None):
-            if is_pdftohtml:
-                end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
-
         length = -1
         if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
             docanalysis = DocAnalysis('pdf', html)
@@ -525,11 +519,10 @@ class HTMLPreProcessor(object):
             html = dehyphenator(html,'html', length)
 
         if is_pdftohtml:
-            from calibre.ebooks.conversion.utils import PreProcessor
-            pdf_markup = PreProcessor(self.extra_opts, None)
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            pdf_markup = HeuristicProcessor(self.extra_opts, None)
             totalwords = 0
-            totalwords = pdf_markup.get_word_count(html)
-            if totalwords > 7000:
+            if pdf_markup.get_word_count(html) > 7000:
                 html = pdf_markup.markup_chapters(html, totalwords, True)
 
         #dump(html, 'post-preprocess')
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 1a691b2e14..888d24d791 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -170,9 +170,9 @@ class HeuristicProcessor(object):
         # minimum of chapters to search for. A max limit is calculated to prevent things like OCR
         # or pdf page numbers from being treated as TOC markers
         if wordcount > 7000:
-            self.min_chapters = int(ceil(wordcount / 7000.))
-            self.max_chapters = int(ceil(wordcount / 100.))
-            #print "minimum chapters required are: "+str(self.min_chapters)
+            self.min_chapters = int(ceil(wordcount / 15000.))
+            self.max_chapters = int(ceil(wordcount / 1200.))
+            print "minimum chapters required are: "+str(self.min_chapters)
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
@@ -272,6 +272,7 @@ class HeuristicProcessor(object):
             title_req = True
             strict_title = False
         self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
+        print "max chapters is "+str(self.max_chapters)
        if type_name == 'common':
            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
        elif self.min_chapters <= hits < self.max_chapters:
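
Note: below is a minimal standalone sketch of the chapter-count bounds touched by the utils.py hunk, using only the arithmetic visible in the diff (the 7000-word gate from preprocess.py and the new 15000/1200 divisors). The helper name chapter_bounds and the sample word counts are illustrative, not taken from calibre.

    from math import ceil

    def chapter_bounds(wordcount):
        # Mirrors the updated bounds: documents at or below the 7000-word gate
        # are left alone; above it, expect at least one chapter per ~15000
        # words and reject candidate styles that match more often than once
        # per ~1200 words (OCR noise, pdf page numbers).
        if wordcount <= 7000:
            return None
        return (int(ceil(wordcount / 15000.)),   # min_chapters
                int(ceil(wordcount / 1200.)))    # max_chapters

    for wc in (5000, 40000, 120000):  # illustrative word counts only
        print(chapter_bounds(wc))
    # None, (3, 34), (8, 100)

A candidate heading style is then accepted only when its hit count falls in [min_chapters, max_chapters), per the `elif self.min_chapters <= hits < self.max_chapters:` branch in the last hunk.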