diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index d2bdba4928..54639df93c 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -473,12 +473,6 @@ class HTMLPreProcessor(object): # unwrap/delete soft hyphens with formatting end_rules.append((re.compile(u'[]\s*((i|u|b)>)+(
\s*\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: '')) - # Make the more aggressive chapter marking regex optional with the preprocess option to - # reduce false positives and move after header/footer removal - if getattr(self.extra_opts, 'preprocess_html', None): - if is_pdftohtml: - end_rules.append((re.compile(r'
\s*(?P \s*(?P )?'), chap_head),)
-
length = -1
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
docanalysis = DocAnalysis('pdf', html)
@@ -525,11 +519,10 @@ class HTMLPreProcessor(object):
html = dehyphenator(html,'html', length)
if is_pdftohtml:
- from calibre.ebooks.conversion.utils import PreProcessor
- pdf_markup = PreProcessor(self.extra_opts, None)
+ from calibre.ebooks.conversion.utils import HeuristicProcessor
+ pdf_markup = HeuristicProcessor(self.extra_opts, None)
totalwords = 0
- totalwords = pdf_markup.get_word_count(html)
- if totalwords > 7000:
+ if pdf_markup.get_word_count(html) > 7000:
html = pdf_markup.markup_chapters(html, totalwords, True)
#dump(html, 'post-preprocess')
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 1a691b2e14..888d24d791 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -170,9 +170,9 @@ class HeuristicProcessor(object):
# minimum of chapters to search for. A max limit is calculated to prevent things like OCR
# or pdf page numbers from being treated as TOC markers
if wordcount > 7000:
- self.min_chapters = int(ceil(wordcount / 7000.))
- self.max_chapters = int(ceil(wordcount / 100.))
- #print "minimum chapters required are: "+str(self.min_chapters)
+ self.min_chapters = int(ceil(wordcount / 15000.))
+ self.max_chapters = int(ceil(wordcount / 1200.))
+ print "minimum chapters required are: "+str(self.min_chapters)
heading = re.compile('