mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
tweaked chapter thresholds
This commit is contained in:
parent
21ff2be472
commit
539f24213d
@ -21,7 +21,6 @@ class HeuristicProcessor(object):
|
||||
self.deleted_nbsps = False
|
||||
self.totalwords = 0
|
||||
self.min_chapters = 1
|
||||
self.max_chapters = 150
|
||||
self.chapters_no_title = 0
|
||||
self.chapters_with_title = 0
|
||||
self.blanks_deleted = False
|
||||
@ -169,9 +168,12 @@ class HeuristicProcessor(object):
|
||||
# Typical chapters are between 2000 and 7000 words; use the larger number to decide the
|
||||
# minimum number of chapters to search for. A max limit is calculated to prevent things like OCR
|
||||
# or PDF page numbers from being treated as TOC markers.
|
||||
max_chapters = 150
|
||||
typical_chapters = 7000.
|
||||
if wordcount > 7000:
|
||||
self.min_chapters = int(ceil(wordcount / 15000.))
|
||||
self.max_chapters = int(ceil(wordcount / 1200.))
|
||||
if wordcount > 200000:
|
||||
typical_chapters = 15000.
|
||||
self.min_chapters = int(ceil(wordcount / typical_chapters))
|
||||
print "minimum chapters required are: "+str(self.min_chapters)
|
||||
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
|
||||
self.html_preprocess_sections = len(heading.findall(html))
|
||||
|
Loading…
x
Reference in New Issue
Block a user