mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
fix pdf preprocess call
This commit is contained in:
parent
a0aa719bb0
commit
3ca18da2cf
@ -473,12 +473,6 @@ class HTMLPreProcessor(object):
|
|||||||
# unwrap/delete soft hyphens with formatting
|
# unwrap/delete soft hyphens with formatting
|
||||||
end_rules.append((re.compile(u'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
end_rules.append((re.compile(u'[]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
|
||||||
|
|
||||||
# Make the more aggressive chapter marking regex optional with the preprocess option to
|
|
||||||
# reduce false positives and move after header/footer removal
|
|
||||||
if getattr(self.extra_opts, 'preprocess_html', None):
|
|
||||||
if is_pdftohtml:
|
|
||||||
end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
|
|
||||||
|
|
||||||
length = -1
|
length = -1
|
||||||
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
|
||||||
docanalysis = DocAnalysis('pdf', html)
|
docanalysis = DocAnalysis('pdf', html)
|
||||||
@ -525,11 +519,10 @@ class HTMLPreProcessor(object):
|
|||||||
html = dehyphenator(html,'html', length)
|
html = dehyphenator(html,'html', length)
|
||||||
|
|
||||||
if is_pdftohtml:
|
if is_pdftohtml:
|
||||||
from calibre.ebooks.conversion.utils import PreProcessor
|
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||||
pdf_markup = PreProcessor(self.extra_opts, None)
|
pdf_markup = HeuristicProcessor(self.extra_opts, None)
|
||||||
totalwords = 0
|
totalwords = 0
|
||||||
totalwords = pdf_markup.get_word_count(html)
|
if pdf_markup.get_word_count(html) > 7000:
|
||||||
if totalwords > 7000:
|
|
||||||
html = pdf_markup.markup_chapters(html, totalwords, True)
|
html = pdf_markup.markup_chapters(html, totalwords, True)
|
||||||
|
|
||||||
#dump(html, 'post-preprocess')
|
#dump(html, 'post-preprocess')
|
||||||
|
@ -170,9 +170,9 @@ class HeuristicProcessor(object):
|
|||||||
# minimum of chapters to search for. A max limit is calculated to prevent things like OCR
|
# minimum of chapters to search for. A max limit is calculated to prevent things like OCR
|
||||||
# or pdf page numbers from being treated as TOC markers
|
# or pdf page numbers from being treated as TOC markers
|
||||||
if wordcount > 7000:
|
if wordcount > 7000:
|
||||||
self.min_chapters = int(ceil(wordcount / 7000.))
|
self.min_chapters = int(ceil(wordcount / 15000.))
|
||||||
self.max_chapters = int(ceil(wordcount / 100.))
|
self.max_chapters = int(ceil(wordcount / 1200.))
|
||||||
#print "minimum chapters required are: "+str(self.min_chapters)
|
print "minimum chapters required are: "+str(self.min_chapters)
|
||||||
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
|
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
|
||||||
self.html_preprocess_sections = len(heading.findall(html))
|
self.html_preprocess_sections = len(heading.findall(html))
|
||||||
self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
|
self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
|
||||||
@ -272,6 +272,7 @@ class HeuristicProcessor(object):
|
|||||||
title_req = True
|
title_req = True
|
||||||
strict_title = False
|
strict_title = False
|
||||||
self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
|
self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
|
||||||
|
print "max chapters is "+str(self.max_chapters)
|
||||||
if type_name == 'common':
|
if type_name == 'common':
|
||||||
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
|
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
|
||||||
elif self.min_chapters <= hits < self.max_chapters:
|
elif self.min_chapters <= hits < self.max_chapters:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user