Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-08 18:54:09 -04:00)
implemented multi-pass analysis for chapter detection
parent 05730e1886
commit a0aa719bb0
@@ -21,6 +21,7 @@ class HeuristicProcessor(object):
         self.deleted_nbsps = False
         self.totalwords = 0
         self.min_chapters = 1
+        self.max_chapters = 150
         self.chapters_no_title = 0
         self.chapters_with_title = 0
         self.blanks_deleted = False
@@ -132,7 +133,7 @@ class HeuristicProcessor(object):
     def markup_italicis(self, html):
         ITALICIZE_WORDS = [
             'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
-            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetra', 'n.b.', 'N.b.',
+            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
             'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
             'Mlle.', 'Mons.', 'PS.', 'PPS.',
         ]
@@ -166,9 +167,11 @@ class HeuristicProcessor(object):
         with minimum false positives. Exits after finding a successful pattern
         '''
         # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
-        # minimum of chapters to search for
+        # minimum of chapters to search for. A max limit is calculated to prevent things like OCR
+        # or pdf page numbers from being treated as TOC markers
         if wordcount > 7000:
             self.min_chapters = int(ceil(wordcount / 7000.))
+            self.max_chapters = int(ceil(wordcount / 100.))
         #print "minimum chapters required are: "+str(self.min_chapters)
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
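The sizing logic above can be read in isolation: min_chapters assumes a chapter rarely exceeds about 7000 words, while the new max_chapters cap assumes a chapter is rarely shorter than about 100 words, so a pattern that fires more often than that (OCR noise, pdf page numbers) can be rejected later. A minimal standalone sketch of that arithmetic (the helper name chapter_bounds is hypothetical, not part of the patch):

    from math import ceil

    def chapter_bounds(wordcount):
        # Defaults mirror the attribute initialisation in the first hunk.
        min_chapters, max_chapters = 1, 150
        if wordcount > 7000:
            # At most ~7000 words per chapter -> lower bound on chapter count.
            min_chapters = int(ceil(wordcount / 7000.))
            # At least ~100 words per chapter -> upper bound on chapter count.
            max_chapters = int(ceil(wordcount / 100.))
        return min_chapters, max_chapters

    print(chapter_bounds(90000))  # a 90k-word book: (13, 900)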
@@ -202,44 +205,84 @@ class HeuristicProcessor(object):
         n_lookahead_open = "\s+(?!"
         n_lookahead_close = ")"
 
-        default_title = r"(<[ibu][^>]*>)?\s{0,3}([\w\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+        default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
+        simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter).{0,50}?(</[ibu][^>]*>)?(?=<)"
 
+        analysis_result = []
+
         chapter_types = [
-            [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles
-            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, "Searching for common section headings", 'common'],
-            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
-            [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
-            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering
-            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
-            [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon
-            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
+            [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
+            [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles
+            [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
+            [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
+            [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering
+            [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False, "Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
+            [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon
+            [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False, "Searching for chapters with Uppercase Characters", 'uppercase' ] # Uppercase Chapters
             ]
 
         def recurse_patterns(html, analyze):
             # Start with most typical chapter headings, get more aggressive until one works
-            for [chapter_type, lookahead_ignorecase, log_message, type_name] in chapter_types:
+            for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
+                n_lookahead = ''
+                hits = 0
+                self.chapters_no_title = 0
+                self.chapters_with_title = 0
+
+                if n_lookahead_req:
+                    lp_n_lookahead_open = n_lookahead_open
+                    lp_n_lookahead_close = n_lookahead_close
+                else:
+                    lp_n_lookahead_open = ''
+                    lp_n_lookahead_close = ''
+
+                if strict_title:
+                    lp_title = default_title
+                else:
+                    lp_title = simple_title
+
+                if ignorecase:
+                    arg_ignorecase = r'(?i)'
+                else:
+                    arg_ignorecase = ''
+
+                if title_req:
+                    lp_opt_title_open = ''
+                    lp_opt_title_close = ''
+                else:
+                    lp_opt_title_open = opt_title_open
+                    lp_opt_title_close = opt_title_close
+
                 if self.html_preprocess_sections >= self.min_chapters:
                     break
                 full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
-                n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
-                self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
-                if lookahead_ignorecase:
-                    chapter_marker = init_lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
-                    chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
-                else:
-                    chapter_marker = init_lookahead+full_chapter_line+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close+n_lookahead_open+n_lookahead+n_lookahead_close
-                    chapdetect = re.compile(r'%s' % chapter_marker, re.UNICODE)
+                if n_lookahead_req:
+                    n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
+                if not analyze:
+                    self.log.debug("Marked " + unicode(self.html_preprocess_sections) + " headings, " + log_message)
+
+                chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
+                chapdetect = re.compile(r'%s' % chapter_marker)
+
                 if analyze:
                     hits = len(chapdetect.findall(html))
-                    print unicode(type_name)+" had "+unicode(hits)+" hits"
-                    chapdetect.sub(self.analyze_title_matches, html)
-                    print unicode(self.chapters_no_title)+" chapters with no title"
-                    print unicode(self.chapters_with_title)+" chapters with titles"
+                    if hits:
+                        chapdetect.sub(self.analyze_title_matches, html)
+                        if float(self.chapters_with_title) / float(hits) > .5:
+                            title_req = True
+                            strict_title = False
+                        self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
+                        if type_name == 'common':
+                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
+                        elif self.min_chapters <= hits < self.max_chapters:
+                            analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
+                            break
                 else:
                     html = chapdetect.sub(self.chapter_head, html)
             return html
 
         recurse_patterns(html, True)
+        chapter_types = analysis_result
         html = recurse_patterns(html, False)
 
         words_per_chptr = wordcount
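Taken together, this hunk is the multi-pass analysis named in the commit message: recurse_patterns(html, True) only counts how often each heading pattern fires and how often a title follows it, keeps the patterns whose hit count falls between min_chapters and max_chapters in analysis_result, and the second call with analyze=False then re-runs just those surviving patterns to insert the actual chapter markup. Each chapter_types entry now carries four boolean flags (n_lookahead_req, strict_title, ignorecase, title_req) instead of the old single lookahead_ignorecase flag, and the analysis pass flips title_req/strict_title when more than half of a pattern's hits come with titles. A much-simplified sketch of that two-pass shape, with made-up pattern data and a plain <h2> substitution standing in for chapter_head (detect_headings and its arguments are illustrative only, not calibre API):

    import re

    def detect_headings(html, patterns, min_hits, max_hits):
        survivors = []
        for pattern, type_name in patterns:            # analysis pass: count only
            hits = len(re.findall(pattern, html))
            if min_hits <= hits < max_hits:
                survivors.append((pattern, type_name))
        for pattern, type_name in survivors:           # markup pass: rewrite the html
            html = re.sub(pattern, r'<h2>\g<0></h2>', html)
        return html

    sample = '<p>Chapter 1</p><p>text</p><p>Chapter 2</p><p>more text</p>'
    print(detect_headings(sample, [(r'Chapter \d+', 'chapter')], 1, 50))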
@@ -293,7 +336,7 @@ class HeuristicProcessor(object):
         pre = re.compile(r'<pre>', re.IGNORECASE)
         if len(pre.findall(html)) >= 1:
             self.log.debug("Running Text Processing")
-            outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
+            outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
            html = outerhtml.sub(self.txt_process, html)
         else:
             # Add markup naively
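The regex change in this hunk swaps a greedy capture bounded by a lookahead for a non-greedy capture that consumes its own closing </pre>; with the old pattern, a document containing several <pre> blocks had everything up to the last </pre> pulled into a single capture, intervening markup included. A small illustrative check (the sample string is made up):

    import re

    old = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
    new = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
    sample = '<pre>one</pre><p>keep me</p><pre>two</pre>'

    # Greedy version spans across blocks and swallows the markup in between.
    print(old.search(sample).group('text'))                  # one</pre><p>keep me</p><pre>two
    # Non-greedy version stops at the first </pre>, one match per block.
    print([m.group('text') for m in new.finditer(sample)])   # ['one', 'two']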