tweaked chapter_markup for false positives/negatives

This commit is contained in:
ldolse 2011-01-19 00:18:11 +08:00
parent 4fd784a9c1
commit 80065cb443
2 changed files with 12 additions and 10 deletions

View File

@ -174,7 +174,7 @@ class HeuristicProcessor(object):
if wordcount > 200000: if wordcount > 200000:
typical_chapters = 15000. typical_chapters = 15000.
self.min_chapters = int(ceil(wordcount / typical_chapters)) self.min_chapters = int(ceil(wordcount / typical_chapters))
print "minimum chapters required are: "+str(self.min_chapters) self.log.debug("minimum chapters required are: "+str(self.min_chapters))
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE) heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html)) self.html_preprocess_sections = len(heading.findall(html))
self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
@ -208,12 +208,12 @@ class HeuristicProcessor(object):
n_lookahead_close = ")" n_lookahead_close = ")"
default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)" default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter).{0,50}?(</[ibu][^>]*>)?(?=<)" simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"
analysis_result = [] analysis_result = []
chapter_types = [ chapter_types = [
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'], [r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
[r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
[r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters [r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
@ -274,10 +274,9 @@ class HeuristicProcessor(object):
title_req = True title_req = True
strict_title = False strict_title = False
self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ") self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
print "max chapters is "+str(self.max_chapters)
if type_name == 'common': if type_name == 'common':
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name]) analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
elif self.min_chapters <= hits < self.max_chapters: elif self.min_chapters <= hits < max_chapters:
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name]) analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
break break
else: else:
@ -423,7 +422,6 @@ class HeuristicProcessor(object):
except: except:
self.log.warn("Can't get wordcount") self.log.warn("Can't get wordcount")
print "found "+unicode(self.totalwords)+" words in the flow"
if self.totalwords < 50: if self.totalwords < 50:
self.log.warn("flow is too short, not running heuristics") self.log.warn("flow is too short, not running heuristics")
return html return html

View File

@ -285,10 +285,14 @@ remove all non-breaking-space entities.
:guilabel:`Detect and markup unformatted chapter headings and sub headings` :guilabel:`Detect and markup unformatted chapter headings and sub headings`
If your document does not have Chapter Markers and titles formatted differently from the rest of the text, If your document does not have Chapter Markers and titles formatted differently from the rest of the text,
|app| can use this option to attempt to detect them and surround them with heading tags. &lt;h2&gt; tags are used |app| can use this option to attempt to detect them and surround them with heading tags. &lt;h2&gt; tags are used
for chapter headings; &lt;h3&gt; tags are used for any titles that are detected. This function will for chapter headings; &lt;h3&gt; tags are used for any titles that are detected.
not create a TOC, but in many cases it will cause |app|'s default chapter detection settings to correctly
detect chapters and build a TOC. Adjust the XPath under Structure Detection if a TOC is not automatically This function will not create a TOC, but in many cases it will cause |app|'s default chapter detection settings
created. The inserted heading tags are not formatted, to apply formatting use the 'extra_css' option under to correctly detect chapters and build a TOC. Adjust the XPath under Structure Detection if a TOC is not automatically
created. If there are no other headings used in the document then setting "//h:h2" under Structure Detection would
be the easiest way to create a TOC for the document.
The inserted headings are not formatted; to apply formatting, use the 'extra_css' option under
the Look and Feel conversion settings. For example, to center heading tags, use the following:: the Look and Feel conversion settings. For example, to center heading tags, use the following::
h2, h3 { text-align: center } h2, h3 { text-align: center }