tweaked chapter_markup for false positives/negatives

This commit is contained in:
ldolse 2011-01-19 00:18:11 +08:00
parent 4fd784a9c1
commit 80065cb443
2 changed files with 12 additions and 10 deletions

View File

@ -174,7 +174,7 @@ class HeuristicProcessor(object):
if wordcount > 200000:
typical_chapters = 15000.
self.min_chapters = int(ceil(wordcount / typical_chapters))
print "minimum chapters required are: "+str(self.min_chapters)
self.log.debug("minimum chapters required are: "+str(self.min_chapters))
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
self.log.debug("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
@ -208,12 +208,12 @@ class HeuristicProcessor(object):
n_lookahead_close = ")"
default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter).{0,50}?(</[ibu][^>]*>)?(?=<)"
simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"
analysis_result = []
chapter_types = [
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\s|Prologue|Book\s|Part\s|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
[r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)\s*([\d\w-]+\:?\'?\s*){0,5}", True, True, True, False, "Searching for common section headings", 'common'],
[r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'], # Highest frequency headings which include titles
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>", True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
[r"[^'\"]?(\d+(\.|:))\s*([\dA-Z\-\'\"#,]+\s*){0,7}\s*", True, True, True, False, "Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
@ -274,10 +274,9 @@ class HeuristicProcessor(object):
title_req = True
strict_title = False
self.log.debug(unicode(type_name)+" had "+unicode(hits)+" hits - "+unicode(self.chapters_no_title)+" chapters with no title, "+unicode(self.chapters_with_title)+" chapters with titles, "+unicode(float(self.chapters_with_title) / float(hits))+" percent. ")
print "max chapters is "+str(self.max_chapters)
if type_name == 'common':
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
elif self.min_chapters <= hits < self.max_chapters:
elif self.min_chapters <= hits < max_chapters:
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
break
else:
@ -423,7 +422,6 @@ class HeuristicProcessor(object):
except:
self.log.warn("Can't get wordcount")
print "found "+unicode(self.totalwords)+" words in the flow"
if self.totalwords < 50:
self.log.warn("flow is too short, not running heuristics")
return html

View File

@ -285,10 +285,14 @@ remove all non-breaking-space entities.
:guilabel:`Detect and markup unformatted chapter headings and sub headings`
If your document does not have Chapter Markers and titles formatted differently from the rest of the text,
|app| can use this option to attempt detection them and surround them with heading tags. &lt;h2&gt; tags are used
for chapter headings; &lt;h3&gt; tags are used for any titles that are detected. This function will
not create a TOC, but in many cases it will cause |app|'s default chapter detection settings to correctly
detect chapters and build a TOC. Adjust the Xpath under Structure Detection if a TOC is not automatically
created. The inserted heading tags are not formatted, to apply formatting use the 'extra_css' option under
for chapter headings; &lt;h3&gt; tags are used for any titles that are detected.
This function will not create a TOC, but in many cases it will cause |app|'s default chapter detection settings
to correctly detect chapters and build a TOC. Adjust the Xpath under Structure Detection if a TOC is not automatically
created. If there are no other headings used in the document then setting "//h:h2" under Structure Detection would
be the easiest way to create a TOC for the document.
The inserted headings are not formatted, to apply formatting use the 'extra_css' option under
the Look and Feel conversion settings. For example, to center heading tags, use the following::
h2, h3 { text-align: center }