Added a negative lookahead to reduce false-positive matches during chapter marking

This commit is contained in:
ldolse 2010-11-17 17:55:50 +08:00
parent b03b802394
commit fb124c50a7

View File

@ -83,6 +83,24 @@ class PreProcessor(object):
if min_lns > tot_htm_ends: if min_lns > tot_htm_ends:
return True return True
def dump(self, raw, where):
    """Save ``raw`` as a numbered HTML file for debugging the pipeline.

    Nothing is written unless ``self.extra_opts.debug_pipeline`` is set and
    names an existing path.  Otherwise the markup is stored as
    ``<debug_pipeline>/preprocess/<where>/NNNN.html``, where ``NNNN`` is the
    lowest 1-based, zero-padded counter not already present in that
    directory, so earlier dumps are never overwritten.

    :param raw: markup to save.  Text is encoded as UTF-8; a byte string is
        written unchanged (calling ``.encode`` on it would fail for
        non-ASCII data).
    :param where: name of the sub-directory identifying this dump stage.
    :raises OSError: if the output directory cannot be created.
    """
    import os

    dp = getattr(self.extra_opts, 'debug_pipeline', None)
    if not dp or not os.path.exists(dp):
        # Debug output disabled, or its root directory is missing.
        return

    odir = os.path.join(dp, 'preprocess', where)
    try:
        # makedirs creates the intermediate 'preprocess' directory as well.
        os.makedirs(odir)
    except OSError:
        # An already existing directory is fine (possibly created between
        # two calls); anything else is a real failure.
        if not os.path.isdir(odir):
            raise

    # Pick the first unused sequence number.
    i = 1
    while os.path.exists(os.path.join(odir, '%04d.html' % i)):
        i += 1

    data = raw if isinstance(raw, bytes) else raw.encode('utf-8')
    with open(os.path.join(odir, '%04d.html' % i), 'wb') as f:
        f.write(data)
def __call__(self, html): def __call__(self, html):
self.log("********* Preprocessing HTML *********") self.log("********* Preprocessing HTML *********")
@ -150,7 +168,7 @@ class PreProcessor(object):
#print "blanks between paragraphs is marked True" #print "blanks between paragraphs is marked True"
else: else:
blanks_between_paragraphs = False blanks_between_paragraphs = False
#self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n") #self.dump(html, 'before_chapter_markup')
# detect chapters/sections to match xpath or splitting logic # detect chapters/sections to match xpath or splitting logic
# #
# Build the Regular Expressions in pieces # Build the Regular Expressions in pieces
@ -158,7 +176,7 @@ class PreProcessor(object):
chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*" chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
chapter_header_open = r"(?P<chap>" chapter_header_open = r"(?P<chap>"
chapter_header_close = ")\s*" chapter_header_close = ")\s*"
chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>\s*" chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)>)?\s*</(?P=outer)>"
if blanks_between_paragraphs: if blanks_between_paragraphs:
blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*" blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
else: else:
@ -169,7 +187,7 @@ class PreProcessor(object):
title_header_close = ")\s*" title_header_close = ")\s*"
title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>" title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
opt_title_close = ")?" opt_title_close = ")?"
n_lookahead_open = "(?!=" n_lookahead_open = "\s+(?!"
n_lookahead_close = ")" n_lookahead_close = ")"
default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)" default_title = r"(\s*[\w\'\"-]+){1,5}?(?=<)"
@ -181,10 +199,10 @@ class PreProcessor(object):
full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close full_chapter_line = chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
print "n_lookahead is " + n_lookahead #print "n_lookahead is:\n" + n_lookahead + "\n\n"
print "Chapter line is " + full_chapter_line + "\n\n" #print "'normal' Chapter line - no title - is:\n" + full_chapter_line + "\n\n"
chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
print chapter_marker #print "full chapter regex with lookahead is:\n" + chapter_marker + "\n\n"
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE) heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html)) self.html_preprocess_sections = len(heading.findall(html))
self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings") self.log("found " + unicode(self.html_preprocess_sections) + " pre-existing headings")
@ -197,10 +215,10 @@ class PreProcessor(object):
self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters") self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters")
full_chapter_line = chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close full_chapter_line = chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
print "n_lookahead is " + n_lookahead #print "n_lookahead is " + n_lookahead
print "Chapter line is " + full_chapter_line + "\n\n" #print "Chapter line is " + full_chapter_line + "\n\n"
chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
print chapter_marker #print chapter_marker
chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
html = chapdetect2.sub(self.chapter_head, html) html = chapdetect2.sub(self.chapter_head, html)
@ -208,10 +226,10 @@ class PreProcessor(object):
self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying emphazised lines") self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying emphazised lines")
full_chapter_line = chapter_line_open+chapter_header_open+emphasized_lines+chapter_header_close+chapter_line_close full_chapter_line = chapter_line_open+chapter_header_open+emphasized_lines+chapter_header_close+chapter_line_close
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
print "n_lookahead is " + n_lookahead #print "n_lookahead is " + n_lookahead
print "Chapter line is " + full_chapter_line + "\n\n" #print "Chapter line is " + full_chapter_line + "\n\n"
chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
print chapter_marker #print chapter_marker
chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
html = chapdetect2.sub(self.chapter_head, html) html = chapdetect2.sub(self.chapter_head, html)
@ -219,10 +237,10 @@ class PreProcessor(object):
self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words") self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying with uppercase words")
full_chapter_line = chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close full_chapter_line = chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
print "n_lookahead is " + n_lookahead #print "n_lookahead is " + n_lookahead
print "Chapter line is " + full_chapter_line + "\n\n" #print "Chapter line is " + full_chapter_line + "\n\n"
chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
print chapter_marker #print chapter_marker
chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE) chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html) html = chapdetect2.sub(self.chapter_head, html)
@ -230,10 +248,10 @@ class PreProcessor(object):
self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters with titles") self.log("not enough chapters, only " + unicode(self.html_preprocess_sections) + ", trying numeric chapters with titles")
full_chapter_line = chapter_line_open+chapter_header_open+numeric_titles+chapter_header_close+chapter_line_close full_chapter_line = chapter_line_open+chapter_header_open+numeric_titles+chapter_header_close+chapter_line_close
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line) n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
print "n_lookahead is " + n_lookahead #print "n_lookahead is " + n_lookahead
print "Chapter line is " + full_chapter_line + "\n\n" #print "Chapter line is " + full_chapter_line + "\n\n"
chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close chapter_marker = lookahead+full_chapter_line+blank_lines+n_lookahead_open+n_lookahead+n_lookahead_close+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
print chapter_marker #print chapter_marker
chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE) chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
html = chapdetect2.sub(self.chapter_head, html) html = chapdetect2.sub(self.chapter_head, html)