Tuned the dehyphenation code to better handle unwrapped documents; added a line-length histogram function to determine whether a document has hard line breaks

This commit is contained in:
ldolse 2010-09-21 18:18:50 -10:00
parent 301af532c6
commit 936451853c
2 changed files with 134 additions and 52 deletions

View File

@ -62,49 +62,97 @@ def wrap_lines(match):
else: else:
return ital+' ' return ital+' '
def line_length(format, raw, percent): def line_length(format, raw, percent, test_type):
''' '''
raw is the raw text to find the line length to use for wrapping. Analyses the document to see if hard line breaks exist or to find the
median line length.
format is the type of document analysis will be done against.
raw is the raw text to determine the line length to use for wrapping.
percentage is a decimal number, 0 - 1 which is used to determine percentage is a decimal number, 0 - 1 which is used to determine
how far in the list of line lengths to use. The list of line lengths is how far in the list of line lengths to use. The list of line lengths is
ordered smallest to larged and does not include duplicates. 0.5 is the ordered smallest to larged and does not include duplicates. 0.5 is the
median value. median value.
test_type sets whether to use the line length to return the median or a
do a histogram analysis to see if unwrapping is required.
''' '''
raw = raw.replace(' ', ' ') raw = raw.replace(' ', ' ')
if format == 'html': if format == 'html':
linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL) linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
elif format == 'pdf': elif format == 'pdf':
linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL) linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
elif format == 'spanned_html': elif format == 'spanned_html':
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL) linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
lines = linere.findall(raw) lines = linere.findall(raw)
lengths = [] if test_type == 'median':
for line in lines: lengths = []
if len(line) > 0: for line in lines:
lengths.append(len(line)) if len(line) > 0:
lengths.append(len(line))
if not lengths: if not lengths:
return 0 return 0
lengths = list(set(lengths)) lengths = list(set(lengths))
total = sum(lengths) total = sum(lengths)
avg = total / len(lengths) avg = total / len(lengths)
max_line = avg * 2 max_line = avg * 2
lengths = sorted(lengths) lengths = sorted(lengths)
for i in range(len(lengths) - 1, -1, -1): for i in range(len(lengths) - 1, -1, -1):
if lengths[i] > max_line: if lengths[i] > max_line:
del lengths[i] del lengths[i]
if percent > 1: if percent > 1:
percent = 1 percent = 1
if percent < 0: if percent < 0:
percent = 0 percent = 0
index = int(len(lengths) * percent) - 1 index = int(len(lengths) * percent) - 1
return lengths[index] return lengths[index]
if test_type == 'histogram':
minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
maxLineLength=1900 # Discard larger than this to stay in range
buckets=20 # Each line is divided into a bucket based on length
#print "there are "+str(len(lines))+" lines"
max = 0
for line in lines:
l = len(line)
if l > max:
max = l
print "max line found is "+str(max)
# Build the line length histogram
hRaw = [ 0 for i in range(0,buckets) ]
for line in lines:
l = len(line)
if l > minLineLength and l < maxLineLength:
l = int(l/100)
#print "adding "+str(l)
hRaw[l]+=1
# Normalize the histogram into percents
totalLines = len(lines)
h = [ float(count)/totalLines for count in hRaw ]
print "\nhRaw histogram lengths are: "+str(hRaw)
print " percents are: "+str(h)+"\n"
# Find the biggest bucket
maxValue = 0
peakPosition = 0
for i in range(0,len(h)):
if h[i] > maxValue:
maxValue = h[i]
peakPosition = i
if maxValue < percent:
#print "Line lengths are too variable. Not unwrapping."
return False
else:
#print str(maxValue)+" of the lines were in one bucket"
return True
class Dehyphenator(object): class Dehyphenator(object):
''' '''
@ -117,7 +165,7 @@ class Dehyphenator(object):
def __init__(self): def __init__(self):
# Add common suffixes to the regex below to increase the likelihood of a match - # Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex' # don't add suffixes which are also complete words, such as 'able' or 'sex'
self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE) self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation # remove prefixes if the prefix was not already the point of hyphenation
self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE) self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE) self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
@ -125,34 +173,54 @@ class Dehyphenator(object):
def dehyphenate(self, match): def dehyphenate(self, match):
firsthalf = match.group('firstpart') firsthalf = match.group('firstpart')
secondhalf = match.group('secondpart') secondhalf = match.group('secondpart')
try:
wraptags = match.group('wraptags')
except:
wraptags = ''
hyphenated = str(firsthalf) + "-" + str(secondhalf) hyphenated = str(firsthalf) + "-" + str(secondhalf)
dehyphenated = str(firsthalf) + str(secondhalf) dehyphenated = str(firsthalf) + str(secondhalf)
lookupword = self.removesuffixes.sub('', dehyphenated) lookupword = self.removesuffixes.sub('', dehyphenated)
if self.prefixes.match(firsthalf) is None: if self.prefixes.match(firsthalf) is None:
lookupword = self.removeprefix.sub('', lookupword) lookupword = self.removeprefix.sub('', lookupword)
booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE) booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated) print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
match = booklookup.search(self.html) if self.format == 'html_cleanup':
if match: match = booklookup.search(self.html)
#print "returned dehyphenated word: " + str(dehyphenated) hyphenmatch = re.search(u'%s' % hyphenated, self.html)
return dehyphenated if match:
print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
elif hyphenmatch:
print "Cleanup:returned hyphenated word: " + str(hyphenated)
return hyphenated
else:
print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
return firsthalf+u'\u2014'+wraptags+secondhalf
else: else:
#print "returned hyphenated word: " + str(hyphenated) match = booklookup.search(self.html)
return hyphenated if match:
print "returned dehyphenated word: " + str(dehyphenated)
return dehyphenated
else:
print "returned hyphenated word: " + str(hyphenated)
return hyphenated
def __call__(self, html, format, length=1): def __call__(self, html, format, length=1):
self.html = html self.html = html
self.format = format
if format == 'html': if format == 'html':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length) intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
elif format == 'pdf': elif format == 'pdf':
intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(<p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length) intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
elif format == 'individual_words': elif format == 'individual_words':
intextmatch = re.compile('>[^<]*\b(?P<firstpart>[^"\s>]+)-(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet intextmatch = re.compile('>[^<]*\b(?P<firstpart>[^"\s>]+)-(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
elif format == 'html_cleanup':
intextmatch = re.compile(u'(?P<firstpart>[^“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
html = intextmatch.sub(self.dehyphenate, html) html = intextmatch.sub(self.dehyphenate, html)
return html return html
class CSSPreProcessor(object): class CSSPreProcessor(object):
PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}') PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
@ -388,7 +456,7 @@ class HTMLPreProcessor(object):
end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),) end_rules.append((re.compile(r'<p>\s*(?P<chap>(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*(</[ibu]>){0,2})\s*<p>\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<p>)?'), chap_head),)
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor')) length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'), 'median')
if length: if length:
# print "The pdf line length returned is " + str(length) # print "The pdf line length returned is " + str(length)
end_rules.append( end_rules.append(

View File

@ -153,7 +153,6 @@ class PreProcessor(object):
###### Unwrap lines ###### ###### Unwrap lines ######
# #
self.log("Unwrapping Lines")
# Some OCR sourced files have line breaks in the html using a combination of span & p tags # Some OCR sourced files have line breaks in the html using a combination of span & p tags
# span are used for hard line breaks, p for new paragraphs. Determine which is used so # span are used for hard line breaks, p for new paragraphs. Determine which is used so
# that lines can be un-wrapped across page boundaries # that lines can be un-wrapped across page boundaries
@ -168,25 +167,40 @@ class PreProcessor(object):
format = 'html' format = 'html'
else: else:
format = 'html' format = 'html'
# Check Line histogram to determine if the document uses hard line breaks, If 50% or
# more of the lines break in the same region of the document then unwrapping is required
hardbreaks = line_length(format, html, .50, 'histogram')
print "Hard line breaks check returned "+str(hardbreaks)
# Calculate Length # Calculate Length
length = line_length(format, html, getattr(self.extra_opts, unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
'html_unwrap_factor', 0.4)) length = line_length(format, html, unwrap_factor, 'median')
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***") self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
max_length = length * 1.4 # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
min_max = str("(?<=.{"+str(length)+"})(?<!.{"+str(max_length)+"})") if hardbreaks or unwrap_factor < 0.4:
# self.log("Unwrapping required, unwrapping Lines")
# Unwrap em/en dashes, delete soft-hyphens # Unwrap em/en dashes
#self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n") #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
# Dehyphenate
self.log("Unwrapping/Removing hyphens")
dehyphenator = Dehyphenator()
html = dehyphenator(html,'html', length)
self.log("Done dehyphenating")
# Unwrap lines using punctation and line length
unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
html = unwrap.sub(' ', html)
#check any remaining hyphens, but only unwrap if there is a match
dehyphenator = Dehyphenator()
html = dehyphenator(html,'html_cleanup', length)
else:
# dehyphenate in cleanup mode to fix anything previous conversions/editing missed
self.log("Cleaning up hyphenation")
dehyphenator = Dehyphenator()
html = dehyphenator(html,'html_cleanup', length)
self.log("Done dehyphenating")
# delete soft hyphens
html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html) html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
html = re.sub(u'%s(?<=[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % min_max, '', html)
# Dehyphenate
dehyphenator = Dehyphenator()
html = dehyphenator(html,'html', length)
# Unwrap lines using punctation and line length
unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
html = unwrap.sub(' ', html)
# If still no sections after unwrapping mark split points on lines with no punctuation # If still no sections after unwrapping mark split points on lines with no punctuation
if self.html_preprocess_sections < 10: if self.html_preprocess_sections < 10: