Various improvements to preprocessing code. Also replace a couple of unicode punctuation characters that the Sony readers don't support when the output profile is Sony.

Kovid Goyal 2010-09-28 15:43:22 -06:00
commit f54ee65bcc
6 changed files with 265 additions and 123 deletions

View File

@@ -9,6 +9,8 @@ __docformat__ = 'restructuredtext en'
 import datetime
+from calibre.web.feeds.news import BasicNewsRecipe
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     now = datetime.datetime.now()
     title = 'The AJC'

View File

@@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     title = 'Nealz Nuze'
     language = 'en'

View File

@@ -1,5 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
+import re
 class AdvancedUserRecipe1282101454(BasicNewsRecipe):
     title = 'Popular Science'

View File

@ -1,3 +1,4 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement from __future__ import with_statement
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
@@ -251,6 +252,9 @@ class OutputProfile(Plugin):
     #: The character used to represent a star in ratings
     ratings_char = u'*'
+    #: Unsupported unicode characters to be replaced during preprocessing
+    unsupported_unicode_chars = []

     @classmethod
     def tags_to_string(cls, tags):
         return escape(', '.join(tags))
@@ -422,6 +426,8 @@ class SonyReaderOutput(OutputProfile):
     dpi = 168.451
     fbase = 12
     fsizes = [7.5, 9, 10, 12, 15.5, 20, 22, 24]
+    unsupported_unicode_chars = [u'\u201f', u'\u201b']

 class KoboReaderOutput(OutputProfile):
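
Any device profile can opt in to the same replacement by listing the code points its renderer cannot display. A minimal sketch (the profile name here is hypothetical; it simply mirrors the SonyReaderOutput change above):

    class MyReaderOutput(OutputProfile):
        name = 'My Reader'  # hypothetical profile for illustration
        # U+201F and U+201B are reversed quotation marks that this
        # hypothetical device cannot render; they will be degraded to ASCII.
        unsupported_unicode_chars = [u'\u201f', u'\u201b']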

View File

@@ -62,49 +62,104 @@ def wrap_lines(match):
     else:
         return ital+' '

-def line_length(format, raw, percent):
+class DocAnalysis(object):
     '''
-    raw is the raw text to find the line length to use for wrapping.
-    percentage is a decimal number, 0 - 1 which is used to determine
-    how far in the list of line lengths to use. The list of line lengths is
-    ordered smallest to largest and does not include duplicates. 0.5 is the
-    median value.
+    Provides various text analysis functions to determine how the document is structured.
+    format is the type of document that analysis will be done against.
+    raw is the raw text to determine the line length to use for wrapping.
+    Blank lines are excluded from analysis
     '''
-    raw = raw.replace('&nbsp;', ' ')
-    if format == 'html':
-        linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
-    elif format == 'pdf':
-        linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
-    elif format == 'spanned_html':
-        linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
-    lines = linere.findall(raw)

-    lengths = []
-    for line in lines:
-        if len(line) > 0:
-            lengths.append(len(line))
+    def __init__(self, format='html', raw=''):
+        raw = raw.replace('&nbsp;', ' ')
+        if format == 'html':
+            linere = re.compile('(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
+        elif format == 'pdf':
+            linere = re.compile('(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
+        elif format == 'spanned_html':
+            linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
+        self.lines = linere.findall(raw)

-    if not lengths:
-        return 0
+    def line_length(self, percent):
+        '''
+        Analyses the document to find the median line length.
+        percentage is a decimal number, 0 - 1 which is used to determine
+        how far in the list of line lengths to use. The list of line lengths is
+        ordered smallest to largest and does not include duplicates. 0.5 is the
+        median value.
+        '''
+        lengths = []
+        for line in self.lines:
+            if len(line) > 0:
+                lengths.append(len(line))

-    lengths = list(set(lengths))
-    total = sum(lengths)
-    avg = total / len(lengths)
-    max_line = avg * 2
+        if not lengths:
+            return 0

-    lengths = sorted(lengths)
-    for i in range(len(lengths) - 1, -1, -1):
-        if lengths[i] > max_line:
-            del lengths[i]
+        lengths = list(set(lengths))
+        total = sum(lengths)
+        avg = total / len(lengths)
+        max_line = avg * 2

-    if percent > 1:
-        percent = 1
-    if percent < 0:
-        percent = 0
+        lengths = sorted(lengths)
+        for i in range(len(lengths) - 1, -1, -1):
+            if lengths[i] > max_line:
+                del lengths[i]

-    index = int(len(lengths) * percent) - 1
+        if percent > 1:
+            percent = 1
+        if percent < 0:
+            percent = 0

-    return lengths[index]
+        index = int(len(lengths) * percent) - 1
+        return lengths[index]
+
+    def line_histogram(self, percent):
+        '''
+        Creates a broad histogram of the document to determine whether it incorporates hard
+        line breaks. Lines are sorted into 20 'buckets' based on length.
+        percent is the percentage of lines that should be in a single bucket to return true
+        The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
+        '''
+        minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
+        maxLineLength=1900 # Discard larger than this to stay in range
+        buckets=20 # Each line is divided into a bucket based on length
+
+        #print "there are "+str(len(lines))+" lines"
+        #max = 0
+        #for line in self.lines:
+        #    l = len(line)
+        #    if l > max:
+        #        max = l
+        #print "max line found is "+str(max)
+        # Build the line length histogram
+        hRaw = [ 0 for i in range(0,buckets) ]
+        for line in self.lines:
+            l = len(line)
+            if l > minLineLength and l < maxLineLength:
+                l = int(l/100)
+                #print "adding "+str(l)
+                hRaw[l]+=1
+
+        # Normalize the histogram into percents
+        totalLines = len(self.lines)
+        h = [ float(count)/totalLines for count in hRaw ]
+        #print "\nhRaw histogram lengths are: "+str(hRaw)
+        #print "              percents are: "+str(h)+"\n"
+
+        # Find the biggest bucket
+        maxValue = 0
+        for i in range(0,len(h)):
+            if h[i] > maxValue:
+                maxValue = h[i]
+
+        if maxValue < percent:
+            #print "Line lengths are too variable. Not unwrapping."
+            return False
+        else:
+            #print str(maxValue)+" of the lines were in one bucket"
+            return True
class Dehyphenator(object):
    '''
@@ -117,42 +172,62 @@ class Dehyphenator(object):
     def __init__(self):
         # Add common suffixes to the regex below to increase the likelihood of a match -
         # don't add suffixes which are also complete words, such as 'able' or 'sex'
-        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+        self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
         # remove prefixes if the prefix was not already the point of hyphenation
-        self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
-        self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
+        self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
+        self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)

     def dehyphenate(self, match):
         firsthalf = match.group('firstpart')
         secondhalf = match.group('secondpart')
+        try:
+            wraptags = match.group('wraptags')
+        except:
+            wraptags = ''
         hyphenated = str(firsthalf) + "-" + str(secondhalf)
         dehyphenated = str(firsthalf) + str(secondhalf)
         lookupword = self.removesuffixes.sub('', dehyphenated)
         if self.prefixes.match(firsthalf) is None:
             lookupword = self.removeprefix.sub('', lookupword)
-        booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
         #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
-        match = booklookup.search(self.html)
-        if match:
-            #print "returned dehyphenated word: " + str(dehyphenated)
-            return dehyphenated
-        else:
-            #print "returned hyphenated word: " + str(hyphenated)
-            return hyphenated
+        try:
+            searchresult = self.html.find(str.lower(lookupword))
+        except:
+            return hyphenated
+        if self.format == 'html_cleanup':
+            if self.html.find(lookupword) != -1 or searchresult != -1:
+                #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+                return dehyphenated
+            elif self.html.find(hyphenated) != -1:
+                #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+                return hyphenated
+            else:
+                #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+                return firsthalf+u'\u2014'+wraptags+secondhalf
+        else:
+            if self.html.find(lookupword) != -1 or searchresult != -1:
+                #print "returned dehyphenated word: " + str(dehyphenated)
+                return dehyphenated
+            else:
+                #print " returned hyphenated word: " + str(hyphenated)
+                return hyphenated

     def __call__(self, html, format, length=1):
         self.html = html
+        self.format = format
         if format == 'html':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)' % length)
         elif format == 'pdf':
-            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^“"\s>]+)-\s*(<p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
+            intextmatch = re.compile(u'(?<=.{%i})(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-\s*(?P<wraptags><p>|</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)'% length)
         elif format == 'individual_words':
-            intextmatch = re.compile('>[^<]*\b(?P<firstpart>[^"\s>]+)-(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
+            intextmatch = re.compile(u'>[^<]*\b(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)"\s>]+)-(?P<secondpart)\w+)\b[^<]*<') # for later, not called anywhere yet
+        elif format == 'html_cleanup':
+            intextmatch = re.compile(u'(?P<firstpart>[^\[\]\\\^\$\.\|\?\*\+\(\)“"\s>]+)-\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')

         html = intextmatch.sub(self.dehyphenate, html)
         return html
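
For reference, the calling convention, taken from how the PreProcessor drives this class later in the commit: 'html' mode only joins a hyphen that sits at least `length` characters into a line, while the new 'html_cleanup' mode has no length guard and falls back to an em dash plus the captured wrap tags when neither the joined nor the hyphenated form occurs elsewhere in the text.

    dehyphenator = Dehyphenator()
    # primary pass over wrapped lines
    html = dehyphenator(html, 'html', length)
    # follow-up pass to catch leftovers from earlier conversions/edits
    html = dehyphenator(html, 'html_cleanup', length)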
class CSSPreProcessor(object):

    PAGE_PAT = re.compile(r'@page[^{]*?{[^}]*?}')
@@ -286,7 +361,7 @@ class HTMLPreProcessor(object):
                  (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),

                  # Detect Chapters to match default XPATH in GUI
-                 (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
+                 (re.compile(r'<br>\s*(?P<chap>(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Kapitel|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*(</[ibu]>){0,2})\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*<br>)?', re.IGNORECASE), chap_head),
                  # Cover the case where every letter in a chapter title is separated by a space
                  (re.compile(r'<br>\s*(?P<chap>([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(<br>\s*){1,3}\s*(?P<title>(<[ibu]>){0,2}(\s*\w+){1,4}\s*(</[ibu]>){0,2}\s*(<br>))?'), chap_head),
@@ -374,10 +449,8 @@ class HTMLPreProcessor(object):
                 print 'Failed to parse remove_footer regexp'
                 traceback.print_exc()

-        # unwrap em/en dashes, delete soft hyphens - moved here so it's executed after header/footer removal
+        # delete soft hyphens - moved here so it's executed after header/footer removal
         if is_pdftohtml:
-            # unwrap em/en dashes
-            end_rules.append((re.compile(u'(?<=[–—])\s*<p>\s*(?=[[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens
             end_rules.append((re.compile(u'[­](\s*<p>)+\s*(?=[[a-z\d])'), lambda match: ''))
             # unwrap/delete soft hyphens with formatting
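
To illustrate what the soft-hyphen rule matches, a standalone check (sample string invented; \xad is the same soft hyphen written as an escape):

    import re
    rule = re.compile(u'[\xad](\s*<p>)+\s*(?=[[a-z\d])')
    print rule.sub('', u'incompre\xad<p>hensible')
    # prints: incomprehensible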
@@ -391,12 +464,15 @@ class HTMLPreProcessor(object):
             length = -1
             if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
-                length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
+                docanalysis = DocAnalysis('pdf', html)
+                length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
                 if length:
-                    # print "The pdf line length returned is " + str(length)
+                    #print "The pdf line length returned is " + str(length)
+                    # unwrap em/en dashes
+                    end_rules.append((re.compile(u'(?<=.{%i}[–—])\s*<p>\s*(?=[[a-z\d])' % length), lambda match: ''))
                     end_rules.append(
                         # Unwrap using punctuation
-                        (re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
+                        (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
                     )

         for rule in self.PREPROCESS + start_rules:
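
To see what the re-introduced dash rule does, a standalone check (sample string invented; the %i lookbehind is hardcoded to 10 here, where the real rule substitutes the computed median line length):

    import re
    rule = re.compile(u'(?<=.{10}[–—])\s*<p>\s*(?=[[a-z\d])')
    print rule.sub('', u'It was almost dark—<p>the lamps were being lit.')
    # prints: It was almost dark—the lamps were being lit.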
@@ -454,6 +530,14 @@ class HTMLPreProcessor(object):
         if getattr(self.extra_opts, 'smarten_punctuation', False):
             html = self.smarten_punctuation(html)

+        unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
+        if unsupported_unicode_chars:
+            from calibre.ebooks.unidecode.unidecoder import Unidecoder
+            unidecoder = Unidecoder()
+            for char in unsupported_unicode_chars:
+                asciichar = unidecoder.decode(char)
+                html = html.replace(char, asciichar)
+
         return html
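
Roughly, for the Sony profile this loop degrades the two reversed quotation marks to ASCII look-alikes. A sketch of the effect (the exact replacement characters come from Unidecoder's transliteration tables, so the output here is not asserted):

    from calibre.ebooks.unidecode.unidecoder import Unidecoder
    unidecoder = Unidecoder()
    for char in [u'\u201f', u'\u201b']:  # the SonyReaderOutput list above
        print repr(char), '->', repr(unidecoder.decode(char))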
    def smarten_punctuation(self, html):

View File

@@ -6,7 +6,7 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

 import re
-from calibre.ebooks.conversion.preprocess import line_length, Dehyphenator
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
 from calibre.utils.logging import default_log

 class PreProcessor(object):
@@ -77,13 +77,18 @@ class PreProcessor(object):
     def __call__(self, html):
         self.log("********* Preprocessing HTML *********")
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = re.sub(r"\s*</p>", "</p>\n", html)
+        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
         ###### Check Markup ######
         #
         # some lit files don't have any <p> tags or equivalent (generally just plain text between
         # <pre> tags), check and mark up line endings if required before proceeding
         if self.no_markup(html, 0.1):
             self.log("not enough paragraph markers, adding now")
-            # check if content is in pre tags, use txt procesor to mark up if so
+            # check if content is in pre tags, use txt processor to mark up if so
             pre = re.compile(r'<pre>', re.IGNORECASE)
             if len(pre.findall(html)) == 1:
                 self.log("Running Text Processing")
@@ -113,47 +118,77 @@ class PreProcessor(object):
         # Get rid of empty <o:p> tags to simplify other processing
         html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
         # Get rid of empty span, bold, & italics tags
-        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
+        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
         html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
         html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)

-        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+        # If more than 40% of the lines are empty paragraphs and the user has enabled remove
+        # paragraph spacing then delete blank lines to clean up spacing
         linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
         blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
         #multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE)
         blanklines = blankreg.findall(html)
         lines = linereg.findall(html)
+        blanks_between_paragraphs = False
         if len(lines) > 1:
             self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
             if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
                     'remove_paragraph_spacing', False):
                 self.log("deleting blank lines")
                 html = blankreg.sub('', html)
-            # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
-            html = re.sub(r"\s*</p>", "</p>\n", html)
-            html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+            elif float(len(blanklines)) / float(len(lines)) > 0.40:
+                blanks_between_paragraphs = True
+                #print "blanks between paragraphs is marked True"
+            else:
+                blanks_between_paragraphs = False
+        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
         # detect chapters/sections to match xpath or splitting logic
+        #
+        # Build the Regular Expressions in pieces
+        lookahead = "(?=<(p|div))"
+        chapter_line_open = "<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*"
+        chapter_header_open = r"(?P<chap>"
+        chapter_header_close = ")\s*"
+        chapter_line_close = "(</(?P=inner3)>)?\s*(</(?P=inner2)>)?\s*(</(?P=inner1)\s[^>]*>)?\s*</(?P=outer)>\s*"
+        if blanks_between_paragraphs:
+            blank_lines = "(\s*<p[^>]*>\s*</p>){0,2}\s*"
+        else:
+            blank_lines = ""
+        opt_title_open = "("
+        title_line_open = "<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*"
+        title_header_open = "(?P<title>"
+        title_header_close = ")\s*"
+        title_line_close = "(</(?P=inner6)>)?\s*(</(?P=inner5)>)?\s*(</(?P=inner4)\s[^>]*>)?\s*</(?P=outer2)>"
+        opt_title_close = ")?"
+        default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
+        typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
+        numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
+        uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
+
+        chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+        #print chapter_marker
         heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
         self.html_preprocess_sections = len(heading.findall(html))
         self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
         #
         # Start with most typical chapter headings, get more aggressive until one works
         if self.html_preprocess_sections < 10:
-            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</[ibu]>){0,2})\s*(</span>)?s*(</[ibu]>){0,2}\s*(</span>)?\s*(</(p|/?br)>)\s*\s*(\s*<p[^>]*>\s*</p>){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.IGNORECASE|re.VERBOSE)
+            chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect.sub(self.chapter_head, html)
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
             html = chapdetect2.sub(self.chapter_head, html)
         if self.html_preprocess_sections < 10:
             self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</[ibu]>){0,2})\s*(</span>)?\s*(</[ibu]>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+            chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
             html = chapdetect2.sub(self.chapter_head, html)

         ###### Unwrap lines ######
         #
-        self.log("Unwrapping Lines")
         # Some OCR sourced files have line breaks in the html using a combination of span & p tags
         # span are used for hard line breaks, p for new paragraphs. Determine which is used so
         # that lines can be un-wrapped across page boundaries
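
Schematically, the assembled chapter_marker is meant to capture markup like the following (invented sample; plain tags with no nested font/span wrappers):

    # <p>Chapter 12</p>          -> group 'chap', via typical_chapters
    # <p[^>]*></p>               -> up to two blank paragraphs, consumed only when
    #                               blanks_between_paragraphs was detected above
    # <p>The Long Road Home</p>  -> optional group 'title', via default_title
    #
    # The numeric and uppercase passes swap only the middle alternation
    # (numeric_chapters / uppercase_chapters); every other piece is reused.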
@@ -168,25 +203,40 @@ class PreProcessor(object):
             format = 'html'
         else:
             format = 'html'
+        # Check Line histogram to determine if the document uses hard line breaks. If 50% or
+        # more of the lines break in the same region of the document then unwrapping is required
+        docanalysis = DocAnalysis(format, html)
+        hardbreaks = docanalysis.line_histogram(.50)
+        self.log("Hard line breaks check returned "+str(hardbreaks))
         # Calculate Length
-        length = line_length(format, html, getattr(self.extra_opts,
-            'html_unwrap_factor', 0.4))
+        unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+        length = docanalysis.line_length(unwrap_factor)
         self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
-        max_length = length * 1.4
-        min_max = str("(?<=.{"+str(length)+"})(?<!.{"+str(max_length)+"})")
-        #
-        # Unwrap em/en dashes, delete soft-hyphens
-        #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
-        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
-        html = re.sub(u'%s(?<=[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % min_max, '', html)
-        # Dehyphenate
-        dehyphenator = Dehyphenator()
-        html = dehyphenator(html,'html', length)
-        # Unwrap lines using punctuation and line length
-        unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-        html = unwrap.sub(' ', html)
+        # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
+        if hardbreaks or unwrap_factor < 0.4:
+            self.log("Unwrapping required, unwrapping Lines")
+            # Unwrap em/en dashes
+            html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
+            # Dehyphenate
+            self.log("Unwrapping/Removing hyphens")
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html', length)
+            self.log("Done dehyphenating")
+            # Unwrap lines using punctuation and line length
+            unwrap = re.compile(u"(?<=.{%i}([a-z,:)\IA\u00DF]|(?<!\&\w{4});))\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+            html = unwrap.sub(' ', html)
+            # check any remaining hyphens, but only unwrap if there is a match
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html_cleanup', length)
+        else:
+            # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
+            self.log("Cleaning up hyphenation")
+            dehyphenator = Dehyphenator()
+            html = dehyphenator(html,'html_cleanup', length)
+            self.log("Done dehyphenating")
+        # delete soft hyphens
+        html = re.sub(u'\xad\s*(</span>\s*(</[iubp]>\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)

         # If still no sections after unwrapping mark split points on lines with no punctuation
         if self.html_preprocess_sections < 10: