sync to ldolse heuristics branch.

This commit is contained in:
John Schember 2011-01-14 18:51:42 -05:00
commit aed47e4b3a
8 changed files with 103 additions and 72 deletions

View File

@ -160,7 +160,7 @@ class InputFormatPlugin(Plugin):
''' '''
raise NotImplementedError() raise NotImplementedError()
def preprocess_html(self, opts, html): def heuristics(self, opts, html):
''' '''
This method is called by the conversion pipeline on all HTML before it This method is called by the conversion pipeline on all HTML before it
is parsed. It is meant to be used to do any required preprocessing on is parsed. It is meant to be used to do any required preprocessing on

View File

@ -491,8 +491,8 @@ OptionRecommendation(name='enable_heuristics',
OptionRecommendation(name='markup_chapter_headings', OptionRecommendation(name='markup_chapter_headings',
recommended_value=False, level=OptionRecommendation.LOW, recommended_value=False, level=OptionRecommendation.LOW,
help=_('Detect chapter headings and sub headings. Change ' help=_('Detect unformatted chapter headings and sub headings. Change '
'them to h1 and h2 tags.')), 'them to h2 and h3 tags.')),
OptionRecommendation(name='italicize_common_cases', OptionRecommendation(name='italicize_common_cases',
recommended_value=False, level=OptionRecommendation.LOW, recommended_value=False, level=OptionRecommendation.LOW,
@ -508,26 +508,30 @@ OptionRecommendation(name='html_unwrap_factor',
recommended_value=0.40, level=OptionRecommendation.LOW, recommended_value=0.40, level=OptionRecommendation.LOW,
help=_('Scale used to determine the length at which a line should ' help=_('Scale used to determine the length at which a line should '
'be unwrapped. Valid values are a decimal between 0 and 1. The ' 'be unwrapped. Valid values are a decimal between 0 and 1. The '
'default is 0.4, just below the median line length.')), 'default is 0.4, just below the median line length. If only a '
'few lines in the document require unwrapping this value should '
'be reduced')),
OptionRecommendation(name='unwrap_lines', OptionRecommendation(name='unwrap_lines',
recommended_value=False, level=OptionRecommendation.LOW, recommended_value=False, level=OptionRecommendation.LOW,
help=_('Unwrap lines.')), help=_('Unwrap lines using punctuation and other formatting clues.')),
OptionRecommendation(name='delete_blank_paragraphs', OptionRecommendation(name='delete_blank_paragraphs',
recommended_value=True, level=OptionRecommendation.LOW, recommended_value=True, level=OptionRecommendation.LOW,
help=_('Remove empyt paragraphs from the document')), help=_('Remove empty paragraphs from the document when they exist between '
'every other paragraph')),
OptionRecommendation(name='format_scene_breaks', OptionRecommendation(name='format_scene_breaks',
recommended_value=False, level=OptionRecommendation.LOW, recommended_value=False, level=OptionRecommendation.LOW,
help=_('Replace soft scene breaks that use multiple blank lines ' help=_('Detects left aligned scene break markers and center aligns them. '
'with horizontal rules.')), 'Replace soft scene breaks that use multiple blank lines with'
'horizontal rules.')),
OptionRecommendation(name='dehyphenate', OptionRecommendation(name='dehyphenate',
recommended_value=True, level=OptionRecommendation.LOW, recommended_value=True, level=OptionRecommendation.LOW,
help=_('Combine words that are separated by a hyphen. ' help=_('Analyses hyphenated words throughout the document. The '
'This is for cases where a word is hyphenated across ' 'document itself is used as a dictionary to determine whether hyphens '
'two lines to denote the characters from a single word.')), 'should be retained or removed.')),
OptionRecommendation(name='sr1_search', OptionRecommendation(name='sr1_search',
recommended_value='', level=OptionRecommendation.LOW, recommended_value='', level=OptionRecommendation.LOW,
@ -1008,8 +1012,8 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
Create an OEBBook. Create an OEBBook.
''' '''
from calibre.ebooks.oeb.base import OEBBook from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, html_preprocessor = HTMLPreProcessor(input_plugin.heuristics,
opts.preprocess_html, opts) opts.enable_heuristics, opts)
if not encoding: if not encoding:
encoding = None encoding = None
oeb = OEBBook(log, html_preprocessor, oeb = OEBBook(log, html_preprocessor,

View File

@ -113,6 +113,11 @@ class PreProcessor(object):
return wordcount.words return wordcount.words
def markup_chapters(self, html, wordcount, blanks_between_paragraphs): def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
'''
Searches for common chapter headings throughout the document.
Attempts multiple patterns, based on likelihood of a match
with minimum false positives. Exits after finding a successful pattern.
'''
# Typical chapters are between 2000 and 7000 words, use the larger number to decide the # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
# minimum of chapters to search for # minimum of chapters to search for
self.min_chapters = 1 self.min_chapters = 1
@ -185,6 +190,10 @@ class PreProcessor(object):
return html return html
def punctuation_unwrap(self, length, content, format): def punctuation_unwrap(self, length, content, format):
'''
Unwraps lines based on line length and punctuation.
Supports a range of potential HTML markup and text files.
'''
# define the pieces of the regex # define the pieces of the regex
lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?<!\&\w{4});))" # (?<!\&\w{4});) is a semicolon not part of an entity
line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?" line_ending = "\s*</(span|p|div)>\s*(</(p|span|div)>)?"
@ -201,53 +210,38 @@ class PreProcessor(object):
return content return content
def __call__(self, html): def text_process_pre(self, html):
self.log("********* Preprocessing HTML *********") pre = re.compile(r'<pre>', re.IGNORECASE)
if len(pre.findall(html)) == 1:
self.log("Running Text Processing")
from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
separate_paragraphs_single_line
outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
html = outerhtml.sub('\g<text>', html)
html = separate_paragraphs_single_line(html)
html = preserve_spaces(html)
html = convert_basic(html, epub_split_size_kb=0)
else:
# Add markup naively
# TODO - find out if there are cases where there are more than one <pre> tag or
# other types of unmarked html and handle them in some better fashion
add_markup = re.compile('(?<!>)(\n)')
html = add_markup.sub('</p>\n<p>', html)
return html
# Count the words in the document to estimate how many chapters to look for and whether def arrange_htm_line_endings(self, html):
# other types of processing are attempted
totalwords = 0
totalwords = self.get_word_count(html)
if totalwords < 50:
self.log("not enough text, not preprocessing")
return html
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html) html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\g<tag>"+">\n", html)
html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html) html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\g<tag>"+"\g<style>"+">", html)
return html
###### Check Markup ###### def fix_nbsp_indents(self, html):
#
# some lit files don't have any <p> tags or equivalent (generally just plain text between
# <pre> tags), check and mark up line endings if required before proceeding
if self.no_markup(html, 0.1):
self.log("not enough paragraph markers, adding now")
# check if content is in pre tags, use txt processor to mark up if so
pre = re.compile(r'<pre>', re.IGNORECASE)
if len(pre.findall(html)) == 1:
self.log("Running Text Processing")
from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
separate_paragraphs_single_line
outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*)(?=</pre>).*', re.IGNORECASE|re.DOTALL)
html = outerhtml.sub('\g<text>', html)
html = separate_paragraphs_single_line(html)
html = preserve_spaces(html)
html = convert_basic(html, epub_split_size_kb=0)
else:
# Add markup naively
# TODO - find out if there are cases where there are more than one <pre> tag or
# other types of unmarked html and handle them in some better fashion
add_markup = re.compile('(?<!>)(\n)')
html = add_markup.sub('</p>\n<p>', html)
###### Mark Indents/Cleanup ######
#
# Replace series of non-breaking spaces with text-indent
txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE) txtindent = re.compile(ur'<p(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
html = txtindent.sub(self.insert_indent, html) html = txtindent.sub(self.insert_indent, html)
if self.found_indents > 1: if self.found_indents > 1:
self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles") self.log("replaced "+unicode(self.found_indents)+ " nbsp indents with inline styles")
return html
def cleanup_markup(self, html):
# remove remaining non-breaking spaces # remove remaining non-breaking spaces
html = re.sub(ur'\u00a0', ' ', html) html = re.sub(ur'\u00a0', ' ', html)
# Get rid of various common microsoft specific tags which can cause issues later # Get rid of various common microsoft specific tags which can cause issues later
@ -259,27 +253,60 @@ class PreProcessor(object):
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html) html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html) html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*</[ibu]>\s*){0,2}\s*</[ibu]>", " ", html)
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html) html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]>\s*</span>){0,2}\s*</span>\s*", " ", html)
return html
def __call__(self, html):
self.log("********* Preprocessing HTML *********")
# Count the words in the document to estimate how many chapters to look for and whether
# other types of processing are attempted
totalwords = 0
totalwords = self.get_word_count(html)
if totalwords < 50:
self.log("flow is too short, not running heuristics")
return html
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
html = self.arrange_htm_line_endings(html)
###### Check Markup ######
#
# some lit files don't have any <p> tags or equivalent (generally just plain text between
# <pre> tags), check and mark up line endings if required before proceeding
if self.no_markup(html, 0.1):
self.log("not enough paragraph markers, adding now")
# check if content is in pre tags, use txt processor to mark up if so
html = self.text_process_pre(html)
###### Mark Indents/Cleanup ######
#
# Replace series of non-breaking spaces with text-indent
html = self.fix_nbsp_indents(html)
html = self.cleanup_markup(html)
# ADE doesn't render <br />, change to empty paragraphs # ADE doesn't render <br />, change to empty paragraphs
#html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html) #html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
# If more than 40% of the lines are empty paragraphs and the user has enabled remove # If more than 40% of the lines are empty paragraphs and the user has enabled delete
# paragraph spacing then delete blank lines to clean up spacing # blank paragraphs then delete blank lines to clean up spacing
linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL) linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
blankreg = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE) blankreg = re.compile(r'\s*(?P<openline><p(?!\sid=\"softbreak\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
#multi_blank = re.compile(r'(\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>){2,}', re.IGNORECASE) multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>){2,}', re.IGNORECASE)
blanklines = blankreg.findall(html) blanklines = blankreg.findall(html)
lines = linereg.findall(html) lines = linereg.findall(html)
blanks_between_paragraphs = False blanks_between_paragraphs = False
print "delete blank paragraphs is "+str(getattr(self.extra_opts, 'delete_blank_paragraphs', False))
if len(lines) > 1: if len(lines) > 1:
self.log("There are " + unicode(len(blanklines)) + " blank lines. " + self.log("There are " + unicode(len(blanklines)) + " blank lines. " +
unicode(float(len(blanklines)) / float(len(lines))) + " percent blank") unicode(float(len(blanklines)) / float(len(lines))) + " percent blank")
if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
'remove_paragraph_spacing', False): if float(len(blanklines)) / float(len(lines)) > 0.40:
self.log("deleting blank lines")
html = blankreg.sub('', html)
elif float(len(blanklines)) / float(len(lines)) > 0.40:
blanks_between_paragraphs = True blanks_between_paragraphs = True
#print "blanks between paragraphs is marked True" print "blanks between paragraphs is marked True"
else: else:
blanks_between_paragraphs = False blanks_between_paragraphs = False
@ -289,7 +316,12 @@ class PreProcessor(object):
html = self.markup_chapters(html, totalwords, blanks_between_paragraphs) html = self.markup_chapters(html, totalwords, blanks_between_paragraphs)
if blanks_between_paragraphs and getattr(self.extra_opts,
'delete_blank_paragraphs', False):
self.log("deleting blank lines")
html = multi_blank.sub('\n<p id="softbreak" style="margin-top:1.5em; margin-bottom:1.5em"> </p>', html)
html = blankreg.sub('', html)
###### Unwrap lines ###### ###### Unwrap lines ######
# #
# Some OCR sourced files have line breaks in the html using a combination of span & p tags # Some OCR sourced files have line breaks in the html using a combination of span & p tags

View File

@ -486,7 +486,7 @@ class HTMLInput(InputFormatPlugin):
return (None, None) return (None, None)
return (None, raw) return (None, raw)
def preprocess_html(self, options, html): def heuristics(self, options, html):
self.options = options self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html) return preprocessor(html)

View File

@ -53,7 +53,7 @@ class LITInput(InputFormatPlugin):
pre.append(ne) pre.append(ne)
def preprocess_html(self, options, html): def heuristics(self, options, html):
self.options = options self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html) return preprocessor(html)

View File

@ -420,7 +420,7 @@ class LRFInput(InputFormatPlugin):
styles.write() styles.write()
return os.path.abspath('content.opf') return os.path.abspath('content.opf')
def preprocess_html(self, options, html): def heuristics(self, options, html):
self.options = options self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None)) preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html) return preprocessor(html)

View File

@ -39,7 +39,7 @@ class MOBIInput(InputFormatPlugin):
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]' accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
return mr.created_opf_path return mr.created_opf_path
def preprocess_html(self, options, html): def heuristics(self, options, html):
# search for places where a first or second level heading is immediately followed by another # search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter # top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc # headings and titles, images, etc

View File

@ -32,8 +32,3 @@ class PDBInput(InputFormatPlugin):
opf = reader.extract_content(os.getcwd()) opf = reader.extract_content(os.getcwd())
return opf return opf
def preprocess_html(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)