preprocess updates for lit, html, and pdf

ldolse 2010-09-11 21:02:44 +10:00
parent 480eccb0b0
commit cf7cc4de4d
6 changed files with 129 additions and 142 deletions


@@ -214,7 +214,6 @@ class HTMLPreProcessor(object):
 (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
 (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
 # If pdf printed from a browser then the header/footer has a reliable pattern
 (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
@@ -225,13 +224,6 @@ class HTMLPreProcessor(object):
 (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
 # Remove <hr> tags
 (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
-# Replace <br><br> with <p>
-# (re.compile(r'<br>\s*<br>', re.IGNORECASE), lambda match: '\n<p>'),
-# unwrap hyphenation - don't delete the hyphen (often doesn't split words)
-#(re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
-# unwrap/delete soft hyphens
-#(re.compile(u'[­]\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
 # Remove gray background
 (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
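For context, here is a minimal standalone sketch of how rules like the ones in this hunk are applied: the preprocessor keeps a list of (compiled regex, substitution handler) pairs and runs them over the HTML in order. The two patterns are copied from the diff above; the harness around them (the rule list name and apply_rules) is illustrative only, not the real HTMLPreProcessor API.

import re

# Illustrative harness only; the real HTMLPreProcessor holds a much longer
# list of (regex, handler) pairs. These two entries are copied from the hunk.
PDF_CLEANUP_RULES = [
    # Strip browser print header/footer lines such as file:///C:/...<br>
    (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
    # Downgrade <hr> tags to a simple line break
    (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
]

def apply_rules(html, rules=PDF_CLEANUP_RULES):
    for pattern, handler in rules:
        html = pattern.sub(handler, html)
    return html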


@@ -3,4 +3,124 @@
 __license__ = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
+
+import re
+from calibre.ebooks.conversion.preprocess import line_length
+from calibre.utils.logging import default_log
+from lxml import etree
+
+
+class PreProcessor(object):
+    html_preprocess_sections = 0
+
+    def __init__(self, args):
+        self.args = args
+        self.log = default_log
+
+    def chapter_head(self, match):
+        chap = match.group('chap')
+        title = match.group('title')
+        if not title:
+            self.html_preprocess_sections = self.html_preprocess_sections + 1
+            self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
+            return '<h2>'+chap+'</h2>\n'
+        else:
+            self.html_preprocess_sections = self.html_preprocess_sections + 1
+            self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
+            return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
+
+    def chapter_link(self, match):
+        chap = match.group('sectionlink')
+        if not chap:
+            self.html_preprocess_sections = self.html_preprocess_sections + 1
+            self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
+            return '<br style="page-break-before:always">'
+        else:
+            self.html_preprocess_sections = self.html_preprocess_sections + 1
+            self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
+            return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>'
+
+    def no_markup(self, raw, percent):
+        '''
+        Detects total marked up line endings in the file. raw is the text to
+        inspect. Percent is the minimum percent of line endings which should
+        be marked up to return true.
+        '''
+        htm_end_ere = re.compile('</p>', re.DOTALL)
+        line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
+        htm_end = htm_end_ere.findall(raw)
+        line_end = line_end_ere.findall(raw)
+        tot_htm_ends = len(htm_end)
+        tot_ln_fds = len(line_end)
+        self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
+        if percent > 1:
+            percent = 1
+        if percent < 0:
+            percent = 0
+        min_lns = tot_ln_fds * percent
+        self.log("There must be fewer than " + str(min_lns) + " unmarked lines to return true")
+        if min_lns > tot_htm_ends:
+            return True
+
+    def __call__(self, html):
+        self.log("********* Preprocessing HTML *********")
+        # remove non-breaking spaces
+        html = re.sub(ur'\u00a0', ' ', html)
+        # Get rid of empty <o:p> tags to simplify other processing
+        html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
+        # Get rid of empty span tags
+        html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
+        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE)
+        blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
+        blanklines = blankreg.findall(html)
+        lines = linereg.findall(html)
+        if len(lines) > 1:
+            self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
+            if float(len(blanklines)) / float(len(lines)) > 0.40:
+                self.log("deleting blank lines")
+                html = blankreg.sub('', html)
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = re.sub(r"\s*</p>", "</p>\n", html)
+        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+        # some lit files don't have any <p> tags or equivalent, check and
+        # mark up line endings if required before proceeding
+        if self.no_markup(html, 0.1):
+            self.log("not enough paragraph markers, adding now")
+            add_markup = re.compile('(?<!>)(\n)')
+            html = add_markup.sub('</p>\n<p>', html)
+        # detect chapters/sections to match xpath or splitting logic
+        #
+        # Start with most typical chapter headings
+        chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
+        html = chapdetect.sub(self.chapter_head, html)
+        if self.html_preprocess_sections < 10:
+            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
+            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9}|\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            html = chapdetect2.sub(self.chapter_head, html)
+        #
+        # Unwrap lines using punctation if the median length of all lines is less than 200
+        length = line_length('html', html, 0.4)
+        self.log("*** Median line length is " + str(length) + " ***")
+        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+        if length < 200:
+            self.log("Unwrapping Lines")
+            html = unwrap.sub(' ', html)
+        # If still no sections after unwrapping lines break on lines with no punctuation
+        if self.html_preprocess_sections < 10:
+            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
+            #self.log(html)
+            chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE)
+            html = chapdetect3.sub(self.chapter_head, html)
+        # search for places where a first or second level heading is immediately followed by another
+        # top level heading. demote the second heading to h3 to prevent splitting between chapter
+        # headings and titles, images, etc
+        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
+        html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
+        return html
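Both input-plugin diffs below reduce their preprocess_html hooks to the same two-line call into this new class. A minimal sketch of the wiring, assuming only what those hunks show (the host class here is hypothetical; the constructor argument is stored as self.args and is not otherwise used by __call__):

from calibre.ebooks.conversion.utils import PreProcessor

class ExampleInput(object):
    # Hypothetical host class for illustration; LITInput and HTMLInput below
    # gain exactly this preprocess_html body.
    def preprocess_html(self, html):
        preprocessor = PreProcessor(html)
        html = preprocessor(html)
        return html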


@@ -24,7 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows
 from calibre import unicode_path
 from calibre.utils.localization import get_lang
 from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
 
 class Link(object):
     '''
@@ -491,20 +491,6 @@ class HTMLInput(InputFormatPlugin):
         return (None, raw)
 
     def preprocess_html(self, html):
-        if not hasattr(self, 'log'):
-            from calibre.utils.logging import default_log
-            self.log = default_log
-        self.log("********* Preprocessing HTML - HTML Input plugin *********")
-        # Detect Chapters to match the xpath in the GUI
-        chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
-        html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
-        # Unwrap lines using punctation if the median length of all lines is less than 150
-        #
-        # Insert extra line feeds so the line length regex functions properly
-        html = re.sub(r"</p>", "</p>\n", html)
-        length = line_length('html', html, 0.4)
-        self.log.debug("*** Median length is " + str(length) + " ***")
-        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-        if length < 150:
-            html = unwrap.sub(' ', html)
+        preprocessor = PreProcessor(html)
+        html = preprocessor(html)
         return html


@@ -6,10 +6,8 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import re
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
 
 class LITInput(InputFormatPlugin):
@@ -18,7 +16,6 @@ class LITInput(InputFormatPlugin):
     author = 'Marshall T. Vandegrift'
     description = 'Convert LIT files to HTML'
     file_types = set(['lit'])
-    html_preprocess_sections = 0
 
     def convert(self, stream, options, file_ext, log,
                 accelerators):
@@ -57,115 +54,7 @@ class LITInput(InputFormatPlugin):
     def preprocess_html(self, html):
-
-        def chapter_head(match):
-            chap = match.group('chap')
-            title = match.group('title')
-            if not title:
-                self.html_preprocess_sections = self.html_preprocess_sections + 1
-                self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
-                return '<h2>'+chap+'</h2>\n'
-            else:
-                self.html_preprocess_sections = self.html_preprocess_sections + 1
-                self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
-                return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
-
-        def chapter_link(match):
-            chap = match.group('sectionlink')
-            if not chap:
-                self.html_preprocess_sections = self.html_preprocess_sections + 1
-                self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
-                return '<br style="page-break-before:always">'
-            else:
-                self.html_preprocess_sections = self.html_preprocess_sections + 1
-                self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
-                return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>'
-
-        def no_markup(raw, percent):
-            '''
-            Detects total marked up line endings in the file. raw is the text to
-            inspect. Percent is the minimum percent of line endings which should
-            be marked up to return true.
-            '''
-            htm_end_ere = re.compile('</p>', re.DOTALL)
-            line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
-            htm_end = htm_end_ere.findall(raw)
-            line_end = line_end_ere.findall(raw)
-            tot_htm_ends = len(htm_end)
-            tot_ln_fds = len(line_end)
-            self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
-            if percent > 1:
-                percent = 1
-            if percent < 0:
-                percent = 0
-            min_lns = tot_ln_fds * percent
-            self.log("There must be more than " + str(min_lns) + " unmarked lines to return true")
-            if min_lns > tot_htm_ends:
-                return True
-
-        self.log("********* Preprocessing HTML *********")
-        # remove non-breaking spaces
-        html = re.sub(ur'\u00a0', ' ', html)
-        # Get rid of empty <o:p> tags to simplify other processing
-        html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
-        # Get rid of empty span tags
-        html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
-        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
-        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE)
-        blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
-        blanklines = blankreg.findall(html)
-        lines = linereg.findall(html)
-        if len(lines) > 1:
-            self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
-            if float(len(blanklines)) / float(len(lines)) > 0.40:
-                self.log("deleting blank lines")
-                html = blankreg.sub('', html)
-        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
-        html = re.sub(r"\s*</p>", "</p>\n", html)
-        # some lit files don't have any <p> tags or equivalent, check and
-        # mark up line endings if required before proceeding
-        if no_markup(html, 0.1):
-            self.log("not enough paragraph markers, adding now")
-            add_markup = re.compile('(?<!>)(\n)')
-            html = add_markup.sub('</p>\n<p>', html)
-        # detect chapters/sections to match xpath or splitting logic
-        #
-        # Mark split points based on embedded links
-        chaplink = re.compile(r'<a\sname[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<sectionlink>[^\s<]+(\s*[^\s<]+){0,4})?\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*</a>', re.IGNORECASE)
-        html = chaplink.sub(chapter_link, html)
-        # Continue with alternate patterns, start with most typical chapter headings
-        if self.html_preprocess_sections < 10:
-            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(\d+\.?|Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
-            html = chapdetect.sub(chapter_head, html)
-        if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
-            html = chapdetect2.sub(chapter_head, html)
-        #
-        # Unwrap lines using punctation if the median length of all lines is less than 150
-        length = line_length('html', html, 0.4)
-        self.log("*** Median line length is " + str(length) + " ***")
-        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-        if length < 150:
-            self.log("Unwrapping Lines")
-            html = unwrap.sub(' ', html)
-        # If still no sections after unwrapping lines break on lines with no punctuation
-        if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
-            #self.log(html)
-            chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE)
-            html = chapdetect3.sub(chapter_head, html)
-        # search for places where a first or second level heading is immediately followed by another
-        # top level heading. demote the second heading to h3 to prevent splitting between chapter
-        # headings and titles, images, etc
-        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
-        html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
+        preprocessor = PreProcessor(html)
+        html = preprocessor(html)
         return html


@@ -21,7 +21,7 @@ class Reader(FormatReader):
         self.options = options
         setattr(self.options, 'new_pdf_engine', False)
         setattr(self.options, 'no_images', False)
-        setattr(self.options, 'unwrap_factor', 0.5)
+        setattr(self.options, 'unwrap_factor', 0.45)
 
     def extract_content(self, output_dir):
         self.log.info('Extracting PDF...')


@@ -25,7 +25,7 @@ class PDFInput(InputFormatPlugin):
         OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
            help=_('Scale used to determine the length at which a line should '
            'be unwrapped. Valid values are a decimal between 0 and 1. The '
-           'default is 0.45, this is the median line length.')),
+           'default is 0.45, just below the median line length.')),
         OptionRecommendation(name='new_pdf_engine', recommended_value=False,
            help=_('Use the new PDF conversion engine.'))
     ])
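The reworded help text reflects how the factor is consumed: it is passed as the percent argument to line_length(), so the 0.45 default yields a threshold just below the median line length used by the unwrapping regexes above. A hedged sketch of that relationship, reusing the 'html' form of the call shown earlier because this diff does not include the PDF call site (the helper name is hypothetical):

from calibre.ebooks.conversion.preprocess import line_length

def unwrap_threshold(html, unwrap_factor=0.45):
    # line_length() takes a percent argument; 0.45 picks a length just below
    # the median, and the preprocessor only unwraps when the measured length
    # falls under its cutoff (200 in the new utils.py path, 150 in the old
    # LIT and HTML input paths).
    return line_length('html', html, unwrap_factor)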