From cf7cc4de4d9b9fa5e4b22c5ce2cb63c099165589 Mon Sep 17 00:00:00 2001 From: ldolse Date: Sat, 11 Sep 2010 21:02:44 +1000 Subject: [PATCH] preprocess updates for lit, html, and pdf --- src/calibre/ebooks/conversion/preprocess.py | 8 -- src/calibre/ebooks/conversion/utils.py | 122 +++++++++++++++++++- src/calibre/ebooks/html/input.py | 20 +--- src/calibre/ebooks/lit/input.py | 117 +------------------ src/calibre/ebooks/pdb/pdf/reader.py | 2 +- src/calibre/ebooks/pdf/input.py | 2 +- 6 files changed, 129 insertions(+), 142 deletions(-) diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index c120f0a560..6123577191 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -214,7 +214,6 @@ class HTMLPreProcessor(object): (re.compile(u'˙\s*()*\s*z', re.UNICODE), lambda match: u'ż'), (re.compile(u'˙\s*()*\s*Z', re.UNICODE), lambda match: u'Ż'), - # If pdf printed from a browser then the header/footer has a reliable pattern (re.compile(r'((?<=)\s*file:////?[A-Z].*
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''), @@ -225,13 +224,6 @@ class HTMLPreProcessor(object): (re.compile(r'', re.IGNORECASE), lambda match: ''), # Remove
tags (re.compile(r'', re.IGNORECASE), lambda match: '
'), - # Replace

with

- # (re.compile(r'
\s*
', re.IGNORECASE), lambda match: '\n

'), - - # unwrap hyphenation - don't delete the hyphen (often doesn't split words) - #(re.compile(u'(?<=[-–—])\s*
\s*(?=[[a-z\d])'), lambda match: ''), - # unwrap/delete soft hyphens - #(re.compile(u'[­]\s*
\s*(?=[[a-z\d])'), lambda match: ''), # Remove gray background (re.compile(r']+>'), lambda match : ''), diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 52be473372..68cebb3a11 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -3,4 +3,124 @@ __license__ = 'GPL v3' __copyright__ = '2010, Kovid Goyal ' -__docformat__ = 'restructuredtext en' \ No newline at end of file +__docformat__ = 'restructuredtext en' + +import re +from calibre.ebooks.conversion.preprocess import line_length +from calibre.utils.logging import default_log +from lxml import etree + +class PreProcessor(object): + html_preprocess_sections = 0 + + def __init__(self, args): + self.args = args + self.log = default_log + + def chapter_head(self, match): + chap = match.group('chap') + title = match.group('title') + if not title: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap)) + return '

'+chap+'

\n' + else: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title)) + return '

'+chap+'

\n

'+title+'

\n' + + def chapter_link(self, match): + chap = match.group('sectionlink') + if not chap: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links") + return '
' + else: + self.html_preprocess_sections = self.html_preprocess_sections + 1 + self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap)) + return '
\n

'+chap+'

' + + def no_markup(self, raw, percent): + ''' + Detects total marked up line endings in the file. raw is the text to + inspect. Percent is the minimum percent of line endings which should + be marked up to return true. + ''' + htm_end_ere = re.compile('

', re.DOTALL) + line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL) + htm_end = htm_end_ere.findall(raw) + line_end = line_end_ere.findall(raw) + tot_htm_ends = len(htm_end) + tot_ln_fds = len(line_end) + self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***") + + if percent > 1: + percent = 1 + if percent < 0: + percent = 0 + + min_lns = tot_ln_fds * percent + self.log("There must be fewer than " + str(min_lns) + " unmarked lines to return true") + if min_lns > tot_htm_ends: + return True + + def __call__(self, html): + self.log("********* Preprocessing HTML *********") + # remove non-breaking spaces + html = re.sub(ur'\u00a0', ' ', html) + # Get rid of empty tags to simplify other processing + html = re.sub(ur'\s*\s*', ' ', html) + # Get rid of empty span tags + html = re.sub(r"\s*]*>\s*", " ", html) + + # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing + linereg = re.compile('(?<=)', re.IGNORECASE) + blankreg = re.compile(r'\s*]*>\s*(<(b|i|u)>)?\s*()?\s*

', re.IGNORECASE) + blanklines = blankreg.findall(html) + lines = linereg.findall(html) + if len(lines) > 1: + self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank") + if float(len(blanklines)) / float(len(lines)) > 0.40: + self.log("deleting blank lines") + html = blankreg.sub('', html) + # Arrange line feeds and

tags so the line_length and no_markup functions work correctly + html = re.sub(r"\s*

", "

\n", html) + html = re.sub(r"\s*

\s*", "\n

", html) + + # some lit files don't have any

tags or equivalent, check and + # mark up line endings if required before proceeding + if self.no_markup(html, 0.1): + self.log("not enough paragraph markers, adding now") + add_markup = re.compile('(?)(\n)') + html = add_markup.sub('

\n

', html) + + # detect chapters/sections to match xpath or splitting logic + # + # Start with most typical chapter headings + chapdetect = re.compile(r'(?=]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}s*(]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(){0,2})\s*()?s*()?\s*(){0,2}\s*()\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) + html = chapdetect.sub(self.chapter_head, html) + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern") + chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9}|\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) + html = chapdetect2.sub(self.chapter_head, html) + # + # Unwrap lines using punctation if the median length of all lines is less than 200 + length = line_length('html', html, 0.4) + self.log("*** Median line length is " + str(length) + " ***") + unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) + if length < 200: + self.log("Unwrapping Lines") + html = unwrap.sub(' ', html) + # If still no sections after unwrapping lines break on lines with no punctuation + if self.html_preprocess_sections < 10: + self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation") + #self.log(html) + chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE) + html = chapdetect3.sub(self.chapter_head, html) + # search for places where a first or second level heading is immediately followed by another + # top level heading. demote the second heading to h3 to prevent splitting between chapter + # headings and titles, images, etc + doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) + html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) + + return html \ No newline at end of file diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 35a8a1a9bc..e83216ae1f 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -24,7 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows from calibre import unicode_path from calibre.utils.localization import get_lang from calibre.utils.filenames import ascii_filename -from calibre.ebooks.conversion.preprocess import line_length +from calibre.ebooks.conversion.utils import PreProcessor class Link(object): ''' @@ -491,20 +491,6 @@ class HTMLInput(InputFormatPlugin): return (None, raw) def preprocess_html(self, html): - if not hasattr(self, 'log'): - from calibre.utils.logging import default_log - self.log = default_log - self.log("********* Preprocessing HTML - HTML Input plugin *********") - # Detect Chapters to match the xpath in the GUI - chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE) - html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html) - # Unwrap lines using punctation if the median length of all lines is less than 150 - # - # Insert extra line feeds so the line length regex functions properly - html = re.sub(r"</p>", "</p>\n", html) - length = line_length('html', html, 0.4) - self.log.debug("*** Median length is " + str(length) + " ***") - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) - if length < 150: - html = unwrap.sub(' ', html) + preprocessor = PreProcessor(html) + html = preprocessor(html) return html diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index 35dad501be..58e7bc84bf 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -6,10 +6,8 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' -import re - from calibre.customize.conversion import InputFormatPlugin -from calibre.ebooks.conversion.preprocess import line_length +from calibre.ebooks.conversion.utils import PreProcessor class LITInput(InputFormatPlugin): @@ -18,7 +16,6 @@ class LITInput(InputFormatPlugin): author = 'Marshall T. Vandegrift' description = 'Convert LIT files to HTML' file_types = set(['lit']) - html_preprocess_sections = 0 def convert(self, stream, options, file_ext, log, accelerators): @@ -57,115 +54,7 @@ class LITInput(InputFormatPlugin): def preprocess_html(self, html): - - def chapter_head(match): - chap = match.group('chap') - title = match.group('title') - if not title: - self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap)) - return '<h2>'+chap+'</h2>\n' - else: - self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title)) - return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n' - - def chapter_link(match): - chap = match.group('sectionlink') - if not chap: - self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links") - return '<br style="page-break-before:always">' - else: - self.html_preprocess_sections = self.html_preprocess_sections + 1 - self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap)) - return '<br clear="all" style="page-break-before:always">\n<h2>'+chap+'</h2>' - - - def no_markup(raw, percent): - ''' - Detects total marked up line endings in the file. raw is the text to - inspect. Percent is the minimum percent of line endings which should - be marked up to return true. - ''' - htm_end_ere = re.compile('</p>', re.DOTALL) - line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL) - htm_end = htm_end_ere.findall(raw) - line_end = line_end_ere.findall(raw) - tot_htm_ends = len(htm_end) - tot_ln_fds = len(line_end) - self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***") - - if percent > 1: - percent = 1 - if percent < 0: - percent = 0 - - min_lns = tot_ln_fds * percent - self.log("There must be more than " + str(min_lns) + " unmarked lines to return true") - if min_lns > tot_htm_ends: - return True - - self.log("********* Preprocessing HTML *********") - # remove non-breaking spaces - html = re.sub(ur'\u00a0', ' ', html) - # Get rid of empty <o:p> tags to simplify other processing - html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html) - # Get rid of empty span tags - html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html) - - # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing - linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE) - blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE) - blanklines = blankreg.findall(html) - lines = linereg.findall(html) - if len(lines) > 1: - self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank") - if float(len(blanklines)) / float(len(lines)) > 0.40: - self.log("deleting blank lines") - html = blankreg.sub('', html) - # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly - html = re.sub(r"\s*</p>", "</p>\n", html) - - # some lit files don't have any <p> tags or equivalent, check and - # mark up line endings if required before proceeding - if no_markup(html, 0.1): - self.log("not enough paragraph markers, adding now") - add_markup = re.compile('(?<!>)(\n)') - html = add_markup.sub('</p>\n<p>', html) - - # detect chapters/sections to match xpath or splitting logic - # - # Mark split points based on embedded links - chaplink = re.compile(r'<a\sname[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<sectionlink>[^\s<]+(\s*[^\s<]+){0,4})?\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*</a>', re.IGNORECASE) - html = chaplink.sub(chapter_link, html) - # Continue with alternate patterns, start with most typical chapter headings - if self.html_preprocess_sections < 10: - chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(\d+\.?|Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE) - html = chapdetect.sub(chapter_head, html) - if self.html_preprocess_sections < 10: - self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern") - chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE) - html = chapdetect2.sub(chapter_head, html) - # - # Unwrap lines using punctation if the median length of all lines is less than 150 - length = line_length('html', html, 0.4) - self.log("*** Median line length is " + str(length) + " ***") - unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) - if length < 150: - self.log("Unwrapping Lines") - html = unwrap.sub(' ', html) - # If still no sections after unwrapping lines break on lines with no punctuation - if self.html_preprocess_sections < 10: - self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation") - #self.log(html) - chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE) - html = chapdetect3.sub(chapter_head, html) - # search for places where a first or second level heading is immediately followed by another - # top level heading. demote the second heading to h3 to prevent splitting between chapter - # headings and titles, images, etc - doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE) - html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html) - + preprocessor = PreProcessor(html) + html = preprocessor(html) return html diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py index 3ae9f8ccca..c151551866 100644 --- a/src/calibre/ebooks/pdb/pdf/reader.py +++ b/src/calibre/ebooks/pdb/pdf/reader.py @@ -21,7 +21,7 @@ class Reader(FormatReader): self.options = options setattr(self.options, 'new_pdf_engine', False) setattr(self.options, 'no_images', False) - setattr(self.options, 'unwrap_factor', 0.5) + setattr(self.options, 'unwrap_factor', 0.45) def extract_content(self, output_dir): self.log.info('Extracting PDF...') diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index 113c3d99d8..14b3552b04 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/pdf/input.py @@ -25,7 +25,7 @@ class PDFInput(InputFormatPlugin): OptionRecommendation(name='unwrap_factor', recommended_value=0.45, help=_('Scale used to determine the length at which a line should ' 'be unwrapped. Valid values are a decimal between 0 and 1. The ' - 'default is 0.45, this is the median line length.')), + 'default is 0.45, just below the median line length.')), OptionRecommendation(name='new_pdf_engine', recommended_value=False, help=_('Use the new PDF conversion engine.')) ])