From cf7cc4de4d9b9fa5e4b22c5ce2cb63c099165589 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sat, 11 Sep 2010 21:02:44 +1000
Subject: [PATCH] preprocess updates for lit, html, and pdf
---
src/calibre/ebooks/conversion/preprocess.py | 8 --
src/calibre/ebooks/conversion/utils.py | 122 +++++++++++++++++++-
src/calibre/ebooks/html/input.py | 20 +---
src/calibre/ebooks/lit/input.py | 117 +------------------
src/calibre/ebooks/pdb/pdf/reader.py | 2 +-
src/calibre/ebooks/pdf/input.py | 2 +-
6 files changed, 129 insertions(+), 142 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index c120f0a560..6123577191 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -214,7 +214,6 @@ class HTMLPreProcessor(object):
                   (re.compile(u'˙\s*(<br.*?>)*\s*z', re.UNICODE), lambda match: u'ż'),
                   (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
-
                   # If pdf printed from a browser then the header/footer has a reliable pattern
                   (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
@@ -225,13 +224,6 @@ class HTMLPreProcessor(object):
                   (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
                   # Remove <hr> tags
                   (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br>'),
-                  # Replace <br><br> with <p>
-                  # (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '\n<p>'),
-
-                  # unwrap hyphenation - don't delete the hyphen (often doesn't split words)
-                  #(re.compile(u'(?<=[-–—])\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
-                  # unwrap/delete soft hyphens
-                  #(re.compile(u'[­]\s*<br>\s*(?=[[a-z\d])'), lambda match: ''),
                   # Remove gray background
                   (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 52be473372..68cebb3a11 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -3,4 +3,124 @@
 __license__ = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
\ No newline at end of file
+__docformat__ = 'restructuredtext en'
+
+import re
+from calibre.ebooks.conversion.preprocess import line_length
+from calibre.utils.logging import default_log
+from lxml import etree
+
+class PreProcessor(object):
+    html_preprocess_sections = 0
+
+    def __init__(self, args):
+        self.args = args
+        self.log = default_log
+
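+    # chapter_head rewrites a detected heading into real <h2>/<h3> markup, e.g. a match
+    # whose 'chap' group is 'Chapter One' becomes '<h2>Chapter One</h2>\n'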
+    def chapter_head(self, match):
+        chap = match.group('chap')
+        title = match.group('title')
+        if not title:
+            self.html_preprocess_sections = self.html_preprocess_sections + 1
+            self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
+            return '<h2>'+chap+'</h2>\n'
+        else:
+            self.html_preprocess_sections = self.html_preprocess_sections + 1
+            self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
+            return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
+
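+    # chapter_link marks section breaks found at embedded link anchors, counting them
+    # the same way chapter_head counts chapters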
+    def chapter_link(self, match):
+        chap = match.group('sectionlink')
+        if not chap:
+            self.html_preprocess_sections = self.html_preprocess_sections + 1
+            self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
+            return '<br style="page-break-before:always">'
+        else:
+            self.html_preprocess_sections = self.html_preprocess_sections + 1
+            self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
+            return '<br style="page-break-before:always">\n<h2>'+chap+'</h2>'
+
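+    # e.g. with 1000 line feeds and percent=0.1, no_markup returns True when fewer than
+    # 100 '</p>' endings are found, i.e. the text is treated as effectively unmarked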
+    def no_markup(self, raw, percent):
+        '''
+        Detects total marked up line endings in the file. raw is the text to
+        inspect. Percent is the minimum percent of line endings which should
+        be marked up to return true.
+        '''
+        htm_end_ere = re.compile('</p>', re.DOTALL)
+        line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
+        htm_end = htm_end_ere.findall(raw)
+        line_end = line_end_ere.findall(raw)
+        tot_htm_ends = len(htm_end)
+        tot_ln_fds = len(line_end)
+        self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
+
+        if percent > 1:
+            percent = 1
+        if percent < 0:
+            percent = 0
+
+        min_lns = tot_ln_fds * percent
+        self.log("There must be fewer than " + str(min_lns) + " unmarked lines to return true")
+        if min_lns > tot_htm_ends:
+            return True
+
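+    # the input plugins run the whole pipeline in one step, e.g.:
+    #     preprocessor = PreProcessor(html)
+    #     html = preprocessor(html)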
+    def __call__(self, html):
+        self.log("********* Preprocessing HTML *********")
+        # remove non-breaking spaces
+        html = re.sub(ur'\u00a0', ' ', html)
+        # Get rid of empty <o:p> tags to simplify other processing
+        html = re.sub(ur'\s*<o:p>\s*</o:p>', ' ', html)
+        # Get rid of empty span tags
+        html = re.sub(r"\s*<span[^>]*>\s*</span>", " ", html)
+
+        # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+        linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE)
+        blankreg = re.compile(r'\s*<p[^>]*>\s*(<(b|i|u)>)?\s*(</(b|i|u)>)?\s*</p>', re.IGNORECASE)
+        blanklines = blankreg.findall(html)
+        lines = linereg.findall(html)
+        if len(lines) > 1:
+            self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
+            if float(len(blanklines)) / float(len(lines)) > 0.40:
+                self.log("deleting blank lines")
+                html = blankreg.sub('', html)
+        # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+        html = re.sub(r"\s*</p>", "</p>\n", html)
+        html = re.sub(r"\s*<p>\s*", "\n<p>", html)
+
+        # some lit files don't have any <p> tags or equivalent, check and
+        # mark up line endings if required before proceeding
+        if self.no_markup(html, 0.1):
+            self.log("not enough paragraph markers, adding now")
+            add_markup = re.compile('(?<!>)(\n)')
+            html = add_markup.sub('</p>\n<p>', html)
+
+        # detect chapters/sections to match xpath or splitting logic
+        #
+        # Start with most typical chapter headings
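+        # e.g. a paragraph like '<p><b>Chapter 12</b></p>', optionally followed by a short
+        # title paragraph, is rewritten by chapter_head into <h2>/<h3> headings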
+        chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
+        html = chapdetect.sub(self.chapter_head, html)
+        if self.html_preprocess_sections < 10:
+            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
+            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#-]+\s*){1,9}|\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
+            html = chapdetect2.sub(self.chapter_head, html)
+        #
+        # Unwrap lines using punctation if the median length of all lines is less than 200
+        length = line_length('html', html, 0.4)
+        self.log("*** Median line length is " + str(length) + " ***")
+        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+        if length < 200:
+            self.log("Unwrapping Lines")
+            html = unwrap.sub(' ', html)
+        # If still no sections after unwrapping lines break on lines with no punctuation
+        if self.html_preprocess_sections < 10:
+            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
+            #self.log(html)
+            chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE)
+            html = chapdetect3.sub(self.chapter_head, html)
+        # search for places where a first or second level heading is immediately followed by another
+        # top level heading. demote the second heading to h3 to prevent splitting between chapter
+        # headings and titles, images, etc
+        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
+        html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
+
+        return html
\ No newline at end of file
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 35a8a1a9bc..e83216ae1f 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -24,7 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows
 from calibre import unicode_path
 from calibre.utils.localization import get_lang
 from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
 
 class Link(object):
     '''
@@ -491,20 +491,6 @@ class HTMLInput(InputFormatPlugin):
         return (None, raw)
 
     def preprocess_html(self, html):
-        if not hasattr(self, 'log'):
-            from calibre.utils.logging import default_log
-            self.log = default_log
-        self.log("********* Preprocessing HTML - HTML Input plugin *********")
-        # Detect Chapters to match the xpath in the GUI
-        chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE)
-        html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html)
-        # Unwrap lines using punctation if the median length of all lines is less than 150
-        #
-        # Insert extra line feeds so the line length regex functions properly
-        html = re.sub(r"</p>", "</p>\n", html)
-        length = line_length('html', html, 0.4)
-        self.log.debug("*** Median length is " + str(length) + " ***")
-        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-        if length < 150:
-            html = unwrap.sub(' ', html)
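+        # chapter detection and line unwrapping are now delegated to the shared
+        # PreProcessor in calibre.ebooks.conversion.utils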
+        preprocessor = PreProcessor(html)
+        html = preprocessor(html)
         return html
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 35dad501be..58e7bc84bf 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -6,10 +6,8 @@ __license__ = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 
-import re
-
 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
 
 class LITInput(InputFormatPlugin):
@@ -18,7 +16,6 @@ class LITInput(InputFormatPlugin):
     author = 'Marshall T. Vandegrift'
     description = 'Convert LIT files to HTML'
     file_types = set(['lit'])
-    html_preprocess_sections = 0
 
     def convert(self, stream, options, file_ext, log,
             accelerators):
@@ -57,115 +54,7 @@ class LITInput(InputFormatPlugin):
     def preprocess_html(self, html):
-
-        def chapter_head(match):
-            chap = match.group('chap')
-            title = match.group('title')
-            if not title:
-                self.html_preprocess_sections = self.html_preprocess_sections + 1
-                self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
-                return '<h2>'+chap+'</h2>\n'
-            else:
-                self.html_preprocess_sections = self.html_preprocess_sections + 1
-                self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
-                return '<h2>'+chap+'</h2>\n<h3>'+title+'</h3>\n'
-
-        def chapter_link(match):
-            chap = match.group('sectionlink')
-            if not chap:
-                self.html_preprocess_sections = self.html_preprocess_sections + 1
-                self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
-                return '<br style="page-break-before:always">'
-            else:
-                self.html_preprocess_sections = self.html_preprocess_sections + 1
-                self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
-                return '<br style="page-break-before:always">\n<h2>'+chap+'</h2>'
-
-
-        def no_markup(raw, percent):
-            '''
-            Detects total marked up line endings in the file. raw is the text to
-            inspect. Percent is the minimum percent of line endings which should
-            be marked up to return true.
-            '''
-            htm_end_ere = re.compile('</p>', re.DOTALL)
-            line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
-            htm_end = htm_end_ere.findall(raw)
-            line_end = line_end_ere.findall(raw)
-            tot_htm_ends = len(htm_end)
-            tot_ln_fds = len(line_end)
-            self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
-
-            if percent > 1:
-                percent = 1
-            if percent < 0:
-                percent = 0
-
-            min_lns = tot_ln_fds * percent
-            self.log("There must be more than " + str(min_lns) + " unmarked lines to return true")
-            if min_lns > tot_htm_ends:
-                return True
-
- self.log("********* Preprocessing HTML *********")
- # remove non-breaking spaces
- html = re.sub(ur'\u00a0', ' ', html)
- # Get rid of empty tags to simplify other processing
- html = re.sub(ur'\s*\s*', ' ', html)
- # Get rid of empty span tags
- html = re.sub(r"\s*]*>\s*", " ", html)
-
- # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
- linereg = re.compile('(?<=)', re.IGNORECASE)
- blankreg = re.compile(r'\s*
]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s*
', re.IGNORECASE)
- blanklines = blankreg.findall(html)
- lines = linereg.findall(html)
- if len(lines) > 1:
- self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
- if float(len(blanklines)) / float(len(lines)) > 0.40:
- self.log("deleting blank lines")
- html = blankreg.sub('', html)
- # Arrange line feeds and tags so the line_length and no_markup functions work correctly
- html = re.sub(r"\s*", "\n", html)
-
- # some lit files don't have any tags or equivalent, check and
- # mark up line endings if required before proceeding
- if no_markup(html, 0.1):
- self.log("not enough paragraph markers, adding now")
- add_markup = re.compile('(?)(\n)')
- html = add_markup.sub('
\n', html)
-
- # detect chapters/sections to match xpath or splitting logic
- #
- # Mark split points based on embedded links
- chaplink = re.compile(r']*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P[^\s<]+(\s*[^\s<]+){0,4})?\s*()?\s*((i|b|u)>){0,2}\s*', re.IGNORECASE)
- html = chaplink.sub(chapter_link, html)
-        # Continue with alternate patterns, start with most typical chapter headings
-        if self.html_preprocess_sections < 10:
-            chapdetect = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}s*(<span[^>]*>)?\s*.?(\d+\.?|Introduction|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*(</(i|b|u)>){0,2})\s*(</span>)?s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.IGNORECASE)
-            html = chapdetect.sub(chapter_head, html)
-        if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
-            chapdetect2 = re.compile(r'(?=</?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<title>(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</(br|p)>))?', re.UNICODE)
-            html = chapdetect2.sub(chapter_head, html)
-        #
-        # Unwrap lines using punctation if the median length of all lines is less than 150
-        length = line_length('html', html, 0.4)
-        self.log("*** Median line length is " + str(length) + " ***")
-        unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
-        if length < 150:
-            self.log("Unwrapping Lines")
-            html = unwrap.sub(' ', html)
-        # If still no sections after unwrapping lines break on lines with no punctuation
-        if self.html_preprocess_sections < 10:
-            self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", splitting based on punctuation")
-            #self.log(html)
-            chapdetect3 = re.compile(r'(<p[^>]*>)\s*(<(i|b|u)>){0,2}\s*(<span[^>]*>)?\s*(?P<chap>(<(i|b|u)>){0,2}\s*.?([a-z]+\s*){1,5}\s*(</(i|b|u)>){0,2})\s*(</span>)?\s*(</(i|b|u)>){0,2}\s*(</p>)(?P<title>)?', re.IGNORECASE)
-            html = chapdetect3.sub(chapter_head, html)
-        # search for places where a first or second level heading is immediately followed by another
-        # top level heading. demote the second heading to h3 to prevent splitting between chapter
-        # headings and titles, images, etc
-        doubleheading = re.compile(r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
-        html = doubleheading.sub('\g<firsthead>'+'<h3'+'\g<secondhead>'+'</h3>', html)
-
+        preprocessor = PreProcessor(html)
+        html = preprocessor(html)
         return html
diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py
index 3ae9f8ccca..c151551866 100644
--- a/src/calibre/ebooks/pdb/pdf/reader.py
+++ b/src/calibre/ebooks/pdb/pdf/reader.py
@@ -21,7 +21,7 @@ class Reader(FormatReader):
         self.options = options
         setattr(self.options, 'new_pdf_engine', False)
         setattr(self.options, 'no_images', False)
-        setattr(self.options, 'unwrap_factor', 0.5)
+        setattr(self.options, 'unwrap_factor', 0.45)
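+        # 0.45 matches the recommended default for the PDF Input plugin's unwrap_factor option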
     def extract_content(self, output_dir):
         self.log.info('Extracting PDF...')
diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py
index 113c3d99d8..14b3552b04 100644
--- a/src/calibre/ebooks/pdf/input.py
+++ b/src/calibre/ebooks/pdf/input.py
@@ -25,7 +25,7 @@ class PDFInput(InputFormatPlugin):
         OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
             help=_('Scale used to determine the length at which a line should '
             'be unwrapped. Valid values are a decimal between 0 and 1. The '
-            'default is 0.45, this is the median line length.')),
+            'default is 0.45, just below the median line length.')),
         OptionRecommendation(name='new_pdf_engine', recommended_value=False,
             help=_('Use the new PDF conversion engine.'))
         ])