From 5c951fb9628617133f17ead6d1393ea84b7c6412 Mon Sep 17 00:00:00 2001
From: ldolse
Date: Sat, 4 Sep 2010 15:12:29 +1000
Subject: [PATCH] Preprocessing Updates
---
src/calibre/ebooks/conversion/preprocess.py | 26 +++--
src/calibre/ebooks/html/input.py | 2 +-
src/calibre/ebooks/lit/input.py | 104 ++++++++++++++++++--
src/calibre/ebooks/mobi/input.py | 10 ++
src/calibre/ebooks/pdf/reflow.py | 4 +
5 files changed, 132 insertions(+), 14 deletions(-)
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 957418f1fd..2954fd7c26 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -62,6 +62,7 @@ def wrap_lines(match):
else:
return ital+' '
+
def line_length(format, raw, percent):
'''
raw is the raw text to find the line length to use for wrapping.
@@ -191,32 +192,36 @@ class HTMLPreProcessor(object):
(re.compile(u'¸\s*()*\s*c', re.UNICODE), lambda match: u'ç'),
(re.compile(u'¸\s*()*\s*C', re.UNICODE), lambda match: u'Ç'),
+ # If pdf printed from a browser then the header/footer has a reliable pattern
+ (re.compile(r'((?<=)\s*file:////?[A-Z].*
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''),
+
+ # Center separator lines
+ (re.compile(u'
\s*(?P([*#•]+\s*)+)\s*
'), lambda match: '\n
' + match.group(1) + '
'),
+
# Remove page links
(re.compile(r'', re.IGNORECASE), lambda match: ''),
# Remove
tags
(re.compile(r'', re.IGNORECASE), lambda match: '
'),
# Replace
with
- (re.compile(r'\s*', re.IGNORECASE), lambda match: ''),
+ # (re.compile(r'
\s*
', re.IGNORECASE), lambda match: '\n
'),
- # Remove hyphenation
- (re.compile(r'-\n\r?'), lambda match: ''),
+ # unwrap hyphenation - don't delete the hyphen (often doesn't split words)
+ (re.compile(r'(?<=[-–])\s*
\s*(?=[[a-z\d])'), lambda match: ''),
# Remove gray background
(re.compile(r']+>'), lambda match : ''),
# Detect Chapters to match default XPATH in GUI
- (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((i|b)>((i|b)>)?)?)?(br|p)[^>]*>\s*(?P(<(i|b)>)?\s*\w+(\s*\w+)?\s*((i|b)>)?\s*(?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
- (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P([A-Z \'"!]{5,})\s*(\d+|\w+)?)(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P.*)(
]*>|?p[^>]*>)))?'), chap_head),
+ (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+(\s\w+)?)?\s*((i|b)>((i|b)>)?)?)\s*(?(br|p)[^>]*>\s*){1,3}\s*(?P(<(i|b)>)?(\s*\w+){1,4}\s*((i|b)>)?\s*(?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
# Have paragraphs show better
(re.compile(r''), lambda match : ''),
# Clean up spaces
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
- # Connect paragraphs split by -
- (re.compile(u'(?<=[^\s][-–])[\s]*(
)*[\s]*()*\s*(?=[^\s])'), lambda match: ''),
# Add space before and after italics
(re.compile(u'(?'), lambda match: ' '),
(re.compile(r'(?=\w)'), lambda match: ' '),
+
]
# Fix Book Designer markup
@@ -293,6 +298,13 @@ class HTMLPreProcessor(object):
import traceback
print 'Failed to parse remove_footer regexp'
traceback.print_exc()
+
+ # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
+ if getattr(self.extra_opts, 'preprocess_html', None):
+ if is_pdftohtml:
+ end_rules.append(
+ (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P([A-Z-\'"!]{3,})\s*(\d+|[A-Z]+(\s*[A-Z]+)?)?)\s*((i|b)>((i|b)>)?)?\s*(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P.*)(
]*>|?p[^>]*>)))?'), chap_head),
+ )
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index d57bfddd3e..35a8a1a9bc 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -494,7 +494,7 @@ class HTMLInput(InputFormatPlugin):
if not hasattr(self, 'log'):
from calibre.utils.logging import default_log
self.log = default_log
- self.log("********* Preprocessing HTML *********")
+ self.log("********* Preprocessing HTML - HTML Input plugin *********")
# Detect Chapters to match the xpath in the GUI
chapdetect = re.compile(r'(?=?(br|p|span))(?(br|p|span)[^>]*>)?\s*(?P(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((i|b)>(i|b)>|(i|b)>)?)(?(p|br|span)[^>]*>)', re.IGNORECASE)
html = chapdetect.sub(''+'\g'+'
\n', html)
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 9bf20fb1d4..f7bb0fbfd9 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -11,12 +11,14 @@ import re
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.preprocess import line_length
+
class LITInput(InputFormatPlugin):
name = 'LIT Input'
author = 'Marshall T. Vandegrift'
description = 'Convert LIT files to HTML'
file_types = set(['lit'])
+ html_preprocess_sections = 0
def convert(self, stream, options, file_ext, log,
accelerators):
@@ -55,14 +57,104 @@ class LITInput(InputFormatPlugin):
def preprocess_html(self, html):
+
+ def chapter_head(match):
+ chap = match.group('chap')
+ title = match.group('title')
+ if not title:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
+ return ''+chap+'
\n'
+ else:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
+ return ''+chap+'
\n'+title+'
\n'
+
+ def chapter_link(match):
+ chap = match.group('sectionlink')
+ if not chap:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
+ return '
'
+ else:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
+ return '
\n'+chap+'
'
+
+
+ def no_markup(raw, percent):
+ '''
+ Detects total marked up line endings in the file. raw is the text to
+ inspect. Percent is the minimum percent of line endings which should
+ be marked up to return true.
+ '''
+ htm_end_ere = re.compile('
', re.DOTALL)
+ line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
+ htm_end = htm_end_ere.findall(raw)
+ line_end = line_end_ere.findall(raw)
+ tot_htm_ends = len(htm_end)
+ tot_ln_fds = len(line_end)
+ self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
+
+ if percent > 1:
+ percent = 1
+ if percent < 0:
+ percent = 0
+
+ min_lns = tot_ln_fds * percent
+ self.log("There must be more than " + str(min_lns) + " unmarked lines to be true")
+ if min_lns > tot_htm_ends:
+ return True
+
self.log("********* Preprocessing HTML *********")
- # Detect Chapters to match the xpath in the GUI
- chapdetect = re.compile(r'(?=?(br|p|span))(?(br|p|span)[^>]*>)?\s*(?P(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((i|b)>(i|b)>|(i|b)>)?)(?(p|br|span)[^>]*>)', re.IGNORECASE)
- html = chapdetect.sub(''+'\g'+'
\n', html)
- # Unwrap lines using punctation if the median length of all lines is less than 150
+ # remove non-breaking spaces
+ html = re.sub(ur'\u00a0', ' ', html)
+ # Get rid of empty tags to simplify other processing
+ html = re.sub(ur'\s*\s*', ' ', html)
+ # Get rid of empty span tags
+ html = re.sub(r"\s*]*>\s*", " ", html)
+
+ # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+ linereg = re.compile('(?<=)', re.IGNORECASE)
+ blankreg = re.compile(r'\s*
]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s*
', re.IGNORECASE)
+ blanklines = blankreg.findall(html)
+ lines = linereg.findall(html)
+ if len(lines) > 1:
+ self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
+ if float(len(blanklines)) / float(len(lines)) > 0.40:
+ self.log("deleting blank lines")
+ html = blankreg.sub('', html)
+ # Arrange line feeds and
tags so the line_length and no_markup functions work correctly
+ html = re.sub(r"\s*
", "
\n", html)
+
+ # some lit files don't have any tags or equivalent, check and
+ # mark up line endings if required before proceeding
+ if no_markup(html, 0.1):
+ self.log("not enough paragraph markers, adding now")
+ add_markup = re.compile('(?)(\n)')
+ html = add_markup.sub('
\n', html)
+
+ # detect chapters/sections to match xpath or splitting logic
#
- # Insert extra line feeds so the line length regex functions properly
- html = re.sub(r"
", "\n", html)
+ # Mark split points based on embedded links
+ chaplink = re.compile(r']*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P[^\s<]+(\s*[^\s<]+){0,4})?\s*()?\s*((i|b|u)>){0,2}\s*', re.IGNORECASE)
+ html = chaplink.sub(chapter_link, html)
+ # Continue with alternate patterns, start with most typical chapter headings
+ if self.html_preprocess_sections < 10:
+ chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}.?(\d+\.?|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\s*){0,4}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.IGNORECASE)
+ html = chapdetect.sub(chapter_head, html)
+ if self.html_preprocess_sections < 10:
+ self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying a more aggressive pattern")
+ chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}\s*.?(([A-Z#]+\s*){1,9}|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P(<(i|b|u)>){0,2}(\s*[\w\'\"-]+){1,5}\s*((i|b|u)>){0,2})\s*()?\s*((i|b|u)>){0,2}\s*((br|p)>))?', re.UNICODE)
+ html = chapdetect2.sub(chapter_head, html)
+
+ # search for places where a first or second level heading is immediately followed by another
+ # top level heading. demote the second heading to h3 to prevent splitting between chapter
+ # headings and titles, images, etc
+ doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE)
+ html = doubleheading.sub('\g'+''+'
', html)
+ #
+ # Unwrap lines using punctuation if the median length of all lines is less than 150
length = line_length('html', html, 0.4)
self.log("*** Median length is " + str(length) + " ***")
unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py
index 487e70c04f..b8dc7a9560 100644
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@@ -3,6 +3,7 @@ __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal '
__docformat__ = 'restructuredtext en'
+import re
from calibre.customize.conversion import InputFormatPlugin
class MOBIInput(InputFormatPlugin):
@@ -37,3 +38,12 @@ class MOBIInput(InputFormatPlugin):
include_meta_content_type=False))
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
return mr.created_opf_path
+
+ def preprocess_html(self, html):
+ # search for places where a first or second level heading is immediately followed by another
+ # top level heading. demote the second heading to h3 to prevent splitting between chapter
+ # headings and titles, images, etc
+ doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE)
+ html = doubleheading.sub('\g'+''+'
', html)
+ return html
+
diff --git a/src/calibre/ebooks/pdf/reflow.py b/src/calibre/ebooks/pdf/reflow.py
index 584d631d0b..36848ddb8b 100644
--- a/src/calibre/ebooks/pdf/reflow.py
+++ b/src/calibre/ebooks/pdf/reflow.py
@@ -408,6 +408,10 @@ class Page(object):
# Fraction of text height that two strings' bottoms can differ by
# for them to be considered to be part of the same text fragment
LINE_FACTOR = 0.4
+
+ # Percentage of the page height which should be considered header
+ # or footer to be discarded from reflow considerations
+ HEAD_FOOTER_MARGIN
# Multiplies the average line height when determining row height
# of a particular element to detect columns.