diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 957418f1fd..2954fd7c26 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -62,6 +62,7 @@ def wrap_lines(match):
else:
return ital+' '
+
def line_length(format, raw, percent):
'''
raw is the raw text to find the line length to use for wrapping.
@@ -191,32 +192,36 @@ class HTMLPreProcessor(object):
(re.compile(u'¸\s*( \n ' + match.group(1) + '
- (re.compile(r' '),
+ # (re.compile(r' '),
- # Remove hyphenation
- (re.compile(r'- '),
# Clean up spaces
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
- # Connect paragraphs split by -
- (re.compile(u'(?<=[^\s][-–])[\s]*( )*\s*(?=[^\s])'), lambda match: ''),
# Add space before and after italics
(re.compile(u'(?'), lambda match: ' '),
(re.compile(r'(?=\w)'), lambda match: ' '),
+
]
# Fix Book Designer markup
@@ -293,6 +298,13 @@ class HTMLPreProcessor(object):
import traceback
print 'Failed to parse remove_footer regexp'
traceback.print_exc()
+
+ # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
+ if getattr(self.extra_opts, 'preprocess_html', None):
+ if is_pdftohtml:
+ end_rules.append(
+ (re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P )', re.IGNORECASE)
+ blankreg = re.compile(r'\s* ]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s*
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''),
+
+ # Center separator lines
+ (re.compile(u'
\s*(?P
'), lambda match: '
tags
(re.compile(r'
'),
# Replace
with
\s*
', re.IGNORECASE), lambda match: '\n
\s*(?=[[a-z\d])'), lambda match: ''),
# Remove gray background
(re.compile(r']+>'), lambda match : ''),
# Detect Chapters to match default XPATH in GUI
- (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P
]*>|?p[^>]*>)))?'), chap_head),
+ (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P
]*>|?p[^>]*>)))?'), chap_head),
+ )
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index d57bfddd3e..35a8a1a9bc 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -494,7 +494,7 @@ class HTMLInput(InputFormatPlugin):
if not hasattr(self, 'log'):
from calibre.utils.logging import default_log
self.log = default_log
- self.log("********* Preprocessing HTML *********")
+ self.log("********* Preprocessing HTML - HTML Input plugin *********")
# Detect Chapters to match the xpath in the GUI
chapdetect = re.compile(r'(?=?(br|p|span))(?(br|p|span)[^>]*>)?\s*(?P'+'\g
\n', html)
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 9bf20fb1d4..f7bb0fbfd9 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -11,12 +11,14 @@ import re
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.preprocess import line_length
+
class LITInput(InputFormatPlugin):
name = 'LIT Input'
author = 'Marshall T. Vandegrift'
description = 'Convert LIT files to HTML'
file_types = set(['lit'])
+ html_preprocess_sections = 0
def convert(self, stream, options, file_ext, log,
accelerators):
@@ -55,14 +57,104 @@ class LITInput(InputFormatPlugin):
def preprocess_html(self, html):
+
+ def chapter_head(match):  # Build replacement text for a matched chapter heading; takes a regex match with named groups 'chap' and 'title' (presumably a substitution callback - the call site is not visible here)
+ chap = match.group('chap')  # text captured by the 'chap' group
+ title = match.group('title')  # text captured by the 'title' group; tested for falsiness below
+ if not title:  # no title captured: the returned string is built from chap only
+ self.html_preprocess_sections = self.html_preprocess_sections + 1  # running count kept on the plugin instance (self comes from the enclosing method)
+ self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))  # log the running count and the chapter text
+ return ''+chap+'
\n'
+ else:  # a title was captured: the returned string is built from chap and title
+ self.html_preprocess_sections = self.html_preprocess_sections + 1  # same increment as the no-title branch
+ self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))  # log the count, chapter and title
+ return ''+chap+'
\n'+title+'
\n'
+
+ def chapter_link(match):  # Build replacement text for a link-based section marker; takes a regex match with named group 'sectionlink' (presumably a substitution callback - the call site is not visible here)
+ chap = match.group('sectionlink')  # text captured by the 'sectionlink' group
+ if not chap:  # nothing captured: the returned string contains no link text
+ self.html_preprocess_sections = self.html_preprocess_sections + 1  # same instance counter that chapter_head increments, so both kinds of marker share one total
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")  # log the running count only
+ return '
'
+ else:  # link text captured: it is included in the returned string
+ self.html_preprocess_sections = self.html_preprocess_sections + 1  # same increment as the empty branch
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))  # log the count and the link text
+ return '
\n'+chap+'
'
+
+
+ def no_markup(raw, percent):
+ '''
+ Detects total marked up line endings in the file. raw is the text to
+ inspect. Percent is the minimum percent of line endings which should
+ be marked up to return true.
+ '''
+ htm_end_ere = re.compile(''+'\g
\n', html)
- # Unwrap lines using punctation if the median length of all lines is less than 150
+ # remove non-breaking spaces
+ html = re.sub(ur'\u00a0', ' ', html)
+ # Get rid of empty
tags or equivalent, check and + # mark up line endings if required before proceeding + if no_markup(html, 0.1): + self.log("not enough paragraph markers, adding now") + add_markup = re.compile('(?)(\n)') + html = add_markup.sub('
\n', html) + + # detect chapters/sections to match xpath or splitting logic # - # Insert extra line feeds so the line length regex functions properly - html = re.sub(r"
", "\n", html) + # Mark split points based on embedded links + chaplink = re.compile(r']*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P