diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 46308b2ea0..f6277956c8 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -62,7 +62,6 @@ def wrap_lines(match): else: return ital+' ' - def line_length(format, raw, percent): ''' raw is the raw text to find the line length to use for wrapping. @@ -76,6 +75,8 @@ def line_length(format, raw, percent): linere = re.compile('(?<=
)', re.DOTALL)
elif format == 'pdf':
linere = re.compile('(?<= '),
@@ -238,8 +240,7 @@ class HTMLPreProcessor(object):
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Add space before and after italics
(re.compile(u'(?'), lambda match: ' '),
- (re.compile(r'(?=\w)'), lambda match: ' '),
-
+ (re.compile(r'(?=\w)'), lambda match: ' '),
]
# Fix Book Designer markup
@@ -327,10 +328,11 @@ class HTMLPreProcessor(object):
# unwrap/delete soft hyphens with formatting
end_rules.append((re.compile(u'[]\s*((i|u|b)>)+(\s* )+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
- # Make the more aggressive chapter marking regex optional with the preprocess option to reduce false positives
+ # Make the more aggressive chapter marking regex optional with the preprocess option to
+ # reduce false positives and move after header/footer removal
if getattr(self.extra_opts, 'preprocess_html', None):
if is_pdftohtml:
- end_rules.append((re.compile(r'(?=<(/?br|p|hr))(<(/?br|p|hr)[^>]*)?>\s*(<(i|b)>(<(i|b)>)?)?\s*(?P \s*(?P \s*(?P )?'), chap_head),)
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index fb683bdb12..abfa43e7ed 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -8,10 +8,10 @@ __docformat__ = 'restructuredtext en'
import re
from calibre.ebooks.conversion.preprocess import line_length
from calibre.utils.logging import default_log
-from lxml import etree
class PreProcessor(object):
html_preprocess_sections = 0
+ found_indents = 0
def __init__(self, args):
self.args = args
@@ -22,11 +22,11 @@ class PreProcessor(object):
title = match.group('title')
if not title:
self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("marked " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
+ self.log("found " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
return ' '
+ else:
+ return ' '+span
+ else:
+ if not span:
+ return ' '
+ else:
+ return ' '+span
+
def no_markup(self, raw, percent):
'''
Detects total marked up line endings in the file. raw is the text to
@@ -48,7 +63,7 @@ class PreProcessor(object):
line_end = line_end_ere.findall(raw)
tot_htm_ends = len(htm_end)
tot_ln_fds = len(line_end)
- self.log("*** There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked endings***")
+ self.log("There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked up endings")
if percent > 1:
percent = 1
@@ -56,13 +71,18 @@ class PreProcessor(object):
percent = 0
min_lns = tot_ln_fds * percent
- self.log("There must be fewer than " + str(min_lns) + " unmarked lines to return true")
+ self.log("There must be fewer than " + str(min_lns) + " unmarked lines to add markup")
if min_lns > tot_htm_ends:
return True
def __call__(self, html):
self.log("********* Preprocessing HTML *********")
- # remove non-breaking spaces
+ # Replace series of non-breaking spaces with text-indent
+ txtindent = re.compile(ur' [^>]*)>\s*(?P(]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
+ html = txtindent.sub(self.insert_indent, html)
+ if self.found_indents > 1:
+ self.log("replaced "+str(self.found_indents)+ " nbsp indents with inline styles")
+ # remove remaining non-breaking spaces
html = re.sub(ur'\u00a0', ' ', html)
# Get rid of empty \s*", "\n ", html)
- # some lit files don't have any tags or equivalent, check and
- # mark up line endings if required before proceeding
+ # some lit files don't have any tags or equivalent (generally just plain text between
+ # ', html)
# detect chapters/sections to match xpath or splitting logic
+ heading = re.compile(' ]*>', re.IGNORECASE)
+ spans_reg = re.compile(']*>', re.IGNORECASE)
+ paras = len(paras_reg.findall(html))
+ spans = len(spans_reg.findall(html))
+ if spans > 1:
+ if float(paras) / float(spans) < 0.75:
+ format = 'spanned_html'
+ else:
+ format = 'html'
+ else:
+ format = 'html'
+
+ # Calculate Length
+ length = line_length(format, html, 0.4)
+ self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***")
+ #
+ # Unwrap and/or delete soft-hyphens, hyphens
+ html = re.sub(u'\s*(\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+ html = re.sub(u'(?<=[-–—])\s*(?=<)(\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
+
+ # Unwrap lines using punctuation if the median length of all lines is less than 200
+ unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P
).*?(?=
)', re.DOTALL)
+ elif format == 'spanned_html':
+ linere = re.compile('(?<=)', re.DOTALL)
lines = linere.findall(raw)
lengths = []
@@ -223,14 +224,15 @@ class HTMLPreProcessor(object):
# Remove page links
(re.compile(r'', re.IGNORECASE), lambda match: ''),
# Remove
tags
- (re.compile(r'
'),
+ (re.compile(r'
'),
# Remove gray background
(re.compile(r']+>'), lambda match : ''),
# Detect Chapters to match default XPATH in GUI
- (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P
\s*(?P
\s*){1,3}\s*(?P
\s*(?P
\s*){1,3}\s*(?P
)?', re.IGNORECASE), chap_head),
+ # Cover the case where every letter in a chapter title is separated by a space
+ (re.compile(r'
\s*(?P
\s*){1,3}\s*(?P
))?'), chap_head),
# Have paragraphs show better
(re.compile(r'
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P
]*>|?p[^>]*>)))?'), chap_head))
+ end_rules.append((re.compile(r''+chap+'
\n'
else:
self.html_preprocess_sections = self.html_preprocess_sections + 1
- self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
+ self.log("found " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
return ''+chap+'
\n'+title+'
\n'
def chapter_break(self, match):
@@ -35,7 +35,22 @@ class PreProcessor(object):
self.html_preprocess_sections = self.html_preprocess_sections + 1
self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap))
return '<'+styles+' style="page-break-before:always">'+chap
-
+
+ def insert_indent(self, match):
+ pstyle = match.group('formatting')
+ span = match.group('span')
+ self.found_indents = self.found_indents + 1
+ if pstyle:
+ if not span:
+ return ' tags), check and mark up line endings if required before proceeding
if self.no_markup(html, 0.1):
self.log("not enough paragraph markers, adding now")
add_markup = re.compile('(?)(\n)')
html = add_markup.sub('