diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index a0dfb5ea2b..da652c1a38 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -54,7 +54,7 @@ def chap_head(match):
if not title:
return '
'+chap+'
\n'
else:
- return ''+chap+'
\n'+title+'
\n'
+ return ''+chap+'
\n'+title+'
\n'
def wrap_lines(match):
ital = match.group('ital')
@@ -63,7 +63,7 @@ def wrap_lines(match):
else:
return ital+' '
-def line_length(raw, percent):
+def line_length(format, raw, percent):
'''
raw is the raw text to find the line length to use for wrapping.
percentage is a decimal number, 0 - 1 which is used to determine
@@ -72,7 +72,10 @@ def line_length(raw, percent):
median value.
'''
raw = raw.replace(' ', ' ')
- linere = re.compile('(?<=
).*?(?=
)', re.DOTALL)
+ if format == 'html':
+ linere = re.compile('(?<=)', re.DOTALL)
+ elif format == 'pdf':
+ linere = re.compile('(?<=
).*?(?=
)', re.DOTALL)
lines = linere.findall(raw)
lengths = []
@@ -206,7 +209,7 @@ class HTMLPreProcessor(object):
(re.compile(ur'\u00a0'), lambda match : ' '),
# Detect Chapters to match default XPATH in GUI
- (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(||)?(Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(||)?)(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P()?\s*\w+(\s+\w+)?()?)(
]*>|?p[^>]*>)))?', re.IGNORECASE), chap_head),
+ (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?((i|b)>((i|b)>)?)?)?(br|p)[^>]*>\s*(?P(<(i|b)>)?\s*\w+(\s*\w+)?\s*((i|b)>)?\s*(?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
(re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P([A-Z \'"!]{5,})\s*(\d+|\w+)?)(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P.*)(
]*>|?p[^>]*>)))?'), chap_head),
# Have paragraphs show better
@@ -289,7 +292,7 @@ class HTMLPreProcessor(object):
traceback.print_exc()
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
- length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
+ length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
if length:
end_rules.append(
# Un wrap using punctuation
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 73bc22be66..51c74228b7 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -24,6 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows
from calibre import unicode_path
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
+from calibre.ebooks.conversion.preprocess import line_length
class Link(object):
'''
@@ -489,5 +490,18 @@ class HTMLInput(InputFormatPlugin):
return (None, None)
return (None, raw)
-
-
+ def preprocess_html(self, html):
+ self.log("********* Preprocessing HTML *********")
+ # Detect Chapters to match the xpath in the GUI
+ chapdetect = re.compile(r'(?=?(br|p|span))(?(br|p|span)[^>]*>)?\s*(?P(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((i|b)>(i|b)>|(i|b)>)?)(?(p|br|span)[^>]*>)', re.IGNORECASE)
+ html = chapdetect.sub(''+'\g'+'
\n', html)
+ # Unwrap lines using punctuation if the median length of all lines is less than 150
+ #
+ # Insert extra line feeds so the line length regex functions properly
+ html = re.sub(r"
", "\n", html)
+ length = line_length('html', html, 0.4)
+ self.log.debug("*** Median length is " + str(length) + " ***")
+ unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+ if length < 150:
+ html = unwrap.sub(' ', html)
+ return html
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index bb69f3b568..9bf20fb1d4 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en'
import re
from calibre.customize.conversion import InputFormatPlugin
+from calibre.ebooks.conversion.preprocess import line_length
class LITInput(InputFormatPlugin):
@@ -21,6 +22,7 @@ class LITInput(InputFormatPlugin):
accelerators):
from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks.conversion.plumber import create_oebbook
+ self.log = log
return create_oebbook(log, stream, options, self, reader=LitReader)
def postprocess_book(self, oeb, opts, log):
@@ -53,8 +55,18 @@ class LITInput(InputFormatPlugin):
def preprocess_html(self, html):
+ self.log("********* Preprocessing HTML *********")
+ # Detect Chapters to match the xpath in the GUI
chapdetect = re.compile(r'(?=?(br|p|span))(?(br|p|span)[^>]*>)?\s*(?P(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((i|b)>(i|b)>|(i|b)>)?)(?(p|br|span)[^>]*>)', re.IGNORECASE)
html = chapdetect.sub(''+'\g'+'
\n', html)
- html = re.sub(r"(?<=.{65}[a-z,\IA])\s*(span|p|div)>\s*((p|span|div)>\s*]*>(\s*<(p|span|div)>\s*(p|span|div)[^>]*>)?\s*((p|span|div)>\s*
]*>)?)?\s*<(span|div|p)[^>]*>", " ", html)
- return html
+ # Unwrap lines using punctuation if the median length of all lines is less than 150
+ #
+ # Insert extra line feeds so the line length regex functions properly
+ html = re.sub(r"
", "\n", html)
+ length = line_length('html', html, 0.4)
+ self.log("*** Median length is " + str(length) + " ***")
+ unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+ if length < 150:
+ html = unwrap.sub(' ', html)
+ return html
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 5447e69403..adda8794ca 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -7,6 +7,7 @@ import os, glob, re, textwrap
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
+from calibre.ebooks.conversion.preprocess import line_length
class InlineClass(etree.XSLTExtension):
@@ -184,6 +185,7 @@ class RTFInput(InputFormatPlugin):
from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
+ self.options = options
self.log = log
self.log('Converting RTF to XML...')
#Name of the preprocesssed RTF file
@@ -226,6 +228,17 @@ class RTFInput(InputFormatPlugin):
with open(html, 'wb') as f:
res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
+ if self.options.preprocess_html:
+ self.log("********* Preprocessing HTML *********")
+ # Detect Chapters to match the xpath in the GUI
+ chapdetect = re.compile(r']*>\s*]*>\s*(?P(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((i|b)>(i|b)>|(i|b)>)?)\s*\s*
', re.IGNORECASE)
+ res = chapdetect.sub(''+'\g'+'
\n', res)
+ # Unwrap lines using punctuation if the median length of all lines is less than 150
+ length = line_length('html', res, 0.4)
+ self.log("*** Median length is " + str(length) + " ***")
+ unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*\s*()?\s*(?P]*>\s*(]*>\s*\s*)
\s*){0,3}\s*]*>\s*(]*>)?\s*" % length, re.UNICODE)
+ if length < 150:
+ res = unwrap.sub(' ', res)
f.write(res)
self.write_inline_css(inline_class)
stream.seek(0)