diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index a0dfb5ea2b..da652c1a38 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -54,7 +54,7 @@ def chap_head(match): if not title: return '

'+chap+'


\n' else: - return '

'+chap+'
\n'+title+'


\n' + return '

'+chap+'

\n

'+title+'

\n' def wrap_lines(match): ital = match.group('ital') @@ -63,7 +63,7 @@ def wrap_lines(match): else: return ital+' ' -def line_length(raw, percent): +def line_length(format, raw, percent): ''' raw is the raw text to find the line length to use for wrapping. percentage is a decimal number, 0 - 1 which is used to determine @@ -72,7 +72,10 @@ def line_length(raw, percent): median value. ''' raw = raw.replace(' ', ' ') - linere = re.compile('(?<=
).*?(?=
)', re.DOTALL) + if format == 'html': + linere = re.compile('(?<=)', re.DOTALL) + elif format == 'pdf': + linere = re.compile('(?<=
).*?(?=
)', re.DOTALL) lines = linere.findall(raw) lengths = [] @@ -206,7 +209,7 @@ class HTMLPreProcessor(object): (re.compile(ur'\u00a0'), lambda match : ' '), # Detect Chapters to match default XPATH in GUI - (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(||)?(Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(||)?)(]*>|]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(]*>|]*>))((?P(<i>)?\s*\w+(\s+\w+)?(</i>)?)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head), + (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head), (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head), # Have paragraphs show better @@ -289,7 +292,7 @@ class HTMLPreProcessor(object): traceback.print_exc() if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01: - length = line_length(html, getattr(self.extra_opts, 'unwrap_factor')) + length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor')) if length: end_rules.append( # Un wrap using punctuation diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py index 73bc22be66..51c74228b7 100644 --- a/src/calibre/ebooks/html/input.py +++ b/src/calibre/ebooks/html/input.py @@ -24,6 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows from calibre import unicode_path from calibre.utils.localization import get_lang from calibre.utils.filenames import ascii_filename +from calibre.ebooks.conversion.preprocess import line_length class Link(object): ''' @@ -489,5 +490,18 @@ class HTMLInput(InputFormatPlugin): return (None, None) return (None, raw) - - + def preprocess_html(self, html): + self.log("********* Preprocessing HTML *********") + # Detect Chapters to match the xpath in the GUI + chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE) + html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html) + # Unwrap lines using punctation if the median length of all lines is less than 150 + # + # Insert extra line feeds so the line length regex functions properly + html = re.sub(r"</p>", "</p>\n", html) + length = line_length('html', html, 0.4) + self.log.debug("*** Median length is " + str(length) + " ***") + unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) + if length < 150: + html = unwrap.sub(' ', html) + return html diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py index bb69f3b568..9bf20fb1d4 100644 --- a/src/calibre/ebooks/lit/input.py +++ b/src/calibre/ebooks/lit/input.py @@ -9,6 +9,7 @@ __docformat__ = 'restructuredtext en' import re from calibre.customize.conversion import InputFormatPlugin +from calibre.ebooks.conversion.preprocess import line_length class LITInput(InputFormatPlugin): @@ -21,6 +22,7 @@ class LITInput(InputFormatPlugin): accelerators): from calibre.ebooks.lit.reader import LitReader from calibre.ebooks.conversion.plumber import create_oebbook + self.log = log return create_oebbook(log, stream, options, self, reader=LitReader) def postprocess_book(self, oeb, opts, log): @@ -53,8 +55,18 @@ class LITInput(InputFormatPlugin): def preprocess_html(self, html): + self.log("********* Preprocessing HTML *********") + # Detect Chapters to match the xpath in the GUI chapdetect = re.compile(r'(?=</?(br|p|span))(</?(br|p|span)[^>]*>)?\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)(</?(p|br|span)[^>]*>)', re.IGNORECASE) html = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', html) - html = re.sub(r"(?<=.{65}[a-z,\IA])\s*</(span|p|div)>\s*(</(p|span|div)>\s*<p[^>]*>(\s*<(p|span|div)>\s*</(p|span|div)[^>]*>)?\s*(</(p|span|div)>\s*<p[^>]*>)?)?\s*<(span|div|p)[^>]*>", " ", html) - return html + # Unwrap lines using punctation if the median length of all lines is less than 150 + # + # Insert extra line feeds so the line length regex functions properly + html = re.sub(r"</p>", "</p>\n", html) + length = line_length('html', html, 0.4) + self.log("*** Median length is " + str(length) + " ***") + unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</(span|p|div)>\s*(</(p|span|div)>)?\s*(?P<up2threeblanks><(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*</(span|p|div)>\s*)</(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE) + if length < 150: + html = unwrap.sub(' ', html) + return html diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py index 5447e69403..adda8794ca 100644 --- a/src/calibre/ebooks/rtf/input.py +++ b/src/calibre/ebooks/rtf/input.py @@ -7,6 +7,7 @@ import os, glob, re, textwrap from lxml import etree from calibre.customize.conversion import InputFormatPlugin +from calibre.ebooks.conversion.preprocess import line_length class InlineClass(etree.XSLTExtension): @@ -184,6 +185,7 @@ class RTFInput(InputFormatPlugin): from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException + self.options = options self.log = log self.log('Converting RTF to XML...') #Name of the preprocesssed RTF file @@ -226,6 +228,17 @@ class RTFInput(InputFormatPlugin): with open(html, 'wb') as f: res = transform.tostring(result) res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] + if self.options.preprocess_html: + self.log("********* Preprocessing HTML *********") + # Detect Chapters to match the xpath in the GUI + chapdetect = re.compile(r'<p[^>]*>\s*<span[^>]*>\s*(?P<chap>(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?(</(i|b)></(i|b)>|</(i|b)>)?)\s*</span>\s*</p>', re.IGNORECASE) + res = chapdetect.sub('<h2>'+'\g<chap>'+'</h2>\n', res) + # Unwrap lines using punctation if the median length of all lines is less than 150 + length = line_length('html', res, 0.4) + self.log("*** Median length is " + str(length) + " ***") + unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*</span>\s*(</p>)?\s*(?P<up2threeblanks><p[^>]*>\s*(<span[^>]*>\s*</span>\s*)</p>\s*){0,3}\s*<p[^>]*>\s*(<span[^>]*>)?\s*" % length, re.UNICODE) + if length < 150: + res = unwrap.sub(' ', res) f.write(res) self.write_inline_css(inline_class) stream.seek(0)