diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index c120f0a560..6123577191 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -214,7 +214,6 @@ class HTMLPreProcessor(object):
(re.compile(u'˙\s*(
- # (re.compile(r' '),
-
- # unwrap hyphenation - don't delete the hyphen (often doesn't split words)
- #(re.compile(u'(?<=[-–—])\s*
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''),
@@ -225,13 +224,6 @@ class HTMLPreProcessor(object):
(re.compile(r'', re.IGNORECASE), lambda match: ''),
# Remove
tags
(re.compile(r'
'),
- # Replace
with
\s*
', re.IGNORECASE), lambda match: '\n
\s*(?=[[a-z\d])'), lambda match: ''),
- # unwrap/delete soft hyphens
- #(re.compile(u'[]\s*
\s*(?=[[a-z\d])'), lambda match: ''),
# Remove gray background
(re.compile(r''+chap+'
\n'
+ else:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
+ return ''+chap+'
\n'+title+'
\n'
+
+ def chapter_link(self, match):
+ chap = match.group('sectionlink')
+ if not chap:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links")
+ return '
'
+ else:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on links. - " + str(chap))
+ return '
\n'+chap+'
'
+
+ def no_markup(self, raw, percent):
+ '''
+ Checks how many line endings in raw (the text to inspect) are marked up.
+ percent is given as a fraction (the caller passes 0.1), not as 0-100; a
+ true result appears to mean too few are marked up (TODO confirm).
+ '''
+ htm_end_ere = re.compile('
)', re.IGNORECASE) + blankreg = re.compile(r'\s*
]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s*
', re.IGNORECASE) + blanklines = blankreg.findall(html) + lines = linereg.findall(html) + if len(lines) > 1: + self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank") + if float(len(blanklines)) / float(len(lines)) > 0.40: + self.log("deleting blank lines") + html = blankreg.sub('', html) + # Arrange line feeds and tags so the line_length and no_markup functions work correctly + html = re.sub(r"\s*", "\n", html) + html = re.sub(r"\s*\s*", "\n
", html) + + # some lit files don't have any
tags or equivalent, check and + # mark up line endings if required before proceeding + if self.no_markup(html, 0.1): + self.log("not enough paragraph markers, adding now") + add_markup = re.compile('(?)(\n)') + html = add_markup.sub('
\n', html)
+
+ # detect chapters/sections to match xpath or splitting logic
+ #
+ # Start with most typical chapter headings
+ chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P ]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P'+'
', html)
+
+ return html
\ No newline at end of file
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 35a8a1a9bc..e83216ae1f 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -24,7 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows
from calibre import unicode_path
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
class Link(object):
'''
@@ -491,20 +491,6 @@ class HTMLInput(InputFormatPlugin):
return (None, raw)
def preprocess_html(self, html):
- if not hasattr(self, 'log'):
- from calibre.utils.logging import default_log
- self.log = default_log
- self.log("********* Preprocessing HTML - HTML Input plugin *********")
- # Detect Chapters to match the xpath in the GUI
- chapdetect = re.compile(r'(?=?(br|p|span))(?(br|p|span)[^>]*>)?\s*(?P'+'\g
\n', html)
- # Unwrap lines using punctation if the median length of all lines is less than 150
- #
- # Insert extra line feeds so the line length regex functions properly
- html = re.sub(r"
)', re.IGNORECASE) - blankreg = re.compile(r'\s*
]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s*
', re.IGNORECASE) - blanklines = blankreg.findall(html) - lines = linereg.findall(html) - if len(lines) > 1: - self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank") - if float(len(blanklines)) / float(len(lines)) > 0.40: - self.log("deleting blank lines") - html = blankreg.sub('', html) - # Arrange line feeds and tags so the line_length and no_markup functions work correctly - html = re.sub(r"\s*", "\n", html) - - # some lit files don't have anytags or equivalent, check and - # mark up line endings if required before proceeding - if no_markup(html, 0.1): - self.log("not enough paragraph markers, adding now") - add_markup = re.compile('(?)(\n)') - html = add_markup.sub('
\n', html)
-
- # detect chapters/sections to match xpath or splitting logic
- #
- # Mark split points based on embedded links
- chaplink = re.compile(r']*>\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P ]*>)\s*(<(i|b|u)>){0,2}\s*(]*>)?\s*(?P'+'
', html)
-
+ preprocessor = PreProcessor(html)
+ html = preprocessor(html)
return html
diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py
index 3ae9f8ccca..c151551866 100644
--- a/src/calibre/ebooks/pdb/pdf/reader.py
+++ b/src/calibre/ebooks/pdb/pdf/reader.py
@@ -21,7 +21,7 @@ class Reader(FormatReader):
self.options = options
setattr(self.options, 'new_pdf_engine', False)
setattr(self.options, 'no_images', False)
- setattr(self.options, 'unwrap_factor', 0.5)
+ setattr(self.options, 'unwrap_factor', 0.45)
def extract_content(self, output_dir):
self.log.info('Extracting PDF...')
diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py
index 113c3d99d8..14b3552b04 100644
--- a/src/calibre/ebooks/pdf/input.py
+++ b/src/calibre/ebooks/pdf/input.py
@@ -25,7 +25,7 @@ class PDFInput(InputFormatPlugin):
OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
help=_('Scale used to determine the length at which a line should '
'be unwrapped. Valid values are a decimal between 0 and 1. The '
- 'default is 0.45, this is the median line length.')),
+ 'default is 0.45, just below the median line length.')),
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
help=_('Use the new PDF conversion engine.'))
])