diff --git a/resources/recipes/nspm.recipe b/resources/recipes/nspm.recipe
index 29f2cfc5e3..58b782415b 100644
--- a/resources/recipes/nspm.recipe
+++ b/resources/recipes/nspm.recipe
@@ -6,7 +6,7 @@ nspm.rs
import re
from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag, NavigableString
+from calibre.ebooks.BeautifulSoup import NavigableString
class Nspm(BasicNewsRecipe):
title = 'Nova srpska politicka misao'
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 256bcce6fc..7742a20a21 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -75,6 +75,8 @@ def line_length(format, raw, percent):
linere = re.compile('(?<=
)', re.DOTALL)
elif format == 'pdf':
linere = re.compile('(?<=
).*?(?=
)', re.DOTALL)
+ elif format == 'spanned_html':
+ linere = re.compile('(?<=)', re.DOTALL)
lines = linere.findall(raw)
lengths = []
@@ -223,34 +225,33 @@ class HTMLPreProcessor(object):
# ˙
(re.compile(u'˙\s*()*\s*z', re.UNICODE), lambda match: u'ż'),
(re.compile(u'˙\s*()*\s*Z', re.UNICODE), lambda match: u'Ż'),
-
+
+ # If pdf printed from a browser then the header/footer has a reliable pattern
+ (re.compile(r'((?<=)\s*file:////?[A-Z].*
|file:////?[A-Z].*
(?=\s*
))', re.IGNORECASE), lambda match: ''),
+
+ # Center separator lines
+ (re.compile(u'
\s*(?P([*#•]+\s*)+)\s*
'), lambda match: '\n
' + match.group(1) + '
'),
# Remove page links
(re.compile(r'', re.IGNORECASE), lambda match: ''),
# Remove
tags
- (re.compile(r'', re.IGNORECASE), lambda match: '
'),
- # Replace
with
- (re.compile(r'\s*', re.IGNORECASE), lambda match: ''),
-
- # Remove hyphenation
- (re.compile(r'-\n\r?'), lambda match: ''),
+ (re.compile(r'', re.IGNORECASE), lambda match: '
'),
# Remove gray background
(re.compile(r']+>'), lambda match : ''),
# Detect Chapters to match default XPATH in GUI
- (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?((i|b)>((i|b)>)?)?)?(br|p)[^>]*>\s*(?P(<(i|b)>)?\s*\w+(\s*\w+)?\s*((i|b)>)?\s*(?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
- (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P([A-Z \'"!]{5,})\s*(\d+|\w+)?)(?p[^>]*>|
]*>)\n?((?=()?\s*\w+(\s+\w+)?()?(
]*>|?p[^>]*>))((?P.*)(
]*>|?p[^>]*>)))?'), chap_head),
-
+ (re.compile(r'
\s*(?P(<[ibu]>){0,2}\s*.?(Introduction|Chapter|Epilogue|Prologue|Book|Part|Dedication|Volume|Preface|Acknowledgments)\s*([\d\w-]+\s*){0,3}\s*([ibu]>){0,2})\s*(
\s*){1,3}\s*(?P(<[ibu]>){0,2}(\s*\w+){1,4}\s*([ibu]>){0,2}\s*
)?', re.IGNORECASE), chap_head),
+ # Cover the case where every letter in a chapter title is separated by a space
+ (re.compile(r'
\s*(?P([A-Z]\s+){4,}\s*([\d\w-]+\s*){0,3}\s*)\s*(
\s*){1,3}\s*(?P(<[ibu]>){0,2}(\s*\w+){1,4}\s*([ibu]>){0,2}\s*(
))?'), chap_head),
+
# Have paragraphs show better
(re.compile(r''), lambda match : ''),
# Clean up spaces
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
- # Connect paragraphs split by -
- (re.compile(u'(?<=[^\s][-–])[\s]*(
)*[\s]*()*\s*(?=[^\s])'), lambda match: ''),
# Add space before and after italics
(re.compile(u'(?'), lambda match: ' '),
- (re.compile(r'(?=\w)'), lambda match: ' '),
+ (re.compile(r'(?=\w)'), lambda match: ' '),
]
# Fix Book Designer markup
@@ -327,13 +328,30 @@ class HTMLPreProcessor(object):
import traceback
print 'Failed to parse remove_footer regexp'
traceback.print_exc()
-
+
+ # unwrap hyphenation - moved here so it's executed after header/footer removal
+ if is_pdftohtml:
+ # unwrap visible dashes and hyphens - don't delete them, they are often hyphens
+ # for compound words, formatting, etc
+ end_rules.append((re.compile(u'(?<=[-–—])\s*
\s*(?=[[a-z\d])'), lambda match: ''))
+ # unwrap/delete soft hyphens
+ end_rules.append((re.compile(u'[](\s*
)+\s*(?=[[a-z\d])'), lambda match: ''))
+ # unwrap/delete soft hyphens with formatting
+ end_rules.append((re.compile(u'[]\s*((i|u|b)>)+(\s*
)+\s*(<(i|u|b)>)+\s*(?=[[a-z\d])'), lambda match: ''))
+
+ # Make the more aggressive chapter marking regex optional with the preprocess option to
+ # reduce false positives and move after header/footer removal
+ if getattr(self.extra_opts, 'preprocess_html', None):
+ if is_pdftohtml:
+ end_rules.append((re.compile(r'
\s*(?P(<[ibu]>){0,2}\s*([A-Z \'"!]{3,})\s*([\dA-Z:]+\s){0,4}\s*([ibu]>){0,2})\s*\s*(?P
(<[ibu]>){0,2}(\s*\w+){1,4}\s*([ibu]>){0,2}\s*)?'), chap_head),)
+
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
if length:
+ # print "The pdf line length returned is " + str(length)
end_rules.append(
# Un wrap using punctuation
- (re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P(i|b|u)>)?\s*()\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
+ (re.compile(r'(?<=.{%i}[a-z,;:)\-IA])\s*(?P(i|b|u)>)?\s*(\s*)+\s*(?=(<(i|b|u)>)?\s*[\w\d$(])' % length, re.UNICODE), wrap_lines),
)
for rule in self.PREPROCESS + start_rules:
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
new file mode 100644
index 0000000000..5301f70a16
--- /dev/null
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Kovid Goyal '
+__docformat__ = 'restructuredtext en'
+
+import re
+from calibre.ebooks.conversion.preprocess import line_length
+from calibre.utils.logging import default_log
+
+class PreProcessor(object):
+
+ def __init__(self, log=None):
+ self.log = default_log if log is None else log
+ self.html_preprocess_sections = 0
+ self.found_indents = 0
+
+ def chapter_head(self, match):
+ chap = match.group('chap')
+ title = match.group('title')
+ if not title:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("found " + str(self.html_preprocess_sections) + " chapters. - " + str(chap))
+ return ''+chap+'
\n'
+ else:
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("found " + str(self.html_preprocess_sections) + " chapters & titles. - " + str(chap) + ", " + str(title))
+ return ''+chap+'
\n'+title+'
\n'
+
+ def chapter_break(self, match):
+ chap = match.group('section')
+ styles = match.group('styles')
+ self.html_preprocess_sections = self.html_preprocess_sections + 1
+ self.log("marked " + str(self.html_preprocess_sections) + " section markers based on punctuation. - " + str(chap))
+ return '<'+styles+' style="page-break-before:always">'+chap
+
+ def insert_indent(self, match):
+ pstyle = match.group('formatting')
+ span = match.group('span')
+ self.found_indents = self.found_indents + 1
+ if pstyle:
+ if not span:
+ return ''
+ else:
+ return '
'+span
+ else:
+ if not span:
+ return '
'
+ else:
+ return '
'+span
+
+ def no_markup(self, raw, percent):
+ '''
+ Detects total marked up line endings in the file. raw is the text to
+ inspect. percent is the fraction (clamped to 0-1) of line feeds that should
+ be marked up; returns True when fewer than that are marked up, else None.
+ '''
+ htm_end_ere = re.compile('
', re.DOTALL)
+ line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
+ htm_end = htm_end_ere.findall(raw)
+ line_end = line_end_ere.findall(raw)
+ tot_htm_ends = len(htm_end)
+ tot_ln_fds = len(line_end)
+ self.log("There are " + str(tot_ln_fds) + " total Line feeds, and " + str(tot_htm_ends) + " marked up endings")
+
+ if percent > 1:
+ percent = 1
+ if percent < 0:
+ percent = 0
+
+ min_lns = tot_ln_fds * percent
+ self.log("There must be fewer than " + str(min_lns) + " unmarked lines to add markup")
+ if min_lns > tot_htm_ends:
+ return True
+
+ def __call__(self, html):
+ self.log("********* Preprocessing HTML *********")
+ # Replace series of non-breaking spaces with text-indent
+ txtindent = re.compile(ur'[^>]*)>\s*(?P(]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
+ html = txtindent.sub(self.insert_indent, html)
+ if self.found_indents > 1:
+ self.log("replaced "+str(self.found_indents)+ " nbsp indents with inline styles")
+ # remove remaining non-breaking spaces
+ html = re.sub(ur'\u00a0', ' ', html)
+ # Get rid of empty tags to simplify other processing
+ html = re.sub(ur'\s*\s*', ' ', html)
+ # Get rid of empty span tags
+ html = re.sub(r"\s*]*>\s*", " ", html)
+
+ # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+ linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL)
+ blankreg = re.compile(r'\s*
]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s*
', re.IGNORECASE)
+ blanklines = blankreg.findall(html)
+ lines = linereg.findall(html)
+ if len(lines) > 1:
+ self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
+ if float(len(blanklines)) / float(len(lines)) > 0.40:
+ self.log("deleting blank lines")
+ html = blankreg.sub('', html)
+ # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+ html = re.sub(r"\s*
", "
\n", html)
+ html = re.sub(r"\s*\s*", "\n
", html)
+
+ # some lit files don't have any <p> tags or equivalent (generally just plain text between
+ # <pre> tags), check and mark up line endings if required before proceeding
+ if self.no_markup(html, 0.1):
+ self.log("not enough paragraph markers, adding now")
+ add_markup = re.compile('(?)(\n)')
+ html = add_markup.sub('
\n', html)
+
+ # detect chapters/sections to match xpath or splitting logic
+ heading = re.compile(']*>', re.IGNORECASE)
+ self.html_preprocess_sections = len(heading.findall(html))
+ self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
+ #
+ # Start with most typical chapter headings, get more aggressive until one works
+ if self.html_preprocess_sections < 10:
+ chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}s*(]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*([ibu]>){0,2})\s*()?s*()?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((br|p)>))?', re.IGNORECASE)
+ html = chapdetect.sub(self.chapter_head, html)
+ if self.html_preprocess_sections < 10:
+ self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
+ chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((br|p)>))?', re.UNICODE)
+ html = chapdetect2.sub(self.chapter_head, html)
+
+ if self.html_preprocess_sections < 10:
+ self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
+ chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*()?\s*([ibu]>){0,2}\s*((br|p)>))?', re.UNICODE)
+ html = chapdetect2.sub(self.chapter_head, html)
+
+ # Unwrap lines
+ #
+ self.log("Unwrapping Lines")
+ # Some OCR sourced files have line breaks in the html using a combination of span & p tags
+ # spans are used for hard line breaks, p for new paragraphs. Determine which is used so
+ # that lines can be un-wrapped across page boundaries
+ paras_reg = re.compile(']*>', re.IGNORECASE)
+ spans_reg = re.compile(']*>', re.IGNORECASE)
+ paras = len(paras_reg.findall(html))
+ spans = len(spans_reg.findall(html))
+ if spans > 1:
+ if float(paras) / float(spans) < 0.75:
+ format = 'spanned_html'
+ else:
+ format = 'html'
+ else:
+ format = 'html'
+
+ # Calculate Length
+ length = line_length(format, html, 0.4)
+ self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***")
+ #
+ # Unwrap and/or delete soft-hyphens, hyphens
+ html = re.sub(u'\s*(\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
+ html = re.sub(u'(?<=[-–—])\s*(?=<)(\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])', '', html)
+
+ # Unwrap lines using punctuation if the median length of all lines is less than 200 (NOTE: no such length check is applied below)
+ unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+ html = unwrap.sub(' ', html)
+
+ # If still no sections after unwrapping mark split points on lines with no punctuation
+ if self.html_preprocess_sections < 10:
+ self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
+ #self.log(html)
+ chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P(]*>)?\s*(<[ibu]>){0,2}\s*(]*>)?\s*(<[ibu]>){0,2}\s*(]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*()?([ibu]>){0,2}\s*()?\s*([ibu]>){0,2}\s*()?\s*(p|div)>)', re.IGNORECASE)
+ html = chapdetect3.sub(self.chapter_break, html)
+ # search for places where a first or second level heading is immediately followed by another
+ # top level heading. demote the second heading to h3 to prevent splitting between chapter
+ # headings and titles, images, etc
+ doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE)
+ html = doubleheading.sub('\g'+''+'
', html)
+
+ return html
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index d57bfddd3e..084d48e54b 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -24,7 +24,7 @@ from calibre.constants import islinux, isfreebsd, iswindows
from calibre import unicode_path
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
class Link(object):
'''
@@ -491,20 +491,6 @@ class HTMLInput(InputFormatPlugin):
return (None, raw)
def preprocess_html(self, html):
- if not hasattr(self, 'log'):
- from calibre.utils.logging import default_log
- self.log = default_log
- self.log("********* Preprocessing HTML *********")
- # Detect Chapters to match the xpath in the GUI
- chapdetect = re.compile(r'(?=?(br|p|span))(?(br|p|span)[^>]*>)?\s*(?P(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((i|b)>(i|b)>|(i|b)>)?)(?(p|br|span)[^>]*>)', re.IGNORECASE)
- html = chapdetect.sub(''+'\g'+'
\n', html)
- # Unwrap lines using punctation if the median length of all lines is less than 150
- #
- # Insert extra line feeds so the line length regex functions properly
- html = re.sub(r"
", "
\n", html)
- length = line_length('html', html, 0.4)
- self.log.debug("*** Median length is " + str(length) + " ***")
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
- if length < 150:
- html = unwrap.sub(' ', html)
- return html
+ preprocessor = PreProcessor(log=getattr(self, 'log', None))
+ return preprocessor(html)
+
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 9bf20fb1d4..65f5c607a2 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -6,10 +6,9 @@ __license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal '
__docformat__ = 'restructuredtext en'
-import re
-
from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
+
class LITInput(InputFormatPlugin):
@@ -55,18 +54,6 @@ class LITInput(InputFormatPlugin):
def preprocess_html(self, html):
- self.log("********* Preprocessing HTML *********")
- # Detect Chapters to match the xpath in the GUI
- chapdetect = re.compile(r'(?=?(br|p|span))(?(br|p|span)[^>]*>)?\s*(?P(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((i|b)>(i|b)>|(i|b)>)?)(?(p|br|span)[^>]*>)', re.IGNORECASE)
- html = chapdetect.sub(''+'\g'+'
\n', html)
- # Unwrap lines using punctation if the median length of all lines is less than 150
- #
- # Insert extra line feeds so the line length regex functions properly
- html = re.sub(r"
", "
\n", html)
- length = line_length('html', html, 0.4)
- self.log("*** Median length is " + str(length) + " ***")
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*(span|p|div)>\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
- if length < 150:
- html = unwrap.sub(' ', html)
- return html
+ preprocessor = PreProcessor(log=getattr(self, 'log', None))
+ return preprocessor(html)
diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py
index 487e70c04f..b8dc7a9560 100644
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@@ -3,6 +3,7 @@ __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal '
__docformat__ = 'restructuredtext en'
+import re
from calibre.customize.conversion import InputFormatPlugin
class MOBIInput(InputFormatPlugin):
@@ -37,3 +38,12 @@ class MOBIInput(InputFormatPlugin):
include_meta_content_type=False))
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
return mr.created_opf_path
+
+ def preprocess_html(self, html):
+ # search for places where a first or second level heading is immediately followed by another
+ # top level heading. demote the second heading to h3 to prevent splitting between chapter
+ # headings and titles, images, etc
+ doubleheading = re.compile(r'(?P]*>.+?\s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?)', re.IGNORECASE)
+ html = doubleheading.sub('\g'+''+'
', html)
+ return html
+
diff --git a/src/calibre/ebooks/pdb/pdf/reader.py b/src/calibre/ebooks/pdb/pdf/reader.py
index 3ae9f8ccca..c151551866 100644
--- a/src/calibre/ebooks/pdb/pdf/reader.py
+++ b/src/calibre/ebooks/pdb/pdf/reader.py
@@ -21,7 +21,7 @@ class Reader(FormatReader):
self.options = options
setattr(self.options, 'new_pdf_engine', False)
setattr(self.options, 'no_images', False)
- setattr(self.options, 'unwrap_factor', 0.5)
+ setattr(self.options, 'unwrap_factor', 0.45)
def extract_content(self, output_dir):
self.log.info('Extracting PDF...')
diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py
index 64a089281e..14b3552b04 100644
--- a/src/calibre/ebooks/pdf/input.py
+++ b/src/calibre/ebooks/pdf/input.py
@@ -22,10 +22,10 @@ class PDFInput(InputFormatPlugin):
options = set([
OptionRecommendation(name='no_images', recommended_value=False,
help=_('Do not extract images from the document')),
- OptionRecommendation(name='unwrap_factor', recommended_value=0.5,
+ OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
help=_('Scale used to determine the length at which a line should '
'be unwrapped. Valid values are a decimal between 0 and 1. The '
- 'default is 0.5, this is the median line length.')),
+ 'default is 0.45, just below the median line length.')),
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
help=_('Use the new PDF conversion engine.'))
])
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index adda8794ca..000c603c1c 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -7,7 +7,7 @@ import os, glob, re, textwrap
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.preprocess import line_length
+from calibre.ebooks.conversion.utils import PreProcessor
class InlineClass(etree.XSLTExtension):
@@ -229,16 +229,8 @@ class RTFInput(InputFormatPlugin):
res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
if self.options.preprocess_html:
- self.log("********* Preprocessing HTML *********")
- # Detect Chapters to match the xpath in the GUI
- chapdetect = re.compile(r']*>\s*]*>\s*(?P(<(i|b)><(i|b)>|<(i|b)>)?(.?Chapter|Epilogue|Prologue|Book|Part|Dedication)\s*([\d\w-]+(\s\w+)?)?((i|b)>(i|b)>|(i|b)>)?)\s*\s*
', re.IGNORECASE)
- res = chapdetect.sub(''+'\g'+'
\n', res)
- # Unwrap lines using punctation if the median length of all lines is less than 150
- length = line_length('html', res, 0.4)
- self.log("*** Median length is " + str(length) + " ***")
- unwrap = re.compile(r"(?<=.{%i}[a-z,;:\IA])\s*\s*()?\s*(?P]*>\s*(]*>\s*\s*)
\s*){0,3}\s*]*>\s*(]*>)?\s*" % length, re.UNICODE)
- if length < 150:
- res = unwrap.sub(' ', res)
+ preprocessor = PreProcessor(log=getattr(self, 'log', None))
+ res = preprocessor(res)
f.write(res)
self.write_inline_css(inline_class)
stream.seek(0)
diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui
index 626c68ea63..b2ee421922 100644
--- a/src/calibre/gui2/convert/pdf_input.ui
+++ b/src/calibre/gui2/convert/pdf_input.ui
@@ -46,7 +46,7 @@
0.010000000000000
- 0.500000000000000
+ 0.450000000000000