diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 11975094e3..77ae507867 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -694,7 +694,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
'''
from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
- opts.preprocess_html, getattr(opts, 'pdf_line_length', 0.5))
+ opts.preprocess_html, opts)
oeb = OEBBook(log, html_preprocessor,
pretty_print=opts.pretty_print, input_encoding=encoding)
if not populate:
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 43bb52b8ad..f9788fdba8 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -140,8 +140,6 @@ class HTMLPreProcessor(object):
(re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
# Connect paragraphs split by -
(re.compile(u'(?<=[^\s][-–])[\s]*(
)*[\s]*()*\s*(?=[^\s])'), lambda match: ''),
- # Remove - that splits words
- (re.compile(u'(?<=[^\s])[-–]+(?=[^\s])'), lambda match: ''),
# Add space before and after italics
(re.compile(u'(?'), lambda match: ' '),
(re.compile(r'(?=\w)'), lambda match: ' '),
@@ -163,10 +161,10 @@ class HTMLPreProcessor(object):
lambda match : '
%s
'%(match.group(1),)),
]
def __init__(self, input_plugin_preprocess, plugin_preprocess,
- pdf_line_length):
+ extra_opts=None):  # optional options object; may be None, in which case the getattr() lookups on it fall back to their defaults
self.input_plugin_preprocess = input_plugin_preprocess
self.plugin_preprocess = plugin_preprocess
- self.pdf_line_length = pdf_line_length
+ self.extra_opts = extra_opts  # stored as-is; remove_header, remove_footer and unwrap_factor are read later via getattr(self.extra_opts, ...)
def is_baen(self, src):
return re.compile(r'(i|b|u)>)?\s*()\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
- ]
+ start_rules = []
+ end_rules = []
- rules = self.PDFTOHTML + line_length_rules
+ if getattr(self.extra_opts, 'remove_header', None):
+ start_rules.append(
+ (re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '')
+ )
+ if getattr(self.extra_opts, 'remove_footer', None):
+ start_rules.append(
+ (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
+ )
+ if getattr(self.extra_opts, 'unwrap_factor', None):
+ length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
+ if length:
+ end_rules.append(
+ # Unwrap using punctuation
+ (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P(i|b|u)>)?\s*()\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
+ )
+
+ rules = start_rules + self.PDFTOHTML + end_rules
else:
rules = []
for rule in self.PREPROCESS + rules:
diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py
index 3b82becc1f..e17d50869e 100644
--- a/src/calibre/ebooks/pdf/input.py
+++ b/src/calibre/ebooks/pdf/input.py
@@ -20,10 +20,20 @@ class PDFInput(InputFormatPlugin):
options = set([
OptionRecommendation(name='no_images', recommended_value=False,
help=_('Do not extract images from the document')),
- OptionRecommendation(name='pdf_line_length', recommended_value=0.5,
+ OptionRecommendation(name='unwrap_factor', recommended_value=0.5,
help=_('Scale used to determine the length at which a line should '
'be unwrapped. Valid values are a decimal between 0 and 1. The '
'default is 0.5, this is the median line length.')),
+ OptionRecommendation(name='remove_header', recommended_value=False,
+ help=_('Use a regular expression to try and remove the header.')),
+ OptionRecommendation(name='header_regex',
+ recommended_value='(?i)(?<=
)((\s*(()*
\s*)?\d+
\s*.*?\s*)|(\s*(()*
\s*)?.*?
\s*\d+))(?=
)',
+ help=_('The regular expression to use to remove the header.')),
+ OptionRecommendation(name='remove_footer', recommended_value=False,
+ help=_('Use a regular expression to try and remove the footer.')),
+ OptionRecommendation(name='footer_regex',
+ recommended_value='(?i)(?<=
)((\s*(()*
\s*)?\d+
\s*.*?\s*)|(\s*(()*
\s*)?.*?
\s*\d+))(?=
)',
+ help=_('The regular expression to use to remove the footer.')),
])
def convert(self, stream, options, file_ext, log,
@@ -42,12 +52,7 @@ class PDFInput(InputFormatPlugin):
images = os.listdir(os.getcwd())
images.remove('index.html')
for i in images:
- # Remove the - from the file name because it causes problems.
- # The reference to the image with the - will be changed to not
- # include it later in the conversion process.
- new_i = i.replace('-', '')
- os.rename(i, new_i)
- manifest.append((new_i, None))
+ manifest.append((i, None))
log.debug('Generating manifest...')
opf.create_manifest(manifest)
diff --git a/src/calibre/gui2/convert/pdf_input.py b/src/calibre/gui2/convert/pdf_input.py
index 71e4bc0ef3..bfd658526c 100644
--- a/src/calibre/gui2/convert/pdf_input.py
+++ b/src/calibre/gui2/convert/pdf_input.py
@@ -4,8 +4,13 @@ __license__ = 'GPL 3'
__copyright__ = '2009, John Schember '
__docformat__ = 'restructuredtext en'
+import re
+
+from PyQt4.Qt import SIGNAL
+
from calibre.gui2.convert.pdf_input_ui import Ui_Form
from calibre.gui2.convert import Widget
+from calibre.gui2 import qstring_to_unicode, error_dialog
class PluginWidget(Widget, Ui_Form):
@@ -14,6 +19,31 @@ class PluginWidget(Widget, Ui_Form):
def __init__(self, parent, get_option, get_help, db=None, book_id=None):
Widget.__init__(self, parent, 'pdf_input',
- ['no_images', 'pdf_line_length'])
+ ['no_images', 'unwrap_factor', 'remove_header', 'header_regex',  # option names handed to Widget.__init__ for the 'pdf_input' page
+ 'remove_footer', 'footer_regex'])
self.db, self.book_id = db, book_id
self.initialize_options(get_option, get_help, db, book_id)
+
+ self.opt_header_regex.setEnabled(self.opt_remove_header.isChecked())  # initial state: regex field is enabled only when its checkbox is checked
+ self.opt_footer_regex.setEnabled(self.opt_remove_footer.isChecked())  # same initial sync for the footer pair
+
+ self.connect(self.opt_remove_header, SIGNAL('stateChanged(int)'), self.header_regex_state)  # later checkbox changes are forwarded to header_regex_state
+ self.connect(self.opt_remove_footer, SIGNAL('stateChanged(int)'), self.footer_regex_state)  # later checkbox changes are forwarded to footer_regex_state
+
+ def header_regex_state(self, state):  # Slot connected to opt_remove_header's stateChanged(int) signal
+ self.opt_header_regex.setEnabled(state)  # the int state from the signal is passed straight to setEnabled
+
+ def footer_regex_state(self, state):  # Slot connected to opt_remove_footer's stateChanged(int) signal
+ self.opt_footer_regex.setEnabled(state)  # the int state from the signal is passed straight to setEnabled
+
+ def pre_commit_check(self):  # Try to compile both regex fields; returns False on the first failure, True otherwise
+ for x in ('header_regex', 'footer_regex'):  # both fields are checked; the remove_header/remove_footer checkboxes are not consulted
+ x = getattr(self, 'opt_'+x)  # rebinds x from the option name to the matching opt_<name> attribute
+ try:
+ pat = qstring_to_unicode(x.text())
+ re.compile(pat)  # compiled only to validate the pattern; the result is discarded
+ except Exception, err:
+ error_dialog(self, _('Invalid regular expression'),
+ _('Invalid regular expression: %s')%err).exec_()
+ return False  # stop at the first field that raised; the remaining field is not checked
+ return True
diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui
index 35b840ded0..d34c6d404b 100644
--- a/src/calibre/gui2/convert/pdf_input.ui
+++ b/src/calibre/gui2/convert/pdf_input.ui
@@ -14,14 +14,14 @@
Form
- -
+
-
Line Un-Wrapping Factor:
- -
+
-
Qt::Vertical
@@ -34,8 +34,8 @@
- -
-
+
-
+
1.000000000000000
@@ -47,13 +47,33 @@
- -
+
-
No Images
+ -
+
+
+ Remove Header
+
+
+
+ -
+
+
+ Remove Footer
+
+
+
+ -
+
+
+ -
+
+