From eb896d010f4e1661b35664fa51f04e94ac3fa5f3 Mon Sep 17 00:00:00 2001 From: John Schember Date: Sat, 4 Jul 2009 14:14:53 -0400 Subject: [PATCH] PDF Input: User can specify regex to use to remove header and footer. Preprocessor: Able to use options from input plugins. --- src/calibre/ebooks/conversion/plumber.py | 2 +- src/calibre/ebooks/conversion/preprocess.py | 33 +++++++++++++-------- src/calibre/ebooks/pdf/input.py | 19 +++++++----- src/calibre/gui2/convert/pdf_input.py | 32 +++++++++++++++++++- src/calibre/gui2/convert/pdf_input.ui | 30 +++++++++++++++---- 5 files changed, 90 insertions(+), 26 deletions(-) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 11975094e3..77ae507867 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -694,7 +694,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None, ''' from calibre.ebooks.oeb.base import OEBBook html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, - opts.preprocess_html, getattr(opts, 'pdf_line_length', 0.5)) + opts.preprocess_html, opts) oeb = OEBBook(log, html_preprocessor, pretty_print=opts.pretty_print, input_encoding=encoding) if not populate: diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index 43bb52b8ad..f9788fdba8 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -140,8 +140,6 @@ class HTMLPreProcessor(object): (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '), # Connect paragraphs split by - (re.compile(u'(?<=[^\s][-–])[\s]*(

)*[\s]*(

)*\s*(?=[^\s])'), lambda match: ''), - # Remove - that splits words - (re.compile(u'(?<=[^\s])[-–]+(?=[^\s])'), lambda match: ''), # Add space before and after italics (re.compile(u'(?'), lambda match: ' '), (re.compile(r'(?=\w)'), lambda match: ' '), @@ -163,10 +161,10 @@ class HTMLPreProcessor(object): lambda match : '

%s

'%(match.group(1),)), ] def __init__(self, input_plugin_preprocess, plugin_preprocess, - pdf_line_length): + extra_opts=None): self.input_plugin_preprocess = input_plugin_preprocess self.plugin_preprocess = plugin_preprocess - self.pdf_line_length = pdf_line_length + self.extra_opts = extra_opts def is_baen(self, src): return re.compile(r')?\s*()\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), - ] + start_rules = [] + end_rules = [] - rules = self.PDFTOHTML + line_length_rules + if getattr(self.extra_opts, 'remove_header', None): + start_rules.append( + (re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '') + ) + if getattr(self.extra_opts, 'remove_footer', None): + start_rules.append( + (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '') + ) + if getattr(self.extra_opts, 'unwrap_factor', None): + length = line_length(html, getattr(self.extra_opts, 'unwrap_factor')) + if length: + end_rules.append( + # Un wrap using punctuation + (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P)?\s*()\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), + ) + + rules = start_rules + self.PDFTOHTML + end_rules else: rules = [] for rule in self.PREPROCESS + rules: diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index 3b82becc1f..e17d50869e 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/pdf/input.py @@ -20,10 +20,20 @@ class PDFInput(InputFormatPlugin): options = set([ OptionRecommendation(name='no_images', recommended_value=False, help=_('Do not extract images from the document')), - OptionRecommendation(name='pdf_line_length', recommended_value=0.5, + OptionRecommendation(name='unwrap_factor', recommended_value=0.5, help=_('Scale used to determine the length at which a line should ' 'be unwrapped. Valid values are a decimal between 0 and 1. The ' 'default is 0.5, this is the median line length.')), + OptionRecommendation(name='remove_header', recommended_value=False, + help=_('Use a regular expression to try and remove the header.')), + OptionRecommendation(name='header_regex', + recommended_value='(?i)(?<=
)((\s*(()*
\s*)?\d+
\s*.*?\s*)|(\s*(()*
\s*)?.*?
\s*\d+))(?=
)', + help=_('The regular expression to use to remove the header.')), + OptionRecommendation(name='remove_footer', recommended_value=False, + help=_('Use a regular expression to try and remove the footer.')), + OptionRecommendation(name='footer_regex', + recommended_value='(?i)(?<=
)((\s*(()*
\s*)?\d+
\s*.*?\s*)|(\s*(()*
\s*)?.*?
\s*\d+))(?=
)', + help=_('The regular expression to use to remove the footer.')), ]) def convert(self, stream, options, file_ext, log, @@ -42,12 +52,7 @@ class PDFInput(InputFormatPlugin): images = os.listdir(os.getcwd()) images.remove('index.html') for i in images: - # Remove the - from the file name because it causes problems. - # The reference to the image with the - will be changed to not - # include it later in the conversion process. - new_i = i.replace('-', '') - os.rename(i, new_i) - manifest.append((new_i, None)) + manifest.append((i, None)) log.debug('Generating manifest...') opf.create_manifest(manifest) diff --git a/src/calibre/gui2/convert/pdf_input.py b/src/calibre/gui2/convert/pdf_input.py index 71e4bc0ef3..bfd658526c 100644 --- a/src/calibre/gui2/convert/pdf_input.py +++ b/src/calibre/gui2/convert/pdf_input.py @@ -4,8 +4,13 @@ __license__ = 'GPL 3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' +import re + +from PyQt4.Qt import SIGNAL + from calibre.gui2.convert.pdf_input_ui import Ui_Form from calibre.gui2.convert import Widget +from calibre.gui2 import qstring_to_unicode, error_dialog class PluginWidget(Widget, Ui_Form): @@ -14,6 +19,31 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, 'pdf_input', - ['no_images', 'pdf_line_length']) + ['no_images', 'unwrap_factor', 'remove_header', 'header_regex', + 'remove_footer', 'footer_regex']) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) + + self.opt_header_regex.setEnabled(self.opt_remove_header.isChecked()) + self.opt_footer_regex.setEnabled(self.opt_remove_footer.isChecked()) + + self.connect(self.opt_remove_header, SIGNAL('stateChanged(int)'), self.header_regex_state) + self.connect(self.opt_remove_footer, SIGNAL('stateChanged(int)'), self.footer_regex_state) + + def header_regex_state(self, state): + self.opt_header_regex.setEnabled(state) + + def footer_regex_state(self, state): + self.opt_footer_regex.setEnabled(state) + + def pre_commit_check(self): + for x in ('header_regex', 'footer_regex'): + x = getattr(self, 'opt_'+x) + try: + pat = qstring_to_unicode(x.text()) + re.compile(pat) + except Exception, err: + error_dialog(self, _('Invalid regular expression'), + _('Invalid regular expression: %s')%err).exec_() + return False + return True diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui index 35b840ded0..d34c6d404b 100644 --- a/src/calibre/gui2/convert/pdf_input.ui +++ b/src/calibre/gui2/convert/pdf_input.ui @@ -14,14 +14,14 @@ Form - + Line Un-Wrapping Factor: - + Qt::Vertical @@ -34,8 +34,8 @@ - - + + 1.000000000000000 @@ -47,13 +47,33 @@ - + No Images + + + + Remove Header + + + + + + + Remove Footer + + + + + + + + +