From 794eba4b46fa0764a6a6dd0b3b1175a34574d39b Mon Sep 17 00:00:00 2001 From: John Schember Date: Wed, 8 Jul 2009 07:55:08 -0400 Subject: [PATCH] Header and footer removal by regex moved to from pdf input to structure detection in plumber. --- src/calibre/ebooks/conversion/plumber.py | 25 +++++++++ src/calibre/ebooks/conversion/preprocess.py | 25 ++++----- src/calibre/ebooks/pdf/input.py | 10 ---- src/calibre/gui2/convert/pdf_input.py | 32 +----------- src/calibre/gui2/convert/pdf_input.ui | 28 ++-------- .../gui2/convert/structure_detection.py | 18 +++++-- .../gui2/convert/structure_detection.ui | 52 ++++++++++++++++--- 7 files changed, 104 insertions(+), 86 deletions(-) diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index dc6c0f8b52..3c52ec2d7b 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -315,6 +315,31 @@ OptionRecommendation(name='preprocess_html', ) ), +OptionRecommendation(name='remove_header', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Use a regular expression to try and remove the header.' + ) + ), + +OptionRecommendation(name='header_regex', + recommended_value='(?i)(?<=
)((\s*(()*
\s*)?\d+
\s*.*?\s*)|(\s*(()*
\s*)?.*?
\s*\d+))(?=
)', + level=OptionRecommendation.LOW, + help=_('The regular expression to use to remove the header.' + ) + ), + +OptionRecommendation(name='remove_footer', + recommended_value=False, level=OptionRecommendation.LOW, + help=_('Use a regular expression to try and remove the footer.' + ) + ), + +OptionRecommendation(name='footer_regex', + recommended_value='(?i)(?<=
)((\s*(()*
\s*)?\d+
\s*.*?\s*)|(\s*(()*
\s*)?.*?
\s*\d+))(?=
)', + level=OptionRecommendation.LOW, + help=_('The regular expression to use to remove the footer.' + ) + ), OptionRecommendation(name='read_metadata_from_opf', recommended_value=None, level=OptionRecommendation.LOW, diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index f9788fdba8..69d6f1e511 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -185,17 +185,7 @@ class HTMLPreProcessor(object): elif self.is_book_designer(html): rules = self.BOOK_DESIGNER elif self.is_pdftohtml(html): - start_rules = [] end_rules = [] - - if getattr(self.extra_opts, 'remove_header', None): - start_rules.append( - (re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '') - ) - if getattr(self.extra_opts, 'remove_footer', None): - start_rules.append( - (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '') - ) if getattr(self.extra_opts, 'unwrap_factor', None): length = line_length(html, getattr(self.extra_opts, 'unwrap_factor')) if length: @@ -204,10 +194,21 @@ class HTMLPreProcessor(object): (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P)?\s*()\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines), ) - rules = start_rules + self.PDFTOHTML + end_rules + rules = self.PDFTOHTML + end_rules else: rules = [] - for rule in self.PREPROCESS + rules: + + pre_rules = [] + if getattr(self.extra_opts, 'remove_header', None): + pre_rules.append( + (re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '') + ) + if getattr(self.extra_opts, 'remove_footer', None): + pre_rules.append( + (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '') + ) + + for rule in self.PREPROCESS + pre_rules + rules: html = rule[0].sub(rule[1], html) # Handle broken XHTML w/ SVG (ugh) diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py index e17d50869e..58abbd635c 100644 --- a/src/calibre/ebooks/pdf/input.py +++ b/src/calibre/ebooks/pdf/input.py @@ -24,16 +24,6 @@ class PDFInput(InputFormatPlugin): help=_('Scale used to determine the length at which a line should ' 'be unwrapped. Valid values are a decimal between 0 and 1. The ' 'default is 0.5, this is the median line length.')), - OptionRecommendation(name='remove_header', recommended_value=False, - help=_('Use a regular expression to try and remove the header.')), - OptionRecommendation(name='header_regex', - recommended_value='(?i)(?<=
)((\s*(()*
\s*)?\d+
\s*.*?\s*)|(\s*(()*
\s*)?.*?
\s*\d+))(?=
)', - help=_('The regular expression to use to remove the header.')), - OptionRecommendation(name='remove_footer', recommended_value=False, - help=_('Use a regular expression to try and remove the footer.')), - OptionRecommendation(name='footer_regex', - recommended_value='(?i)(?<=
)((\s*(()*
\s*)?\d+
\s*.*?\s*)|(\s*(()*
\s*)?.*?
\s*\d+))(?=
)', - help=_('The regular expression to use to remove the footer.')), ]) def convert(self, stream, options, file_ext, log, diff --git a/src/calibre/gui2/convert/pdf_input.py b/src/calibre/gui2/convert/pdf_input.py index bfd658526c..e4a9541823 100644 --- a/src/calibre/gui2/convert/pdf_input.py +++ b/src/calibre/gui2/convert/pdf_input.py @@ -4,13 +4,8 @@ __license__ = 'GPL 3' __copyright__ = '2009, John Schember ' __docformat__ = 'restructuredtext en' -import re - -from PyQt4.Qt import SIGNAL - from calibre.gui2.convert.pdf_input_ui import Ui_Form from calibre.gui2.convert import Widget -from calibre.gui2 import qstring_to_unicode, error_dialog class PluginWidget(Widget, Ui_Form): @@ -19,31 +14,6 @@ class PluginWidget(Widget, Ui_Form): def __init__(self, parent, get_option, get_help, db=None, book_id=None): Widget.__init__(self, parent, 'pdf_input', - ['no_images', 'unwrap_factor', 'remove_header', 'header_regex', - 'remove_footer', 'footer_regex']) + ['no_images', 'unwrap_factor']) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) - - self.opt_header_regex.setEnabled(self.opt_remove_header.isChecked()) - self.opt_footer_regex.setEnabled(self.opt_remove_footer.isChecked()) - - self.connect(self.opt_remove_header, SIGNAL('stateChanged(int)'), self.header_regex_state) - self.connect(self.opt_remove_footer, SIGNAL('stateChanged(int)'), self.footer_regex_state) - - def header_regex_state(self, state): - self.opt_header_regex.setEnabled(state) - - def footer_regex_state(self, state): - self.opt_footer_regex.setEnabled(state) - - def pre_commit_check(self): - for x in ('header_regex', 'footer_regex'): - x = getattr(self, 'opt_'+x) - try: - pat = qstring_to_unicode(x.text()) - re.compile(pat) - except Exception, err: - error_dialog(self, _('Invalid regular expression'), - _('Invalid regular expression: %s')%err).exec_() - return False - return True diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui index d34c6d404b..40f480b15d 100644 --- a/src/calibre/gui2/convert/pdf_input.ui +++ b/src/calibre/gui2/convert/pdf_input.ui @@ -14,14 +14,14 @@ Form - + Line Un-Wrapping Factor: - + Qt::Vertical @@ -34,7 +34,7 @@ - + 1.000000000000000 @@ -47,33 +47,13 @@ - + No Images - - - - Remove Header - - - - - - - Remove Footer - - - - - - - - - diff --git a/src/calibre/gui2/convert/structure_detection.py b/src/calibre/gui2/convert/structure_detection.py index 66dff86aca..ee0a389478 100644 --- a/src/calibre/gui2/convert/structure_detection.py +++ b/src/calibre/gui2/convert/structure_detection.py @@ -6,10 +6,13 @@ __license__ = 'GPL v3' __copyright__ = '2009, Kovid Goyal ' __docformat__ = 'restructuredtext en' +import re + +from PyQt4.Qt import SIGNAL from calibre.gui2.convert.structure_detection_ui import Ui_Form from calibre.gui2.convert import Widget -from calibre.gui2 import error_dialog +from calibre.gui2 import error_dialog, qstring_to_unicode class StructureDetectionWidget(Widget, Ui_Form): @@ -23,7 +26,8 @@ class StructureDetectionWidget(Widget, Ui_Form): ['chapter', 'chapter_mark', 'remove_first_image', 'insert_metadata', 'page_breaks_before', - 'preprocess_html'] + 'preprocess_html', 'remove_header', 'header_regex', + 'remove_footer', 'footer_regex'] ) self.db, self.book_id = db, book_id self.initialize_options(get_option, get_help, db, book_id) @@ -31,8 +35,16 @@ class StructureDetectionWidget(Widget, Ui_Form): self.opt_page_breaks_before.set_msg(_('Insert page breaks before ' '(XPath expression):')) - def pre_commit_check(self): + for x in ('header_regex', 'footer_regex'): + x = getattr(self, 'opt_'+x) + try: + pat = qstring_to_unicode(x.text()) + re.compile(pat) + except Exception, err: + error_dialog(self, _('Invalid regular expression'), + _('Invalid regular expression: %s')%err).exec_() + return False for x in ('chapter', 'page_breaks_before'): x = getattr(self, 'opt_'+x) if not x.check(): diff --git a/src/calibre/gui2/convert/structure_detection.ui b/src/calibre/gui2/convert/structure_detection.ui index 768b430c5a..eebc0f0d53 100644 --- a/src/calibre/gui2/convert/structure_detection.ui +++ b/src/calibre/gui2/convert/structure_detection.ui @@ -14,6 +14,9 @@ Form + + + @@ -62,20 +65,27 @@ - + + + + &Footer regular expression: + + + opt_footer_regex + + + + &Preprocess input file to possibly improve structure detection - + - - - - + Qt::Vertical @@ -88,6 +98,36 @@ + + + + &Header regular expression: + + + opt_header_regex + + + + + + + Remove F&ooter + + + + + + + Remove H&eader + + + + + + + + +