From eb896d010f4e1661b35664fa51f04e94ac3fa5f3 Mon Sep 17 00:00:00 2001
From: John Schember <john@nachtimwald.com>
Date: Sat, 4 Jul 2009 14:14:53 -0400
Subject: [PATCH] PDF Input: User can specify regex to use to remove header and
 footer. Preprocessor: Able to use options from input plugins.

---
 src/calibre/ebooks/conversion/plumber.py    |  2 +-
 src/calibre/ebooks/conversion/preprocess.py | 33 +++++++++++++--------
 src/calibre/ebooks/pdf/input.py             | 19 +++++++-----
 src/calibre/gui2/convert/pdf_input.py       | 32 +++++++++++++++++++-
 src/calibre/gui2/convert/pdf_input.ui       | 30 +++++++++++++++----
 5 files changed, 90 insertions(+), 26 deletions(-)
diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py
index 11975094e3..77ae507867 100644
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -694,7 +694,7 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
     '''
     from calibre.ebooks.oeb.base import OEBBook
     html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
-            opts.preprocess_html, getattr(opts, 'pdf_line_length', 0.5))
+            opts.preprocess_html, opts)
     oeb = OEBBook(log, html_preprocessor,
             pretty_print=opts.pretty_print, input_encoding=encoding)
     if not populate:
diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py
index 43bb52b8ad..f9788fdba8 100644
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -140,8 +140,6 @@ class HTMLPreProcessor(object):
                   (re.compile(u'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), lambda match: ' '),
                   # Connect paragraphs split by -
                   (re.compile(u'(?<=[^\s][-–])[\s]*(</p>)*[\s]*(<p>)*\s*(?=[^\s])'), lambda match: ''),
-                  # Remove - that splits words
-                  (re.compile(u'(?<=[^\s])[-–]+(?=[^\s])'), lambda match: ''),
                   # Add space before and after italics
                   (re.compile(u'(?<!“)<i>'), lambda match: ' <i>'),
                   (re.compile(r'</i>(?=\w)'), lambda match: '</i> '),
@@ -163,10 +161,10 @@ class HTMLPreProcessor(object):
                       lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
                      ]
     def __init__(self, input_plugin_preprocess, plugin_preprocess,
-            pdf_line_length):
+            extra_opts=None):
         self.input_plugin_preprocess = input_plugin_preprocess
         self.plugin_preprocess = plugin_preprocess
-        self.pdf_line_length = pdf_line_length
+        self.extra_opts = extra_opts  # options object (may be None); its attributes are read with getattr() when the pdftohtml rules are built
 
     def is_baen(self, src):
         return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
@@ -187,15 +185,26 @@ class HTMLPreProcessor(object):
         elif self.is_book_designer(html):
             rules = self.BOOK_DESIGNER
         elif self.is_pdftohtml(html):
-            length = line_length(html, self.pdf_line_length)
-            line_length_rules = []
-            if length:
-                line_length_rules = [
-                    # Un wrap using punctuation
-                    (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
-                ]
+            start_rules = []
+            end_rules = []
 
-            rules = self.PDFTOHTML + line_length_rules
+            if getattr(self.extra_opts, 'remove_header', None):
+                start_rules.append(
+                    (re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '')
+                )
+            if getattr(self.extra_opts, 'remove_footer', None):
+                start_rules.append(
+                    (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
+                )
+            if getattr(self.extra_opts, 'unwrap_factor', None):
+                length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
+                if length:
+                    end_rules.append(
+                        # Un wrap using punctuation
+                        (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
+                    )
+
+            rules = start_rules + self.PDFTOHTML + end_rules
         else:
             rules = []
         for rule in self.PREPROCESS + rules:
diff --git a/src/calibre/ebooks/pdf/input.py b/src/calibre/ebooks/pdf/input.py
index 3b82becc1f..e17d50869e 100644
--- a/src/calibre/ebooks/pdf/input.py
+++ b/src/calibre/ebooks/pdf/input.py
@@ -20,10 +20,20 @@ class PDFInput(InputFormatPlugin):
     options = set([
         OptionRecommendation(name='no_images', recommended_value=False,
             help=_('Do not extract images from the document')),
-        OptionRecommendation(name='pdf_line_length', recommended_value=0.5,
+        OptionRecommendation(name='unwrap_factor', recommended_value=0.5,
             help=_('Scale used to determine the length at which a line should '
             'be unwrapped. Valid values are a decimal between 0 and 1. The '
             'default is 0.5, this is the median line length.')),
+        OptionRecommendation(name='remove_header', recommended_value=False,
+            help=_('Use a regular expression to try and remove the header.')),
+        OptionRecommendation(name='header_regex',
+            recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
+            help=_('The regular expression to use to remove the header.')),
+        OptionRecommendation(name='remove_footer', recommended_value=False,
+            help=_('Use a regular expression to try and remove the footer.')),
+        OptionRecommendation(name='footer_regex',
+            recommended_value='(?i)(?<=<hr>)((\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?\d+<br>\s*.*?\s*)|(\s*<a name=\d+></a>((<img.+?>)*<br>\s*)?.*?<br>\s*\d+))(?=<br>)',
+            help=_('The regular expression to use to remove the footer.')),
     ])
 
     def convert(self, stream, options, file_ext, log,
@@ -42,12 +52,7 @@ class PDFInput(InputFormatPlugin):
         images = os.listdir(os.getcwd())
         images.remove('index.html')
         for i in images:
-            # Remove the - from the file name because it causes problems.
-            # The reference to the image with the - will be changed to not
-            # include it later in the conversion process.
-            new_i = i.replace('-', '')
-            os.rename(i, new_i)
-            manifest.append((new_i, None))
+            manifest.append((i, None))
         log.debug('Generating manifest...')
         opf.create_manifest(manifest)
 
diff --git a/src/calibre/gui2/convert/pdf_input.py b/src/calibre/gui2/convert/pdf_input.py
index 71e4bc0ef3..bfd658526c 100644
--- a/src/calibre/gui2/convert/pdf_input.py
+++ b/src/calibre/gui2/convert/pdf_input.py
@@ -4,8 +4,13 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, John Schember <john@nachtimwald.com>'
 __docformat__ = 'restructuredtext en'
 
+import re
+
+from PyQt4.Qt import SIGNAL
+
 from calibre.gui2.convert.pdf_input_ui import Ui_Form
 from calibre.gui2.convert import Widget
+from calibre.gui2 import qstring_to_unicode, error_dialog
 
 class PluginWidget(Widget, Ui_Form):
 
@@ -14,6 +19,31 @@ class PluginWidget(Widget, Ui_Form):
 
     def __init__(self, parent, get_option, get_help, db=None, book_id=None):
         Widget.__init__(self, parent, 'pdf_input',
-            ['no_images', 'pdf_line_length'])
+            ['no_images', 'unwrap_factor', 'remove_header', 'header_regex',
+            'remove_footer', 'footer_regex'])
         self.db, self.book_id = db, book_id
         self.initialize_options(get_option, get_help, db, book_id)
+
+        self.opt_header_regex.setEnabled(self.opt_remove_header.isChecked())  # each regex field starts enabled only if its checkbox is checked
+        self.opt_footer_regex.setEnabled(self.opt_remove_footer.isChecked())
+
+        self.connect(self.opt_remove_header, SIGNAL('stateChanged(int)'), self.header_regex_state)  # re-evaluate the field's enabled state whenever its checkbox changes
+        self.connect(self.opt_remove_footer, SIGNAL('stateChanged(int)'), self.footer_regex_state)
+
+    def header_regex_state(self, state):  # slot connected to opt_remove_header's stateChanged(int) signal
+        self.opt_header_regex.setEnabled(state)  # state is the int carried by that signal; passed straight through as the enabled flag
+
+    def footer_regex_state(self, state):  # slot connected to opt_remove_footer's stateChanged(int) signal
+        self.opt_footer_regex.setEnabled(state)  # state is the int carried by that signal; passed straight through as the enabled flag
+
+    def pre_commit_check(self):  # returns True only when both regex fields compile; otherwise shows an error dialog and returns False
+        for x in ('header_regex', 'footer_regex'):  # NOTE(review): both patterns are validated even if the matching remove_* checkbox is unchecked - confirm intended
+            x = getattr(self, 'opt_'+x)  # rebinds x from the option name to the opt_<name> widget
+            try:
+                pat = qstring_to_unicode(x.text())
+                re.compile(pat)  # compiled purely as a validity check; the compiled pattern is discarded
+            except Exception, err:
+                error_dialog(self, _('Invalid regular expression'),
+                             _('Invalid regular expression: %s')%err).exec_()
+                return False  # stop at the first pattern that fails
+        return True
diff --git a/src/calibre/gui2/convert/pdf_input.ui b/src/calibre/gui2/convert/pdf_input.ui
index 35b840ded0..d34c6d404b 100644
--- a/src/calibre/gui2/convert/pdf_input.ui
+++ b/src/calibre/gui2/convert/pdf_input.ui
@@ -14,14 +14,14 @@
    <string>Form</string>
   </property>
   <layout class="QGridLayout" name="gridLayout">
-   <item row="1" column="0">
+   <item row="2" column="0">
     <widget class="QLabel" name="label_2">
      <property name="text">
       <string>Line Un-Wrapping Factor:</string>
      </property>
     </widget>
    </item>
-   <item row="3" column="0">
+   <item row="4" column="0">
     <spacer name="verticalSpacer">
      <property name="orientation">
       <enum>Qt::Vertical</enum>
@@ -34,8 +34,8 @@
      </property>
     </spacer>
    </item>
-   <item row="1" column="1">
-    <widget class="QDoubleSpinBox" name="opt_pdf_line_length">
+   <item row="2" column="1">
+    <widget class="QDoubleSpinBox" name="opt_unwrap_factor">
      <property name="maximum">
       <double>1.000000000000000</double>
      </property>
@@ -47,13 +47,33 @@
      </property>
     </widget>
    </item>
-   <item row="2" column="0">
+   <item row="3" column="0">
     <widget class="QCheckBox" name="opt_no_images">
      <property name="text">
       <string>No Images</string>
      </property>
     </widget>
    </item>
+   <item row="0" column="0">
+    <widget class="QCheckBox" name="opt_remove_header">
+     <property name="text">
+      <string>Remove Header</string>
+     </property>
+    </widget>
+   </item>
+   <item row="1" column="0">
+    <widget class="QCheckBox" name="opt_remove_footer">
+     <property name="text">
+      <string>Remove Footer</string>
+     </property>
+    </widget>
+   </item>
+   <item row="0" column="1">
+    <widget class="QLineEdit" name="opt_header_regex"/>
+   </item>
+   <item row="1" column="1">
+    <widget class="QLineEdit" name="opt_footer_regex"/>
+   </item>
   </layout>
  </widget>
  <resources/>