Get header and footer regex matching working better.

2025-07-09 03:04:10 -04:00 · 2009-10-26 19:55:10 -04:00 · 2009-10-26 19:55:10 -04:00 · 68e3acd43a
commit 68e3acd43a
parent abe52807cb
3 changed files with 10 additions and 11 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -228,17 +228,16 @@ class HTMLPreProcessor(object):
        else:
            rules = []
-        pre_rules = []
+        end_rules = []
        if getattr(self.extra_opts, 'remove_header', None):
-            pre_rules.append(
+            end_rules.append(
                (re.compile(getattr(self.extra_opts, 'header_regex')), lambda match : '')
            )
        if getattr(self.extra_opts, 'remove_footer', None):
-            pre_rules.append(
+            end_rules.append(
                (re.compile(getattr(self.extra_opts, 'footer_regex')), lambda match : '')
            )
-
+        
        end_rules = []
        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
            length = line_length(html, getattr(self.extra_opts, 'unwrap_factor'))
            if length:
@@ -247,7 +246,7 @@ class HTMLPreProcessor(object):
                    (re.compile(r'(?<=.{%i}[a-z\.,;:)-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
                )
-        for rule in self.PREPROCESS + pre_rules + rules + end_rules:
+        for rule in self.PREPROCESS + rules + end_rules:
            html = rule[0].sub(rule[1], html)
        # Handle broken XHTML w/ SVG (ugh)
--- a/src/calibre/ebooks/oeb/iterator.py
+++ b/src/calibre/ebooks/oeb/iterator.py
@@ -123,7 +123,7 @@ class EbookIterator(object):
                                else:
                                    print 'Loaded embedded font:', repr(family)
-    def __enter__(self, raw_only=False):
+    def __enter__(self, processed=False):
        self.delete_on_exit = []
        self._tdir = TemporaryDirectory('_ebook_iter')
        self.base  = self._tdir.__enter__()
@@ -140,7 +140,7 @@ class EbookIterator(object):
                plumber.opts, plumber.input_fmt, self.log,
                {}, self.base)
-        if not raw_only and plumber.input_fmt.lower() in ('pdf', 'rb'):
+        if processed or plumber.input_fmt.lower() in ('pdf', 'rb'):
            self.pathtoopf = create_oebbook(self.log, self.pathtoopf, plumber.opts,
                    plumber.input_plugin)
        if hasattr(self.pathtoopf, 'manifest'):
--- a/src/calibre/gui2/convert/regex_builder.py
+++ b/src/calibre/gui2/convert/regex_builder.py
@@ -87,12 +87,12 @@ class RegexBuilder(QDialog, Ui_RegexBuilder):
    def open_book(self, pathtoebook):
        self.iterator = EbookIterator(pathtoebook)
-        self.iterator.__enter__(raw_only=True)
+        self.iterator.__enter__(processed=True)
        text = [u'']
        for path in self.iterator.spine:
-            html = open(path, 'rb').read().decode(path.encoding, 'replace')
+            html = open(path, 'rb').read().decode('utf-8', 'replace')
            text.append(html)
-        self.preview.setPlainText('\n\n'.join(text))
+        self.preview.setPlainText('\n---\n'.join(text))
    def button_clicked(self, button):
        if button == self.button_box.button(QDialogButtonBox.Open):