Add/remove header wizard: When running on PDF input, replace non-breaking spaces with normal spaces, since it is hard to write regexps to match non-breaking spaces with the regex builder wizard.

2025-12-07 21:55:07 -05:00 · 2010-08-25 10:02:13 -06:00 · 2010-08-25 10:02:13 -06:00 · 2635f836fb
commit 2635f836fb
parent 364c6101ec
2 changed files with 42 additions and 13 deletions
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -5,8 +5,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import functools
-import re
+import functools, re

 from calibre import entity_to_unicode

@ -73,7 +72,7 @@ def line_length(format, raw, percent):
    '''
    raw = raw.replace('&nbsp;', ' ')
    if format == 'html':
-    	linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
+        linere = re.compile('(?<=<p).*?(?=</p>)', re.DOTALL)
    elif format == 'pdf':
        linere = re.compile('(?<=<br>).*?(?=<br>)', re.DOTALL)
    lines = linere.findall(raw)
@ -205,9 +204,6 @@ class HTMLPreProcessor(object):
                  # Remove gray background
                  (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),

-                  # Remove non breaking spaces
-                  (re.compile(ur'\u00a0'), lambda match : ' '),
-
                  # Detect Chapters to match default XPATH in GUI
                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>(<(i|b)>(<(i|b)>)?)?(.?Chapter|Epilogue|Prologue|Book|Part)\s*([\d\w-]+(\s\w+)?)?(</(i|b)>(</(i|b)>)?)?)</?(br|p)[^>]*>\s*(?P<title>(<(i|b)>)?\s*\w+(\s*\w+)?\s*(</(i|b)>)?\s*(</?(br|p)[^>]*>))?', re.IGNORECASE), chap_head),
                  (re.compile(r'(?=<(/?br|p))(<(/?br|p)[^>]*)?>\s*(?P<chap>([A-Z \'"!]{5,})\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?'), chap_head),
@ -254,20 +250,27 @@ class HTMLPreProcessor(object):
    def is_pdftohtml(self, src):
        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]

-    def __call__(self, html, remove_special_chars=None):
+    def __call__(self, html, remove_special_chars=None,
+            get_preprocess_html=False):
        if remove_special_chars is not None:
            html = remove_special_chars.sub('', html)
        html = html.replace('\0', '')
+        is_pdftohtml = self.is_pdftohtml(html)
        if self.is_baen(html):
            rules = []
        elif self.is_book_designer(html):
            rules = self.BOOK_DESIGNER
-        elif self.is_pdftohtml(html):
+        elif is_pdftohtml:
            rules = self.PDFTOHTML
        else:
            rules = []

-        if not self.extra_opts.keep_ligatures:
+        start_rules = []
+        if is_pdftohtml:
+            # Remove non breaking spaces
+            start_rules.append((re.compile(ur'\u00a0'), lambda match : ' '))
+
+        if not getattr(self.extra_opts, 'keep_ligatures', False):
            html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)

        end_rules = []
@ -299,9 +302,35 @@ class HTMLPreProcessor(object):
                    (re.compile(r'(?<=.{%i}[a-z\.,;:)\-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?\s*[\w\d(])' % length, re.UNICODE), wrap_lines),
                )

-        for rule in self.PREPROCESS + rules + end_rules:
+        for rule in self.PREPROCESS + start_rules:
            html = rule[0].sub(rule[1], html)

+        if get_preprocess_html:
+            return html
+
+        def dump(raw, where):
+            import os
+            dp = getattr(self.extra_opts, 'debug_pipeline', None)
+            if dp and os.path.exists(dp):
+                odir = os.path.join(dp, 'input')
+                if os.path.exists(odir):
+                    odir = os.path.join(odir, where)
+                    if not os.path.exists(odir):
+                        os.makedirs(odir)
+                    name, i = None, 0
+                    while not name or os.path.exists(os.path.join(odir, name)):
+                        i += 1
+                        name = '%04d.html'%i
+                    with open(os.path.join(odir, name), 'wb') as f:
+                        f.write(raw.encode('utf-8'))
+
+        #dump(html, 'pre-preprocess')
+
+        for rule in rules + end_rules:
+            html = rule[0].sub(rule[1], html)
+
+        #dump(html, 'post-preprocess')
+
        # Handle broken XHTML w/ SVG (ugh)
        if 'svg:' in html and SVG_NS not in html:
            html = html.replace(
--- a/src/calibre/gui2/convert/regex_builder.py
+++ b/src/calibre/gui2/convert/regex_builder.py
@ -14,7 +14,7 @@ from calibre.gui2.convert.regex_builder_ui import Ui_RegexBuilder
 from calibre.gui2.convert.xexp_edit_ui import Ui_Form as Ui_Edit
 from calibre.gui2 import error_dialog, choose_files
 from calibre.ebooks.oeb.iterator import EbookIterator
-from calibre.ebooks.conversion.preprocess import convert_entities
+from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
 from calibre.gui2.dialogs.choose_format import ChooseFormatDialog

 class RegexBuilder(QDialog, Ui_RegexBuilder):
@ -91,10 +91,10 @@ class RegexBuilder(QDialog, Ui_RegexBuilder):
        self.iterator = EbookIterator(pathtoebook)
        self.iterator.__enter__(only_input_plugin=True)
        text = [u'']
-        ent_pat = re.compile(r'&(\S+?);')
+        preprocessor = HTMLPreProcessor(None, False)
        for path in self.iterator.spine:
            html = open(path, 'rb').read().decode('utf-8', 'replace')
-            html = ent_pat.sub(convert_entities, html)
+            html = preprocessor(html, get_preprocess_html=True)
            text.append(html)
        self.preview.setPlainText('\n---\n'.join(text))