Add a --preprocess-html option

2025-07-09 03:04:10 -04:00 · 2009-05-02 12:40:29 -07:00 · 2009-05-02 12:40:29 -07:00 · dc0e0f26a1
commit dc0e0f26a1
parent 8be2541738
7 changed files with 41 additions and 12 deletions
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@@ -149,6 +149,18 @@ class InputFormatPlugin(Plugin):
        '''
        raise NotImplementedError()
    def preprocess_html(self, html):
        '''
        Hook for input-format specific clean-up of raw HTML, such as removing
        hard line breaks.

        The conversion pipeline passes this method to its HTML preprocessor,
        which calls it on HTML before it is parsed, but only when the
        ``preprocess_html`` conversion option is enabled. This default
        implementation returns the HTML unchanged; sub-classes override it to
        do actual work.

        :param html: A unicode string
        :return: A unicode string
        '''
        return html
    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        This method must be implemented in sub-classes. It must return
--- a/src/calibre/ebooks/conversion/cli.py
+++ b/src/calibre/ebooks/conversion/cli.py
@@ -129,6 +129,7 @@ def add_pipeline_options(parser, plumber):
                      'dont_split_on_page_breaks', 'chapter', 'chapter_mark',
                      'prefer_metadata_cover', 'remove_first_image',
                      'insert_metadata', 'page_breaks_before',
                      'preprocess_html',
                  ]
                  ),
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@@ -312,6 +312,14 @@ OptionRecommendation(name='insert_metadata',
            )
        ),
 OptionRecommendation(name='preprocess_html',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Attempt to detect and correct hard line breaks and other '
            'problems in the source file. This may make things worse, so use '
            'with care.'
            )
        ),
 OptionRecommendation(name='read_metadata_from_opf',
            recommended_value=None, level=OptionRecommendation.LOW,
@@ -580,7 +588,8 @@ OptionRecommendation(name='list_recipes',
            self.log('Debug input called, aborting the rest of the pipeline.')
            return
        if not hasattr(self.oeb, 'manifest'):
-            self.oeb = create_oebbook(self.log, self.oeb, self.opts)
+            self.oeb = create_oebbook(self.log, self.oeb, self.opts,
                    self.input_plugin)
        pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter)
        pr(0., _('Running transforms on ebook...'))
@@ -652,12 +661,13 @@ OptionRecommendation(name='list_recipes',
                self.opts, self.log)
        self.ui_reporter(1.)
-def create_oebbook(log, path_or_stream, opts, reader=None):
+def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None):
    '''
    Create an OEBBook.
    '''
    from calibre.ebooks.oeb.base import OEBBook
-    html_preprocessor = HTMLPreProcessor()
+    html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html,
            opts.preprocess_html)
    oeb = OEBBook(log, html_preprocessor=html_preprocessor,
            pretty_print=opts.pretty_print)
    # Read OEB Book into OEBBook
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@@ -26,16 +26,16 @@ def sanitize_head(match):
 def chap_head(match):
    chap = match.group('chap')
    title = match.group('title')
-    if not title: 
+    if not title:
               return '<h1>'+chap+'</h1><br/>\n'
-    else: 
+    else:
               return '<h1>'+chap+'<br/>\n'+title+'</h1><br/>\n'
 def wrap_lines(match):
    ital = match.group('ital')
-    if not ital: 
+    if not ital:
               return ' '
-    else: 
+    else:
               return ital+' '
 def line_length(raw, percent):
@@ -106,7 +106,7 @@ class HTMLPreProcessor(object):
                  (re.compile(u'¨\s*(<br.*?>)*\s*I', re.UNICODE), lambda match: u'Ï'),
                  (re.compile(u'¨\s*(<br.*?>)*\s*a', re.UNICODE), lambda match: u'ä'),
                  (re.compile(u'¨\s*(<br.*?>)*\s*A', re.UNICODE), lambda match: u'Ä'),
-                  
+
                  # Remove page links
                  (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
                  # Remove <hr> tags
@@ -151,6 +151,9 @@ class HTMLPreProcessor(object):
                     (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                      lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
                     ]
    def __init__(self, input_plugin_preprocess, plugin_preprocess):
        '''
        Store the input plugin hook and the switch that enables it.

        :param input_plugin_preprocess: Callable that is given the HTML and
            returns the processed HTML. ``create_oebbook`` passes the input
            plugin's ``preprocess_html`` method here.
        :param plugin_preprocess: Despite its name this is not a callable but
            the on/off switch: ``input_plugin_preprocess`` is applied only
            when this value is truthy. ``create_oebbook`` passes
            ``opts.preprocess_html`` here.
        '''
        self.input_plugin_preprocess = input_plugin_preprocess
        self.plugin_preprocess = plugin_preprocess
    def is_baen(self, src):
        return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
@@ -175,7 +178,7 @@ class HTMLPreProcessor(object):
                # Un wrap using punctuation
                (re.compile(r'(?<=.{%i}[a-z,;:-IA])\s*(?P<ital></(i|b|u)>)?\s*(<p.*?>)\s*(?=(<(i|b|u)>)?[\w\d])' % line_length(html, .4), re.UNICODE), wrap_lines),
            ]
-            
+
            rules = self.PDFTOHTML + line_length_rules
        else:
            rules = []
@@ -192,5 +195,8 @@ class HTMLPreProcessor(object):
        html = XMLDECL_RE.sub('', html)
        if self.plugin_preprocess:
            html = self.input_plugin_preprocess(html)
        return html
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -288,7 +288,7 @@ class HTMLInput(InputFormatPlugin):
            return opfpath
        from calibre.ebooks.conversion.plumber import create_oebbook
-        oeb = create_oebbook(log, opfpath, opts)
+        oeb = create_oebbook(log, opfpath, opts, self)
        from calibre.ebooks.oeb.transforms.package import Package
        Package(os.getcwdu())(oeb, opts)
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -19,6 +19,6 @@ class LITInput(InputFormatPlugin):
                accelerators):
        from calibre.ebooks.lit.reader import LitReader
        from calibre.ebooks.conversion.plumber import create_oebbook
-        return create_oebbook(log, stream, options, reader=LitReader)
+        return create_oebbook(log, stream, options, self, reader=LitReader)
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -1506,7 +1506,7 @@ class OEBBook(object):
    COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
    def __init__(self, logger,
-            html_preprocessor=HTMLPreProcessor(),
+            html_preprocessor,
            css_preprocessor=CSSPreProcessor(),
            encoding='utf-8', pretty_print=False):
        """Create empty book.  Arguments: