ODT Input: Speed up conversion of ODT files that define huge amounts of redundant style information. Fixes #777468 (Conversion from ODT to EPUB extremely slow)

2025-07-09 03:04:10 -04:00 · 2011-05-04 18:56:07 -06:00 · 2011-05-04 18:56:07 -06:00 · e15ee70a1d
commit e15ee70a1d
parent 36ba0bd52a
2 changed files with 62 additions and 7 deletions
--- a/src/calibre/ebooks/odt/input.py
+++ b/src/calibre/ebooks/odt/input.py
@ -7,6 +7,8 @@ __docformat__ = 'restructuredtext en'
 Convert an ODT file into a Open Ebook
 '''
 import os
 from lxml import etree
 from odf.odf2xhtml import ODF2XHTML
 from calibre import CurrentDir, walk
@ -23,7 +25,48 @@ class Extract(ODF2XHTML):
                with open(name, 'wb') as f:
                    f.write(data)
-    def __call__(self, stream, odir):
+    def filter_css(self, html, log):
        root = etree.fromstring(html)
        style = root.xpath('//*[local-name() = "style" and @type="text/css"]')
        if style:
            style = style[0]
            css = style.text
            if css:
                style.text, sel_map = self.do_filter_css(css)
                for x in root.xpath('//*[@class]'):
                    extra = []
                    orig = x.get('class')
                    for cls in orig.split():
                        extra.extend(sel_map.get(cls, []))
                    if extra:
                        x.set('class', orig + ' ' + ' '.join(extra))
                html = etree.tostring(root, encoding='utf-8',
                        xml_declaration=True)
        return html
    def do_filter_css(self, css):
        '''
        Merge style rules whose selector list is made up only of classes.

        Each style rule that has more than one selector, all of them bare
        class selectors (``.name``), has its whole selector list replaced by a
        single generated class ``.c_odtN``. Rules with any other kind of
        selector are left untouched.

        :param css: The stylesheet text to process.
        :return: A tuple ``(css_text, sel_map)``. ``css_text`` is the
                 serialized, rewritten stylesheet. ``sel_map`` maps each
                 original class name (without the leading dot) to the list of
                 generated class names that replaced rules it was part of.
        '''
        import re
        from cssutils import parseString
        from cssutils.css import CSSRule

        # Only a bare ``.name`` can safely be swapped for a generated class.
        # Compound selectors such as ``.a.b``, ``.a p`` or ``.a:hover`` also
        # start with a dot, but no single class token can stand in for them,
        # so rules containing one are not merged.
        simple_class = re.compile(r'\.[-\w]+$', re.UNICODE)

        sheet = parseString(css)
        rules = list(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
        sel_map = {}
        count = 0
        for rule in rules:
            # Capture the selector texts before the rule is rewritten below.
            selectors = [sel.selectorText for sel in rule.selectorList]
            if len(selectors) < 2:
                continue
            if not all(simple_class.match(text) for text in selectors):
                continue
            replace_name = 'c_odt%d' % count
            count += 1
            for text in selectors:
                # Drop the leading dot to get the class name as it appears
                # in an element's ``class`` attribute.
                sel_map.setdefault(text[1:], []).append(replace_name)
            rule.selectorText = '.' + replace_name
        return sheet.cssText, sel_map
    def __call__(self, stream, odir, log):
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.metadata.opf2 import OPFCreator
@ -32,13 +75,17 @@ class Extract(ODF2XHTML):
        if not os.path.exists(odir):
            os.makedirs(odir)
        with CurrentDir(odir):
-            print 'Extracting ODT file...'
+            log('Extracting ODT file...')
            html = self.odf2xhtml(stream)
            # A blanket img specification like this causes problems
-            # with EPUB output as the contaiing element often has
+            # with EPUB output as the containing element often has
            # an absolute height and width set that is larger than
            # the available screen real estate
            html = html.replace('img { width: 100%; height: 100%; }', '')
            try:
                html = self.filter_css(html, log)
            except:
                log.exception('Failed to filter CSS, conversion may be slow')
            with open('index.xhtml', 'wb') as f:
                f.write(html.encode('utf-8'))
            zf = ZipFile(stream, 'r')
@ -67,7 +114,7 @@ class ODTInput(InputFormatPlugin):
    def convert(self, stream, options, file_ext, log,
                accelerators):
-        return Extract()(stream, '.')
+        return Extract()(stream, '.', log)
    def postprocess_book(self, oeb, opts, log):
        # Fix <p><div> constructs as the asinine epubchecker complains
--- a/src/odf/odf2xhtml.py
+++ b/src/odf/odf2xhtml.py
@ -841,11 +841,19 @@ ol, ul { padding-left: 2em; }
            self.styledict[name] = styles
        # Write the styles to HTML
        self.writeout(self.default_styles)
        # Changed by Kovid to not write out endless copies of the same style
        css_styles = {}
        for name in self.stylestack:
            styles = self.styledict.get(name)
-            css2 = self.cs.convert_styles(styles)
+            css2 = tuple(self.cs.convert_styles(styles).iteritems())
-            self.writeout("%s {\n" % name)
+            if css2 in css_styles:
-            for style, val in css2.items():
+                css_styles[css2].append(name)
            else:
                css_styles[css2] = [name]
        for css2, names in css_styles.iteritems():
            self.writeout("%s {\n" % ', '.join(names))
            for style, val in css2:
                self.writeout("\t%s: %s;\n" % (style, val) )
            self.writeout("}\n")