ODT Input: Speed up conversion of ODT files that define huge amounts of redundant style information. Fixes #777468 (Conversion from ODT to EPUB extremely slow)

2025-10-29 17:52:28 -04:00 · 2011-05-04 18:56:07 -06:00 · 2011-05-04 18:56:07 -06:00 · e15ee70a1d
commit e15ee70a1d
parent 36ba0bd52a
2 changed files with 62 additions and 7 deletions
--- a/src/calibre/ebooks/odt/input.py
+++ b/src/calibre/ebooks/odt/input.py
@@ -7,6 +7,8 @@ __docformat__ = 'restructuredtext en'
 Convert an ODT file into a Open Ebook
 '''
 import os
+
+from lxml import etree
 from odf.odf2xhtml import ODF2XHTML

 from calibre import CurrentDir, walk
@@ -23,7 +25,48 @@ class Extract(ODF2XHTML):
                with open(name, 'wb') as f:
                    f.write(data)

-    def __call__(self, stream, odir):
+    def filter_css(self, html, log):
+        root = etree.fromstring(html)
+        style = root.xpath('//*[local-name() = "style" and @type="text/css"]')
+        if style:
+            style = style[0]
+            css = style.text
+            if css:
+                style.text, sel_map = self.do_filter_css(css)
+                for x in root.xpath('//*[@class]'):
+                    extra = []
+                    orig = x.get('class')
+                    for cls in orig.split():
+                        extra.extend(sel_map.get(cls, []))
+                    if extra:
+                        x.set('class', orig + ' ' + ' '.join(extra))
+                html = etree.tostring(root, encoding='utf-8',
+                        xml_declaration=True)
+        return html
+
+    def do_filter_css(self, css):
+        from cssutils import parseString
+        from cssutils.css import CSSRule
+        sheet = parseString(css)
+        rules = list(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
+        sel_map = {}
+        count = 0
+        for r in rules:
+            # Check if we have only class selectors for this rule
+            nc = [x for x in r.selectorList if not
+                    x.selectorText.startswith('.')]
+            if len(r.selectorList) > 1 and not nc:
+                replace_name = 'c_odt%d'%count
+                count += 1
+                for sel in r.selectorList:
+                    s = sel.selectorText[1:]
+                    if s not in sel_map:
+                        sel_map[s] = []
+                    sel_map[s].append(replace_name)
+                r.selectorText = '.'+replace_name
+        return sheet.cssText, sel_map
+
+    def __call__(self, stream, odir, log):
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.metadata.opf2 import OPFCreator
@@ -32,13 +75,17 @@ class Extract(ODF2XHTML):
        if not os.path.exists(odir):
            os.makedirs(odir)
        with CurrentDir(odir):
-            print 'Extracting ODT file...'
+            log('Extracting ODT file...')
            html = self.odf2xhtml(stream)
            # A blanket img specification like this causes problems
-            # with EPUB output as the contaiing element often has
+            # with EPUB output as the containing element often has
            # an absolute height and width set that is larger than
            # the available screen real estate
            html = html.replace('img { width: 100%; height: 100%; }', '')
+            try:
+                html = self.filter_css(html, log)
+            except:
+                log.exception('Failed to filter CSS, conversion may be slow')
            with open('index.xhtml', 'wb') as f:
                f.write(html.encode('utf-8'))
            zf = ZipFile(stream, 'r')
@@ -67,7 +114,7 @@ class ODTInput(InputFormatPlugin):

    def convert(self, stream, options, file_ext, log,
                accelerators):
-        return Extract()(stream, '.')
+        return Extract()(stream, '.', log)

    def postprocess_book(self, oeb, opts, log):
        # Fix <p><div> constructs as the asinine epubchecker complains
--- a/src/odf/odf2xhtml.py
+++ b/src/odf/odf2xhtml.py
@@ -841,11 +841,19 @@ ol, ul { padding-left: 2em; }
            self.styledict[name] = styles
        # Write the styles to HTML
        self.writeout(self.default_styles)
+        # Changed by Kovid to not write out endless copies of the same style
+        css_styles = {}
        for name in self.stylestack:
            styles = self.styledict.get(name)
-            css2 = self.cs.convert_styles(styles)
-            self.writeout("%s {\n" % name)
-            for style, val in css2.items():
+            css2 = tuple(self.cs.convert_styles(styles).iteritems())
+            if css2 in css_styles:
+                css_styles[css2].append(name)
+            else:
+                css_styles[css2] = [name]
+
+        for css2, names in css_styles.iteritems():
+            self.writeout("%s {\n" % ', '.join(names))
+            for style, val in css2:
                self.writeout("\t%s: %s;\n" % (style, val) )
            self.writeout("}\n")