ODT Input: Speed up conversion of ODT files that define huge amounts of redundant style information. Fixes #777468 (Conversion from ODT to EPUB extremely slow)

This commit is contained in:
Kovid Goyal 2011-05-04 18:56:07 -06:00
parent 36ba0bd52a
commit e15ee70a1d
2 changed files with 62 additions and 7 deletions

View File

@ -7,6 +7,8 @@ __docformat__ = 'restructuredtext en'
Convert an ODT file into a Open Ebook Convert an ODT file into a Open Ebook
''' '''
import os import os
from lxml import etree
from odf.odf2xhtml import ODF2XHTML from odf.odf2xhtml import ODF2XHTML
from calibre import CurrentDir, walk from calibre import CurrentDir, walk
@ -23,7 +25,48 @@ class Extract(ODF2XHTML):
with open(name, 'wb') as f: with open(name, 'wb') as f:
f.write(data) f.write(data)
def filter_css(self, html, log):
    '''
    Collapse redundant class rules in the XHTML generated from an ODT file.

    The text of the first ``<style type="text/css">`` element is rewritten
    by :meth:`do_filter_css`. Every element whose ``class`` attribute names
    a class that was merged into a replacement class gets the replacement
    class name(s) appended, so the merged rules still apply to it.

    :param html: The XHTML document to filter.
    :param log: Conversion log. Accepted for the caller's benefit; not used
                by this method.
    :return: The filtered document as text (never a byte string), or
             ``html`` unchanged when there is no text/css style element or
             that element is empty.
    '''
    root = etree.fromstring(html)
    styles = root.xpath('//*[local-name() = "style" and @type="text/css"]')
    if not styles:
        return html
    style = styles[0]
    css = style.text
    if not css:
        return html

    style.text, sel_map = self.do_filter_css(css)
    for elem in root.xpath('//*[@class]'):
        orig = elem.get('class')
        extra = []
        for cls in orig.split():
            for name in sel_map.get(cls, ()):
                # Several original classes on one element may map to the
                # same replacement; add each replacement only once.
                if name not in extra:
                    extra.append(name)
        if extra:
            elem.set('class', orig + ' ' + ' '.join(extra))

    raw = etree.tostring(root, encoding='utf-8', xml_declaration=True)
    # tostring() with a named encoding returns UTF-8 *bytes*. The caller
    # goes on to call html.encode('utf-8'), which fails on a byte string
    # containing non-ASCII data, so hand back text instead.
    return raw.decode('utf-8')
def do_filter_css(self, css):
    '''
    Merge CSS rules whose selector list is made up solely of plain class
    selectors.

    Each style rule of the form ``.a, .b, .c { ... }`` has its selector
    list replaced by a single generated class ``.c_odtN``. Rules with a
    single selector, or with any selector that is not a lone class name
    (type selectors, ``.a .b``, ``.a.b``, ``.a:hover`` and so on), are left
    untouched.

    :param css: The stylesheet source text.
    :return: A ``(css_text, sel_map)`` tuple. ``css_text`` is the
             serialized stylesheet as text. ``sel_map`` maps each original
             class name (without the leading dot) to the list of generated
             class names that now carry its rules.
    '''
    import re
    from cssutils import parseString
    from cssutils.css import CSSRule

    # A lone class selector such as ``.P1``. Merely starting with a dot is
    # not enough: ``.a .b`` or ``.a:hover`` also start with one, and
    # renaming those would drop the styling, since no single class token
    # on an element could ever equal "a .b".
    simple_class = re.compile(r'\.[^\s.#:\[\]()>+~*,]+$')

    sheet = parseString(css)
    rules = list(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
    sel_map = {}
    count = 0
    for rule in rules:
        selectors = [sel.selectorText for sel in rule.selectorList]
        if len(selectors) < 2:
            continue
        if not all(simple_class.match(text) for text in selectors):
            continue
        replace_name = 'c_odt%d' % count
        count += 1
        for text in selectors:
            # Strip the leading '.' to get the bare class name
            sel_map.setdefault(text[1:], []).append(replace_name)
        rule.selectorText = '.' + replace_name

    css_text = sheet.cssText
    # cssutils serializes the sheet to an encoded byte string. Return text
    # so the result can be assigned to an lxml element's .text even when
    # the stylesheet contains non-ASCII characters.
    if isinstance(css_text, bytes):
        css_text = css_text.decode(sheet.encoding or 'utf-8')
    return css_text, sel_map
def __call__(self, stream, odir, log):
from calibre.utils.zipfile import ZipFile from calibre.utils.zipfile import ZipFile
from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
@ -32,13 +75,17 @@ class Extract(ODF2XHTML):
if not os.path.exists(odir): if not os.path.exists(odir):
os.makedirs(odir) os.makedirs(odir)
with CurrentDir(odir): with CurrentDir(odir):
print 'Extracting ODT file...' log('Extracting ODT file...')
html = self.odf2xhtml(stream) html = self.odf2xhtml(stream)
# A blanket img specification like this causes problems # A blanket img specification like this causes problems
# with EPUB output as the contaiing element often has # with EPUB output as the containing element often has
# an absolute height and width set that is larger than # an absolute height and width set that is larger than
# the available screen real estate # the available screen real estate
html = html.replace('img { width: 100%; height: 100%; }', '') html = html.replace('img { width: 100%; height: 100%; }', '')
try:
html = self.filter_css(html, log)
except:
log.exception('Failed to filter CSS, conversion may be slow')
with open('index.xhtml', 'wb') as f: with open('index.xhtml', 'wb') as f:
f.write(html.encode('utf-8')) f.write(html.encode('utf-8'))
zf = ZipFile(stream, 'r') zf = ZipFile(stream, 'r')
@ -67,7 +114,7 @@ class ODTInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
return Extract()(stream, '.') return Extract()(stream, '.', log)
def postprocess_book(self, oeb, opts, log): def postprocess_book(self, oeb, opts, log):
# Fix <p><div> constructs as the asinine epubchecker complains # Fix <p><div> constructs as the asinine epubchecker complains

View File

@ -841,11 +841,19 @@ ol, ul { padding-left: 2em; }
self.styledict[name] = styles self.styledict[name] = styles
# Write the styles to HTML # Write the styles to HTML
self.writeout(self.default_styles) self.writeout(self.default_styles)
# Changed by Kovid to not write out endless copies of the same style
css_styles = {}
for name in self.stylestack: for name in self.stylestack:
styles = self.styledict.get(name) styles = self.styledict.get(name)
css2 = self.cs.convert_styles(styles) css2 = tuple(self.cs.convert_styles(styles).iteritems())
self.writeout("%s {\n" % name) if css2 in css_styles:
for style, val in css2.items(): css_styles[css2].append(name)
else:
css_styles[css2] = [name]
for css2, names in css_styles.iteritems():
self.writeout("%s {\n" % ', '.join(names))
for style, val in css2:
self.writeout("\t%s: %s;\n" % (style, val) ) self.writeout("\t%s: %s;\n" % (style, val) )
self.writeout("}\n") self.writeout("}\n")