mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
ODT Input: Speed up conversion of ODT files that define huge amounts of redundant style information. Fixes #777468 (Conversion from ODT to EPUB extremely slow)
This commit is contained in:
parent
36ba0bd52a
commit
e15ee70a1d
@ -7,6 +7,8 @@ __docformat__ = 'restructuredtext en'
|
||||
Convert an ODT file into an Open Ebook
|
||||
'''
|
||||
import os
|
||||
|
||||
from lxml import etree
|
||||
from odf.odf2xhtml import ODF2XHTML
|
||||
|
||||
from calibre import CurrentDir, walk
|
||||
@ -23,7 +25,48 @@ class Extract(ODF2XHTML):
|
||||
with open(name, 'wb') as f:
|
||||
f.write(data)
|
||||
|
||||
def __call__(self, stream, odir):
|
||||
def filter_css(self, html, log):
    '''
    Collapse redundant class rules in the document's embedded stylesheet.

    Parses ``html``, runs the text of the first ``<style type="text/css">``
    element through :meth:`do_filter_css`, and appends the replacement
    class names it reports to the ``class`` attribute of every element
    that uses one of the merged classes.

    :param html: The XHTML markup to process.
    :param log: Logger supplied by the caller; not used by this method.
    :return: The re-serialized markup, as the same string type (bytes or
             text) that was passed in. ``html`` itself is returned
             unchanged when there is no non-empty stylesheet to filter.
    '''
    root = etree.fromstring(html)
    styles = root.xpath('//*[local-name() = "style" and @type="text/css"]')
    if not styles:
        return html
    style = styles[0]
    css = style.text
    if not css:
        return html

    style.text, sel_map = self.do_filter_css(css)
    for elem in root.xpath('//*[@class]'):
        orig = elem.get('class')
        extra = []
        for cls in orig.split():
            extra.extend(sel_map.get(cls, []))
        if extra:
            # Keep the original classes and add the merged ones, so rules
            # that were not rewritten continue to match.
            elem.set('class', orig + ' ' + ' '.join(extra))

    raw = etree.tostring(root, encoding='utf-8', xml_declaration=True)
    # tostring() with a named encoding yields a byte string. Hand text back
    # to callers that passed text in, so a later .encode('utf-8') on the
    # result does not trigger an implicit ASCII decode of non-ASCII bytes.
    if not isinstance(html, bytes):
        raw = raw.decode('utf-8')
    return raw
|
||||
|
||||
def do_filter_css(self, css):
    '''
    Replace grouped class selectors with a single generated class.

    Every style rule that has more than one selector, all of whose
    selector texts start with ``.``, gets its selector list replaced by
    one new class named ``c_odt<N>``.

    :param css: The stylesheet text to rewrite.
    :return: A ``(css_text, sel_map)`` tuple. ``css_text`` is the
             serialized rewritten sheet. ``sel_map`` maps each original
             selector text (leading ``.`` removed) to the list of
             generated class names that now carry its rules.
    '''
    from cssutils import parseString
    from cssutils.css import CSSRule

    sheet = parseString(css)
    sel_map = {}
    merged = 0
    # Snapshot the style rules up front, since they are modified below.
    for rule in list(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)):
        selectors = rule.selectorList
        if len(selectors) <= 1:
            continue
        # Only rewrite rules made up purely of selectors that begin with
        # a class.
        if any(not sel.selectorText.startswith('.') for sel in selectors):
            continue
        replace_name = 'c_odt%d' % merged
        merged += 1
        for sel in selectors:
            sel_map.setdefault(sel.selectorText[1:], []).append(replace_name)
        rule.selectorText = '.' + replace_name
    return sheet.cssText, sel_map
|
||||
|
||||
def __call__(self, stream, odir, log):
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
@ -32,13 +75,17 @@ class Extract(ODF2XHTML):
|
||||
if not os.path.exists(odir):
|
||||
os.makedirs(odir)
|
||||
with CurrentDir(odir):
|
||||
print 'Extracting ODT file...'
|
||||
log('Extracting ODT file...')
|
||||
html = self.odf2xhtml(stream)
|
||||
# A blanket img specification like this causes problems
|
||||
# with EPUB output as the contaiing element often has
|
||||
# with EPUB output as the containing element often has
|
||||
# an absolute height and width set that is larger than
|
||||
# the available screen real estate
|
||||
html = html.replace('img { width: 100%; height: 100%; }', '')
|
||||
try:
|
||||
html = self.filter_css(html, log)
|
||||
except:
|
||||
log.exception('Failed to filter CSS, conversion may be slow')
|
||||
with open('index.xhtml', 'wb') as f:
|
||||
f.write(html.encode('utf-8'))
|
||||
zf = ZipFile(stream, 'r')
|
||||
@ -67,7 +114,7 @@ class ODTInput(InputFormatPlugin):
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
return Extract()(stream, '.')
|
||||
return Extract()(stream, '.', log)
|
||||
|
||||
def postprocess_book(self, oeb, opts, log):
|
||||
# Fix <p><div> constructs as the asinine epubchecker complains
|
||||
|
@ -841,11 +841,19 @@ ol, ul { padding-left: 2em; }
|
||||
self.styledict[name] = styles
|
||||
# Write the styles to HTML
|
||||
self.writeout(self.default_styles)
|
||||
# Changed by Kovid to not write out endless copies of the same style
|
||||
css_styles = {}
|
||||
for name in self.stylestack:
|
||||
styles = self.styledict.get(name)
|
||||
css2 = self.cs.convert_styles(styles)
|
||||
self.writeout("%s {\n" % name)
|
||||
for style, val in css2.items():
|
||||
css2 = tuple(self.cs.convert_styles(styles).iteritems())
|
||||
if css2 in css_styles:
|
||||
css_styles[css2].append(name)
|
||||
else:
|
||||
css_styles[css2] = [name]
|
||||
|
||||
for css2, names in css_styles.iteritems():
|
||||
self.writeout("%s {\n" % ', '.join(names))
|
||||
for style, val in css2:
|
||||
self.writeout("\t%s: %s;\n" % (style, val) )
|
||||
self.writeout("}\n")
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user