ODT Input: More workarounds for the image positioning markup produced by newer versions of LibreOffice. Fixes #1063207 (odt to anything [alignment])

This commit is contained in:
Kovid Goyal 2012-10-08 13:36:39 +05:30
parent 9e33851777
commit 9b8c6f218e

View File

@ -6,15 +6,19 @@ __docformat__ = 'restructuredtext en'
''' '''
Convert an ODT file into a Open Ebook Convert an ODT file into a Open Ebook
''' '''
import os import os, logging
from lxml import etree from lxml import etree
from cssutils import CSSParser
from cssutils.css import CSSRule
from odf.odf2xhtml import ODF2XHTML from odf.odf2xhtml import ODF2XHTML
from odf.opendocument import load as odLoad from odf.opendocument import load as odLoad
from odf.draw import Frame as odFrame, Image as odImage from odf.draw import Frame as odFrame, Image as odImage
from odf.namespaces import TEXTNS as odTEXTNS from odf.namespaces import TEXTNS as odTEXTNS
from calibre import CurrentDir, walk from calibre import CurrentDir, walk
from calibre.ebooks.oeb.base import _css_logger
class Extract(ODF2XHTML): class Extract(ODF2XHTML):
@ -29,14 +33,14 @@ class Extract(ODF2XHTML):
def fix_markup(self, html, log): def fix_markup(self, html, log):
root = etree.fromstring(html) root = etree.fromstring(html)
self.epubify_markup(root, log)
self.filter_css(root, log) self.filter_css(root, log)
self.extract_css(root) self.extract_css(root, log)
self.epubify_markup(root, log)
html = etree.tostring(root, encoding='utf-8', html = etree.tostring(root, encoding='utf-8',
xml_declaration=True) xml_declaration=True)
return html return html
def extract_css(self, root): def extract_css(self, root, log):
ans = [] ans = []
for s in root.xpath('//*[local-name() = "style" and @type="text/css"]'): for s in root.xpath('//*[local-name() = "style" and @type="text/css"]'):
ans.append(s.text) ans.append(s.text)
@ -51,9 +55,21 @@ class Extract(ODF2XHTML):
etree.SubElement(head, ns+'link', {'type':'text/css', etree.SubElement(head, ns+'link', {'type':'text/css',
'rel':'stylesheet', 'href':'odfpy.css'}) 'rel':'stylesheet', 'href':'odfpy.css'})
with open('odfpy.css', 'wb') as f: css = u'\n\n'.join(ans)
f.write((u'\n\n'.join(ans)).encode('utf-8')) parser = CSSParser(loglevel=logging.WARNING,
log=_css_logger)
self.css = parser.parseString(css, validate=False)
with open('odfpy.css', 'wb') as f:
f.write(css.encode('utf-8'))
def get_css_for_class(self, cls):
    """Return the first style rule whose selector is exactly ``.cls``.

    The lookup runs over ``self.css``, the parsed stylesheet that
    ``extract_css`` stores on the instance. Only a bare single-class
    selector matches, because the comparison is an exact string match on
    the selector text; selectors such as ``div.foo`` or ``.foo .bar`` are
    not considered.

    :param cls: A single CSS class name, without the leading dot.
    :return: The matching style rule, or ``None`` when ``cls`` is empty,
        when no stylesheet has been parsed yet, or when no rule matches.
    """
    if not cls:
        return None
    # self.css is only set once extract_css() has run. Treat a missing
    # stylesheet as "no rule for this class" rather than raising
    # AttributeError in the middle of markup fix-ups.
    sheet = getattr(self, 'css', None)
    if sheet is None:
        return None
    wanted = '.' + cls
    for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
        for sel in rule.selectorList:
            if sel.selectorText == wanted:
                return rule
    return None
def epubify_markup(self, root, log): def epubify_markup(self, root, log):
from calibre.ebooks.oeb.base import XPath, XHTML from calibre.ebooks.oeb.base import XPath, XHTML
@ -84,16 +100,54 @@ class Extract(ODF2XHTML):
div.attrib['style'] = style div.attrib['style'] = style
img.attrib['style'] = 'max-width: 100%; max-height: 100%' img.attrib['style'] = 'max-width: 100%; max-height: 100%'
# A div/div/img construct causes text-align:center to not work in ADE # Handle anchored images. The default markup + CSS produced by
# so set the display of the second div to inline. This should have no # odf2xhtml works with WebKit but not with ADE. So we convert the
# effect (apart from minor vspace issues) in a compliant HTML renderer # common cases of left/right/center aligned block images to work on
# but it fixes the centering of the image via a text-align:center on # both webkit and ADE. We detect the case of setting the side margins
# the first div in ADE # to auto and map it to an appropriate text-align directive, which
# works in both WebKit and ADE.
# https://bugs.launchpad.net/bugs/1063207
# https://bugs.launchpad.net/calibre/+bug/859343
imgpath = XPath('descendant::h:div/h:div/h:img') imgpath = XPath('descendant::h:div/h:div/h:img')
for img in imgpath(root): for img in imgpath(root):
div2 = img.getparent() div2 = img.getparent()
div1 = div2.getparent() div1 = div2.getparent()
if len(div1) == len(div2) == 1: if (len(div1), len(div2)) != (1, 1): continue
cls = div1.get('class', '')
first_rules = filter(None, [self.get_css_for_class(x) for x in
cls.split()])
has_align = False
for r in first_rules:
if r.style.getProperty(u'text-align') is not None:
has_align = True
ml = mr = None
if not has_align:
aval = None
cls = div2.get(u'class', u'')
rules = filter(None, [self.get_css_for_class(x) for x in
cls.split()])
for r in rules:
ml = r.style.getPropertyCSSValue(u'margin-left') or ml
mr = r.style.getPropertyCSSValue(u'margin-right') or mr
ml = getattr(ml, 'value', None)
mr = getattr(mr, 'value', None)
if ml == mr == u'auto':
aval = u'center'
elif ml == u'auto' and mr != u'auto':
aval = 'right'
elif ml != u'auto' and mr == u'auto':
aval = 'left'
if aval is not None:
style = div1.attrib.get('style', '').strip()
if style and not style.endswith(';'):
style = style + ';'
style += 'text-align:%s'%aval
has_align = True
div1.attrib['style'] = style
if has_align:
# This is needed for ADE, without it the text-align has no
# effect
style = div2.attrib['style'] style = div2.attrib['style']
div2.attrib['style'] = 'display:inline;'+style div2.attrib['style'] = 'display:inline;'+style