From 76de6aef24f99929957676fde5e98f86f209345b Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Sun, 18 Jan 2009 21:44:43 -0500 Subject: [PATCH] Use etree.html to handle HTML entities and not UTF-8 encodings --- src/calibre/ebooks/oeb/base.py | 20 +++++++------------- src/calibre/ebooks/oeb/transforms/flatcss.py | 8 ++++---- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 4248657e23..a903136610 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -15,10 +15,10 @@ from urlparse import urldefrag, urlparse, urlunparse from urllib import unquote as urlunquote import logging import re -import htmlentitydefs import uuid import copy from lxml import etree +from lxml import html from calibre import LoggingInterface XML_PARSER = etree.XMLParser(recover=True) @@ -67,14 +67,6 @@ OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME]) MS_COVER_TYPE = 'other.ms-coverimage-standard' -recode = lambda s: s.decode('iso-8859-1').encode('ascii', 'xmlcharrefreplace') -ENTITYDEFS = dict((k, recode(v)) for k, v in htmlentitydefs.entitydefs.items()) -del ENTITYDEFS['lt'] -del ENTITYDEFS['gt'] -del ENTITYDEFS['quot'] -del ENTITYDEFS['amp'] -del recode - def element(parent, *args, **kwargs): if parent is not None: @@ -298,7 +290,6 @@ class Metadata(object): class Manifest(object): class Item(object): - ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);') NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)') def __init__(self, id, href, media_type, @@ -317,9 +308,12 @@ class Manifest(object): % (self.id, self.href, self.media_type) def _force_xhtml(self, data): - repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) - data = self.ENTITY_RE.sub(repl, data) - data = etree.fromstring(data, parser=XML_PARSER) + try: + data = etree.fromstring(data, parser=XML_PARSER) + except etree.XMLSyntaxError: + data = html.fromstring(data, parser=XML_PARSER) + data = etree.tostring(data, encoding=unicode) + data = etree.fromstring(data, parser=XML_PARSER) if namespace(data.tag) != XHTML_NS: data.attrib['xmlns'] = XHTML_NS data = etree.tostring(data) diff --git a/src/calibre/ebooks/oeb/transforms/flatcss.py b/src/calibre/ebooks/oeb/transforms/flatcss.py index 375003c1a5..4877b28f51 100644 --- a/src/calibre/ebooks/oeb/transforms/flatcss.py +++ b/src/calibre/ebooks/oeb/transforms/flatcss.py @@ -161,11 +161,11 @@ class CSSFlattener(object): if 'bgcolor' in node.attrib: cssdict['background-color'] = node.attrib['bgcolor'] del node.attrib['bgcolor'] + if 'font-size' in cssdict or tag == 'body': + fsize = self.fmap[style['font-size']] + cssdict['font-size'] = "%0.5fem" % (fsize / psize) + psize = fsize if cssdict: - if 'font-size' in cssdict or tag == 'body': - fsize = self.fmap[style['font-size']] - cssdict['font-size'] = "%0.5fem" % (fsize / psize) - psize = fsize if self.lineh and self.fbase and tag != 'body': self.clean_edges(cssdict, style, psize) margin = style['margin-left']