Use etree.html to handle HTML entities and not UTF-8 encodings

This commit is contained in:
Marshall T. Vandegrift 2009-01-18 21:44:43 -05:00
parent cba3bb55e4
commit 76de6aef24
2 changed files with 11 additions and 17 deletions

View File

@ -15,10 +15,10 @@ from urlparse import urldefrag, urlparse, urlunparse
from urllib import unquote as urlunquote
import logging
import re
import htmlentitydefs
import uuid
import copy
from lxml import etree
from lxml import html
from calibre import LoggingInterface
XML_PARSER = etree.XMLParser(recover=True)
@ -67,14 +67,6 @@ OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME])
MS_COVER_TYPE = 'other.ms-coverimage-standard'
recode = lambda s: s.decode('iso-8859-1').encode('ascii', 'xmlcharrefreplace')
ENTITYDEFS = dict((k, recode(v)) for k, v in htmlentitydefs.entitydefs.items())
del ENTITYDEFS['lt']
del ENTITYDEFS['gt']
del ENTITYDEFS['quot']
del ENTITYDEFS['amp']
del recode
def element(parent, *args, **kwargs):
if parent is not None:
@ -298,7 +290,6 @@ class Metadata(object):
class Manifest(object):
class Item(object):
ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);')
NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)')
def __init__(self, id, href, media_type,
@ -317,8 +308,11 @@ class Manifest(object):
% (self.id, self.href, self.media_type)
def _force_xhtml(self, data):
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
data = self.ENTITY_RE.sub(repl, data)
try:
data = etree.fromstring(data, parser=XML_PARSER)
except etree.XMLSyntaxError:
data = html.fromstring(data, parser=XML_PARSER)
data = etree.tostring(data, encoding=unicode)
data = etree.fromstring(data, parser=XML_PARSER)
if namespace(data.tag) != XHTML_NS:
data.attrib['xmlns'] = XHTML_NS

View File

@ -161,11 +161,11 @@ class CSSFlattener(object):
if 'bgcolor' in node.attrib:
cssdict['background-color'] = node.attrib['bgcolor']
del node.attrib['bgcolor']
if cssdict:
if 'font-size' in cssdict or tag == 'body':
fsize = self.fmap[style['font-size']]
cssdict['font-size'] = "%0.5fem" % (fsize / psize)
psize = fsize
if cssdict:
if self.lineh and self.fbase and tag != 'body':
self.clean_edges(cssdict, style, psize)
margin = style['margin-left']