Use lxml to handle HTML entities and <meta/>-specified encodings

This commit is contained in:
Marshall T. Vandegrift 2009-01-19 22:57:18 -05:00
parent c198458f65
commit 50ea39227b

View File

@@ -15,10 +15,10 @@ from urlparse import urldefrag, urlparse, urlunparse
from urllib import unquote as urlunquote from urllib import unquote as urlunquote
import logging import logging
import re import re
import htmlentitydefs
import uuid import uuid
import copy import copy
from lxml import etree from lxml import etree
from lxml import html
from calibre import LoggingInterface from calibre import LoggingInterface
XML_PARSER = etree.XMLParser(recover=True) XML_PARSER = etree.XMLParser(recover=True)
@@ -67,14 +67,6 @@ OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME])
MS_COVER_TYPE = 'other.ms-coverimage-standard' MS_COVER_TYPE = 'other.ms-coverimage-standard'
recode = lambda s: s.decode('iso-8859-1').encode('ascii', 'xmlcharrefreplace')
ENTITYDEFS = dict((k, recode(v)) for k, v in htmlentitydefs.entitydefs.items())
del ENTITYDEFS['lt']
del ENTITYDEFS['gt']
del ENTITYDEFS['quot']
del ENTITYDEFS['amp']
del recode
def element(parent, *args, **kwargs): def element(parent, *args, **kwargs):
if parent is not None: if parent is not None:
@@ -298,7 +290,6 @@ class Metadata(object):
class Manifest(object): class Manifest(object):
class Item(object): class Item(object):
ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);')
NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)') NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)')
def __init__(self, id, href, media_type, def __init__(self, id, href, media_type,
@@ -317,9 +308,12 @@ class Manifest(object):
% (self.id, self.href, self.media_type) % (self.id, self.href, self.media_type)
def _force_xhtml(self, data): def _force_xhtml(self, data):
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) try:
data = self.ENTITY_RE.sub(repl, data) data = etree.fromstring(data, parser=XML_PARSER)
data = etree.fromstring(data, parser=XML_PARSER) except etree.XMLSyntaxError:
data = html.fromstring(data, parser=XML_PARSER)
data = etree.tostring(data, encoding=unicode)
data = etree.fromstring(data, parser=XML_PARSER)
if namespace(data.tag) != XHTML_NS: if namespace(data.tag) != XHTML_NS:
data.attrib['xmlns'] = XHTML_NS data.attrib['xmlns'] = XHTML_NS
data = etree.tostring(data) data = etree.tostring(data)