mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Use lxml.html to handle HTML entities and non-UTF-8 encodings
This commit is contained in:
parent
cba3bb55e4
commit
76de6aef24
@ -15,10 +15,10 @@ from urlparse import urldefrag, urlparse, urlunparse
|
|||||||
from urllib import unquote as urlunquote
|
from urllib import unquote as urlunquote
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import htmlentitydefs
|
|
||||||
import uuid
|
import uuid
|
||||||
import copy
|
import copy
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
from lxml import html
|
||||||
from calibre import LoggingInterface
|
from calibre import LoggingInterface
|
||||||
|
|
||||||
XML_PARSER = etree.XMLParser(recover=True)
|
XML_PARSER = etree.XMLParser(recover=True)
|
||||||
@ -67,14 +67,6 @@ OEB_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME])
|
|||||||
|
|
||||||
MS_COVER_TYPE = 'other.ms-coverimage-standard'
|
MS_COVER_TYPE = 'other.ms-coverimage-standard'
|
||||||
|
|
||||||
recode = lambda s: s.decode('iso-8859-1').encode('ascii', 'xmlcharrefreplace')
|
|
||||||
ENTITYDEFS = dict((k, recode(v)) for k, v in htmlentitydefs.entitydefs.items())
|
|
||||||
del ENTITYDEFS['lt']
|
|
||||||
del ENTITYDEFS['gt']
|
|
||||||
del ENTITYDEFS['quot']
|
|
||||||
del ENTITYDEFS['amp']
|
|
||||||
del recode
|
|
||||||
|
|
||||||
|
|
||||||
def element(parent, *args, **kwargs):
|
def element(parent, *args, **kwargs):
|
||||||
if parent is not None:
|
if parent is not None:
|
||||||
@ -298,7 +290,6 @@ class Metadata(object):
|
|||||||
|
|
||||||
class Manifest(object):
|
class Manifest(object):
|
||||||
class Item(object):
|
class Item(object):
|
||||||
ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);')
|
|
||||||
NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)')
|
NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)')
|
||||||
|
|
||||||
def __init__(self, id, href, media_type,
|
def __init__(self, id, href, media_type,
|
||||||
@ -317,8 +308,11 @@ class Manifest(object):
|
|||||||
% (self.id, self.href, self.media_type)
|
% (self.id, self.href, self.media_type)
|
||||||
|
|
||||||
def _force_xhtml(self, data):
|
def _force_xhtml(self, data):
|
||||||
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
|
try:
|
||||||
data = self.ENTITY_RE.sub(repl, data)
|
data = etree.fromstring(data, parser=XML_PARSER)
|
||||||
|
except etree.XMLSyntaxError:
|
||||||
|
data = html.fromstring(data, parser=XML_PARSER)
|
||||||
|
data = etree.tostring(data, encoding=unicode)
|
||||||
data = etree.fromstring(data, parser=XML_PARSER)
|
data = etree.fromstring(data, parser=XML_PARSER)
|
||||||
if namespace(data.tag) != XHTML_NS:
|
if namespace(data.tag) != XHTML_NS:
|
||||||
data.attrib['xmlns'] = XHTML_NS
|
data.attrib['xmlns'] = XHTML_NS
|
||||||
|
@ -161,11 +161,11 @@ class CSSFlattener(object):
|
|||||||
if 'bgcolor' in node.attrib:
|
if 'bgcolor' in node.attrib:
|
||||||
cssdict['background-color'] = node.attrib['bgcolor']
|
cssdict['background-color'] = node.attrib['bgcolor']
|
||||||
del node.attrib['bgcolor']
|
del node.attrib['bgcolor']
|
||||||
if cssdict:
|
|
||||||
if 'font-size' in cssdict or tag == 'body':
|
if 'font-size' in cssdict or tag == 'body':
|
||||||
fsize = self.fmap[style['font-size']]
|
fsize = self.fmap[style['font-size']]
|
||||||
cssdict['font-size'] = "%0.5fem" % (fsize / psize)
|
cssdict['font-size'] = "%0.5fem" % (fsize / psize)
|
||||||
psize = fsize
|
psize = fsize
|
||||||
|
if cssdict:
|
||||||
if self.lineh and self.fbase and tag != 'body':
|
if self.lineh and self.fbase and tag != 'body':
|
||||||
self.clean_edges(cssdict, style, psize)
|
self.clean_edges(cssdict, style, psize)
|
||||||
margin = style['margin-left']
|
margin = style['margin-left']
|
||||||
|
Loading…
x
Reference in New Issue
Block a user