mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Fix #1688. Be less clever -- parse XML as XML, and HTML as HTML if it isn't XML.
This commit is contained in:
parent
6cc4432cb7
commit
7cec68cc4e
@@ -23,7 +23,6 @@ from calibre import LoggingInterface
|
|||||||
from calibre.translations.dynamic import translate
|
from calibre.translations.dynamic import translate
|
||||||
from calibre.startup import get_lang
|
from calibre.startup import get_lang
|
||||||
|
|
||||||
XML_PARSER = etree.XMLParser(recover=True)
|
|
||||||
XML_NS = 'http://www.w3.org/XML/1998/namespace'
|
XML_NS = 'http://www.w3.org/XML/1998/namespace'
|
||||||
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
||||||
OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/'
|
OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/'
|
||||||
@@ -140,8 +139,7 @@ class Logger(LoggingInterface, object):
|
|||||||
class AbstractContainer(object):
|
class AbstractContainer(object):
|
||||||
def read_xml(self, path):
|
def read_xml(self, path):
|
||||||
return etree.fromstring(
|
return etree.fromstring(
|
||||||
self.read(path), parser=XML_PARSER,
|
self.read(path), base_url=os.path.dirname(path))
|
||||||
base_url=os.path.dirname(path))
|
|
||||||
|
|
||||||
class DirContainer(AbstractContainer):
|
class DirContainer(AbstractContainer):
|
||||||
def __init__(self, rootdir):
|
def __init__(self, rootdir):
|
||||||
@@ -334,15 +332,15 @@ class Manifest(object):
|
|||||||
if self.oeb.encoding is not None:
|
if self.oeb.encoding is not None:
|
||||||
data = data.decode(self.oeb.encoding, 'replace')
|
data = data.decode(self.oeb.encoding, 'replace')
|
||||||
try:
|
try:
|
||||||
data = etree.fromstring(data, parser=XML_PARSER)
|
data = etree.fromstring(data)
|
||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
data = html.fromstring(data)
|
data = html.fromstring(data)
|
||||||
data = etree.tostring(data, encoding=unicode)
|
data = etree.tostring(data, encoding=unicode)
|
||||||
data = etree.fromstring(data, parser=XML_PARSER)
|
data = etree.fromstring(data)
|
||||||
if namespace(data.tag) != XHTML_NS:
|
if namespace(data.tag) != XHTML_NS:
|
||||||
data.attrib['xmlns'] = XHTML_NS
|
data.attrib['xmlns'] = XHTML_NS
|
||||||
data = etree.tostring(data, encoding=unicode)
|
data = etree.tostring(data, encoding=unicode)
|
||||||
data = etree.fromstring(data, parser=XML_PARSER)
|
data = etree.fromstring(data)
|
||||||
for meta in self.META_XP(data):
|
for meta in self.META_XP(data):
|
||||||
meta.getparent().remove(meta)
|
meta.getparent().remove(meta)
|
||||||
return data
|
return data
|
||||||
@@ -355,7 +353,7 @@ class Manifest(object):
|
|||||||
if self.media_type in OEB_DOCS:
|
if self.media_type in OEB_DOCS:
|
||||||
data = self._force_xhtml(data)
|
data = self._force_xhtml(data)
|
||||||
elif self.media_type[-4:] in ('+xml', '/xml'):
|
elif self.media_type[-4:] in ('+xml', '/xml'):
|
||||||
data = etree.fromstring(data, parser=XML_PARSER)
|
data = etree.fromstring(data)
|
||||||
self._data = data
|
self._data = data
|
||||||
return data
|
return data
|
||||||
def fset(self, value):
|
def fset(self, value):
|
||||||
@@ -788,7 +786,7 @@ class OEBBook(object):
|
|||||||
for tag in ('manifest', 'spine', 'tours', 'guide'):
|
for tag in ('manifest', 'spine', 'tours', 'guide'):
|
||||||
for element in opf.xpath(tag):
|
for element in opf.xpath(tag):
|
||||||
nroot.append(element)
|
nroot.append(element)
|
||||||
return etree.fromstring(etree.tostring(nroot), parser=XML_PARSER)
|
return etree.fromstring(etree.tostring(nroot))
|
||||||
|
|
||||||
def _read_opf(self, opfpath):
|
def _read_opf(self, opfpath):
|
||||||
opf = self.container.read_xml(opfpath)
|
opf = self.container.read_xml(opfpath)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user