Do not resolve entities when parsing XML

Resolving entities is dangerous because lxml will actually read
file:// URLs in entity definitions, so a malicious document could pull the contents of local files into the parsed output. Fixes #1857800 [Private bug](https://bugs.launchpad.net/calibre/+bug/1857800)
This commit is contained in:
Kovid Goyal 2019-12-29 18:01:43 +05:30
parent 589079c6aa
commit 68febe94ca
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
51 changed files with 166 additions and 164 deletions

View File

@ -92,7 +92,7 @@ def uuid():
class XMLCache(object): class XMLCache(object):
def __init__(self, paths, ext_paths, prefixes, use_author_sort): def __init__(self, paths, ext_paths, prefixes, use_author_sort):
from lxml import etree from calibre.utils.xml_parse import safe_xml_fromstring
if DEBUG: if DEBUG:
debug_print('Building XMLCache...', paths) debug_print('Building XMLCache...', paths)
@ -101,7 +101,6 @@ class XMLCache(object):
self.use_author_sort = use_author_sort self.use_author_sort = use_author_sort
# Parse XML files {{{ # Parse XML files {{{
parser = etree.XMLParser(recover=True)
self.roots = {} self.roots = {}
for source_id, path in paths.items(): for source_id, path in paths.items():
if source_id == 0: if source_id == 0:
@ -116,10 +115,9 @@ class XMLCache(object):
with lopen(path, 'rb') as f: with lopen(path, 'rb') as f:
raw = f.read() raw = f.read()
self.roots[source_id] = etree.fromstring(xml_to_unicode( self.roots[source_id] = safe_xml_fromstring(
raw, strip_encoding_pats=True, assume_utf8=True, xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True, verbose=DEBUG)[0]
verbose=DEBUG)[0], )
parser=parser)
if self.roots[source_id] is None: if self.roots[source_id] is None:
raise Exception(('The SONY database at %r is corrupted. Try ' raise Exception(('The SONY database at %r is corrupted. Try '
' disconnecting and reconnecting your reader.')%path) ' disconnecting and reconnecting your reader.')%path)
@ -136,10 +134,9 @@ class XMLCache(object):
if os.access(path, os.W_OK): if os.access(path, os.W_OK):
try: try:
with lopen(path, 'rb') as f: with lopen(path, 'rb') as f:
self.ext_roots[source_id] = etree.fromstring( self.ext_roots[source_id] = safe_xml_fromstring(
xml_to_unicode(f.read(), xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True, verbose=DEBUG)[0]
strip_encoding_pats=True, assume_utf8=True, )
verbose=DEBUG)[0], parser=parser)
self.ext_paths[source_id] = path self.ext_paths[source_id] = path
except: except:
pass pass

View File

@ -51,9 +51,9 @@ def return_raster_image(path):
def extract_cover_from_embedded_svg(html, base, log): def extract_cover_from_embedded_svg(html, base, log):
from lxml import etree
from calibre.ebooks.oeb.base import XPath, SVG, XLINK from calibre.ebooks.oeb.base import XPath, SVG, XLINK
root = etree.fromstring(html) from calibre.utils.xml_parse import safe_xml_fromstring
root = safe_xml_fromstring(html)
svg = XPath('//svg:svg')(root) svg = XPath('//svg:svg')(root)
if len(svg) == 1 and len(svg[0]) == 1 and svg[0][0].tag == SVG('image'): if len(svg) == 1 and len(svg[0]) == 1 and svg[0][0].tag == SVG('image'):

View File

@ -231,7 +231,7 @@ class EPUBInput(InputFormatPlugin):
return removed return removed
def find_opf(self): def find_opf(self):
from lxml import etree from calibre.utils.xml_parse import safe_xml_fromstring
def attr(n, attr): def attr(n, attr):
for k, v in n.attrib.items(): for k, v in n.attrib.items():
@ -239,7 +239,7 @@ class EPUBInput(InputFormatPlugin):
return v return v
try: try:
with lopen('META-INF/container.xml', 'rb') as f: with lopen('META-INF/container.xml', 'rb') as f:
root = etree.fromstring(f.read()) root = safe_xml_fromstring(f.read())
for r in root.xpath('//*[local-name()="rootfile"]'): for r in root.xpath('//*[local-name()="rootfile"]'):
if attr(r, 'media-type') != "application/oebps-package+xml": if attr(r, 'media-type') != "application/oebps-package+xml":
continue continue
@ -356,12 +356,13 @@ class EPUBInput(InputFormatPlugin):
from calibre.ebooks.oeb.polish.parsing import parse from calibre.ebooks.oeb.polish.parsing import parse
from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
from calibre.ebooks.oeb.polish.toc import first_child from calibre.ebooks.oeb.polish.toc import first_child
from calibre.utils.xml_parse import safe_xml_fromstring
from tempfile import NamedTemporaryFile from tempfile import NamedTemporaryFile
with lopen(nav_path, 'rb') as f: with lopen(nav_path, 'rb') as f:
raw = f.read() raw = f.read()
raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0] raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
root = parse(raw, log=log) root = parse(raw, log=log)
ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>') ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
navmap = ncx[0] navmap = ncx[0]
et = '{%s}type' % EPUB_NS et = '{%s}type' % EPUB_NS
bn = os.path.basename(nav_path) bn = os.path.basename(nav_path)

View File

@ -39,10 +39,11 @@ class FB2Input(InputFormatPlugin):
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
from lxml import etree from lxml import etree
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data from calibre.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.meta import get_metadata from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
self.log = log self.log = log
log.debug('Parsing XML...') log.debug('Parsing XML...')
@ -51,15 +52,9 @@ class FB2Input(InputFormatPlugin):
raw = xml_to_unicode(raw, strip_encoding_pats=True, raw = xml_to_unicode(raw, strip_encoding_pats=True,
assume_utf8=True, resolve_entities=True)[0] assume_utf8=True, resolve_entities=True)[0]
try: try:
doc = etree.fromstring(raw) doc = safe_xml_fromstring(raw)
except etree.XMLSyntaxError: except etree.XMLSyntaxError:
try: doc = safe_xml_fromstring(raw.replace('& ', '&amp;'))
doc = etree.fromstring(raw, parser=RECOVER_PARSER)
if doc is None:
raise Exception('parse failed')
except:
doc = etree.fromstring(raw.replace('& ', '&amp;'),
parser=RECOVER_PARSER)
if doc is None: if doc is None:
raise ValueError('The FB2 file is not valid XML') raise ValueError('The FB2 file is not valid XML')
doc = ensure_namespace(doc) doc = ensure_namespace(doc)
@ -99,7 +94,7 @@ class FB2Input(InputFormatPlugin):
ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->', ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
re.DOTALL).sub('', ss) re.DOTALL).sub('', ss)
styledoc = etree.fromstring(ss) styledoc = safe_xml_fromstring(ss)
transform = etree.XSLT(styledoc) transform = etree.XSLT(styledoc)
result = transform(doc) result = transform(doc)

View File

@ -43,7 +43,7 @@ class LITInput(InputFormatPlugin):
from calibre.ebooks.txt.processor import convert_basic, \ from calibre.ebooks.txt.processor import convert_basic, \
separate_paragraphs_single_line separate_paragraphs_single_line
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from lxml import etree from calibre.utils.xml_parse import safe_xml_fromstring
import copy import copy
self.log('LIT file with all text in singe <pre> tag detected') self.log('LIT file with all text in singe <pre> tag detected')
html = separate_paragraphs_single_line(pre.text) html = separate_paragraphs_single_line(pre.text)
@ -55,7 +55,7 @@ class LITInput(InputFormatPlugin):
# SmartyPants skips text inside <pre> tags # SmartyPants skips text inside <pre> tags
from calibre.ebooks.conversion.preprocess import smarten_punctuation from calibre.ebooks.conversion.preprocess import smarten_punctuation
html = smarten_punctuation(html, self.log) html = smarten_punctuation(html, self.log)
root = etree.fromstring(html) root = safe_xml_fromstring(html)
body = XPath('//h:body')(root) body = XPath('//h:body')(root)
pre.tag = XHTML('div') pre.tag = XHTML('div')
pre.text = '' pre.text = ''

View File

@ -20,25 +20,19 @@ class LRFInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
from lxml import etree
from calibre.ebooks.lrf.input import (MediaType, Styles, TextBlock, from calibre.ebooks.lrf.input import (MediaType, Styles, TextBlock,
Canvas, ImageBlock, RuledLine) Canvas, ImageBlock, RuledLine)
self.log = log self.log = log
self.log('Generating XML') self.log('Generating XML')
from calibre.ebooks.lrf.lrfparser import LRFDocument from calibre.ebooks.lrf.lrfparser import LRFDocument
from calibre.utils.xml_parse import safe_xml_fromstring
from lxml import etree
d = LRFDocument(stream) d = LRFDocument(stream)
d.parse() d.parse()
xml = d.to_xml(write_files=True) xml = d.to_xml(write_files=True)
if options.verbose > 2: if options.verbose > 2:
open(u'lrs.xml', 'wb').write(xml.encode('utf-8')) open(u'lrs.xml', 'wb').write(xml.encode('utf-8'))
parser = etree.XMLParser(no_network=True, huge_tree=True) doc = safe_xml_fromstring(xml)
try:
doc = etree.fromstring(xml, parser=parser)
except:
self.log.warn('Failed to parse XML. Trying to recover')
parser = etree.XMLParser(no_network=True, huge_tree=True,
recover=True)
doc = etree.fromstring(xml, parser=parser)
char_button_map = {} char_button_map = {}
for x in doc.xpath('//CharButton[@refobj]'): for x in doc.xpath('//CharButton[@refobj]'):
@ -60,7 +54,7 @@ class LRFInput(InputFormatPlugin):
plot_map[ro] = imgstr[0].get('file') plot_map[ro] = imgstr[0].get('file')
self.log('Converting XML to HTML...') self.log('Converting XML to HTML...')
styledoc = etree.fromstring(P('templates/lrf.xsl', data=True)) styledoc = safe_xml_fromstring(P('templates/lrf.xsl', data=True))
media_type = MediaType() media_type = MediaType()
styles = Styles() styles = Styles()
text_block = TextBlock(styles, char_button_map, plot_map, log) text_block = TextBlock(styles, char_button_map, plot_map, log)

View File

@ -251,6 +251,7 @@ class RTFInput(InputFormatPlugin):
from calibre.ebooks.metadata.opf2 import OPFCreator from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
from calibre.ebooks.rtf.input import InlineClass from calibre.ebooks.rtf.input import InlineClass
from calibre.utils.xml_parse import safe_xml_fromstring
self.opts = options self.opts = options
self.log = log self.log = log
self.log('Converting RTF to XML...') self.log('Converting RTF to XML...')
@ -270,8 +271,7 @@ class RTFInput(InputFormatPlugin):
self.log.exception('Failed to extract images...') self.log.exception('Failed to extract images...')
self.log('Parsing XML...') self.log('Parsing XML...')
parser = etree.XMLParser(recover=True, no_network=True) doc = safe_xml_fromstring(xml)
doc = etree.fromstring(xml, parser=parser)
border_styles = self.convert_borders(doc) border_styles = self.convert_borders(doc)
for pict in doc.xpath('//rtf:pict[@num]', for pict in doc.xpath('//rtf:pict[@num]',
namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}): namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}):
@ -282,7 +282,7 @@ class RTFInput(InputFormatPlugin):
self.log('Converting XML to HTML...') self.log('Converting XML to HTML...')
inline_class = InlineClass(self.log) inline_class = InlineClass(self.log)
styledoc = etree.fromstring(P('templates/rtf.xsl', data=True)) styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True))
extensions = {('calibre', 'inline-class') : inline_class} extensions = {('calibre', 'inline-class') : inline_class}
transform = etree.XSLT(styledoc, extensions=extensions) transform = etree.XSLT(styledoc, extensions=extensions)
result = transform(doc) result = transform(doc)

View File

@ -32,10 +32,10 @@ class SNBInput(InputFormatPlugin):
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
accelerators): accelerators):
import uuid import uuid
from lxml import etree
from calibre.ebooks.oeb.base import DirContainer from calibre.ebooks.oeb.base import DirContainer
from calibre.ebooks.snb.snbfile import SNBFile from calibre.ebooks.snb.snbfile import SNBFile
from calibre.utils.xml_parse import safe_xml_fromstring
log.debug("Parsing SNB file...") log.debug("Parsing SNB file...")
snbFile = SNBFile() snbFile = SNBFile()
@ -52,7 +52,7 @@ class SNBInput(InputFormatPlugin):
encoding=options.input_encoding, populate=False) encoding=options.input_encoding, populate=False)
meta = snbFile.GetFileStream('snbf/book.snbf') meta = snbFile.GetFileStream('snbf/book.snbf')
if meta is not None: if meta is not None:
meta = etree.fromstring(meta) meta = safe_xml_fromstring(meta)
l = {'title' : './/head/name', l = {'title' : './/head/name',
'creator' : './/head/author', 'creator' : './/head/author',
'language' : './/head/language', 'language' : './/head/language',
@ -87,7 +87,7 @@ class SNBInput(InputFormatPlugin):
toc = snbFile.GetFileStream('snbf/toc.snbf') toc = snbFile.GetFileStream('snbf/toc.snbf')
oeb.container = DirContainer(tdir, log) oeb.container = DirContainer(tdir, log)
if toc is not None: if toc is not None:
toc = etree.fromstring(toc) toc = safe_xml_fromstring(toc)
i = 1 i = 1
for ch in toc.find('.//body'): for ch in toc.find('.//body'):
chapterName = ch.text chapterName = ch.text
@ -96,7 +96,7 @@ class SNBInput(InputFormatPlugin):
data = snbFile.GetFileStream('snbc/' + chapterSrc) data = snbFile.GetFileStream('snbc/' + chapterSrc)
if data is None: if data is None:
continue continue
snbc = etree.fromstring(data) snbc = safe_xml_fromstring(data)
lines = [] lines = []
for line in snbc.find('.//body'): for line in snbc.find('.//body'):
if line.tag == 'text': if line.tag == 'text':

View File

@ -18,11 +18,12 @@ from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.localization import canonicalize_lang from calibre.utils.localization import canonicalize_lang
from calibre.utils.logging import default_log from calibre.utils.logging import default_log
from calibre.utils.zipfile import ZipFile from calibre.utils.zipfile import ZipFile
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
def fromstring(raw, parser=RECOVER_PARSER): def fromstring(raw, parser=RECOVER_PARSER):
return etree.fromstring(raw, parser=parser) return safe_xml_fromstring(raw)
# Read metadata {{{ # Read metadata {{{

View File

@ -11,6 +11,7 @@ from lxml import etree
from calibre import walk from calibre import walk
from calibre.utils.zipfile import ZipFile from calibre.utils.zipfile import ZipFile
from calibre.utils.xml_parse import safe_xml_fromstring
def pretty_all_xml_in_dir(path): def pretty_all_xml_in_dir(path):
@ -19,7 +20,7 @@ def pretty_all_xml_in_dir(path):
with open(f, 'r+b') as stream: with open(f, 'r+b') as stream:
raw = stream.read() raw = stream.read()
if raw: if raw:
root = etree.fromstring(raw) root = safe_xml_fromstring(raw)
stream.seek(0) stream.seek(0)
stream.truncate() stream.truncate()
stream.write(etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True)) stream.write(etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True))

View File

@ -17,6 +17,7 @@ from lxml import etree
from calibre import prepare_string_for_xml from calibre import prepare_string_for_xml
from calibre.constants import __appname__, __version__ from calibre.constants import __appname__, __version__
from calibre.utils.localization import lang_as_iso639_1 from calibre.utils.localization import lang_as_iso639_1
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.utils.img import save_cover_data_to from calibre.utils.img import save_cover_data_to
from calibre.ebooks.oeb.base import urlnormalize from calibre.ebooks.oeb.base import urlnormalize
from polyglot.builtins import unicode_type, string_or_bytes, range, filter from polyglot.builtins import unicode_type, string_or_bytes, range, filter
@ -69,7 +70,7 @@ class FB2MLizer(object):
output = self.clean_text('\n'.join(output)) output = self.clean_text('\n'.join(output))
if self.opts.pretty_print: if self.opts.pretty_print:
output = etree.tostring(etree.fromstring(output), encoding='unicode', pretty_print=True) output = etree.tostring(safe_xml_fromstring(output), encoding='unicode', pretty_print=True)
return '<?xml version="1.0" encoding="UTF-8"?>\n' + output return '<?xml version="1.0" encoding="UTF-8"?>\n' + output

View File

@ -8,9 +8,8 @@ __docformat__ = 'restructuredtext en'
from io import BytesIO from io import BytesIO
from lxml import etree
from calibre.ebooks.docx.container import DOCX from calibre.ebooks.docx.container import DOCX
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.docx.writer.container import update_doc_props, xml2str from calibre.ebooks.docx.writer.container import update_doc_props, xml2str
from calibre.utils.imghdr import identify from calibre.utils.imghdr import identify
@ -61,11 +60,11 @@ def set_metadata(stream, mi):
ap_raw = c.read(ap_name) ap_raw = c.read(ap_name)
except Exception: except Exception:
ap_raw = None ap_raw = None
cp = etree.fromstring(dp_raw) cp = safe_xml_fromstring(dp_raw)
update_doc_props(cp, mi, c.namespace) update_doc_props(cp, mi, c.namespace)
replacements = {} replacements = {}
if ap_raw is not None: if ap_raw is not None:
ap = etree.fromstring(ap_raw) ap = safe_xml_fromstring(ap_raw)
comp = ap.makeelement('{%s}Company' % c.namespace.namespaces['ep']) comp = ap.makeelement('{%s}Company' % c.namespace.namespaces['ep'])
for child in tuple(ap): for child in tuple(ap):
if child.tag == comp.tag: if child.tag == comp.tag:

View File

@ -12,13 +12,12 @@ import os
import posixpath import posixpath
from contextlib import closing from contextlib import closing
from lxml import etree
from calibre import CurrentDir from calibre import CurrentDir
from calibre.ebooks.metadata.opf import ( from calibre.ebooks.metadata.opf import (
get_metadata as get_metadata_from_opf, set_metadata as set_metadata_opf get_metadata as get_metadata_from_opf, set_metadata as set_metadata_opf
) )
from calibre.ebooks.metadata.opf2 import OPF from calibre.ebooks.metadata.opf2 import OPF
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ptempfile import TemporaryDirectory from calibre.ptempfile import TemporaryDirectory
from calibre.utils.localunzip import LocalZipFile from calibre.utils.localunzip import LocalZipFile
from calibre.utils.zipfile import BadZipfile, ZipFile, safe_replace from calibre.utils.zipfile import BadZipfile, ZipFile, safe_replace
@ -42,7 +41,7 @@ class Container(dict):
def __init__(self, stream=None): def __init__(self, stream=None):
if not stream: if not stream:
return return
container = etree.fromstring(stream.read()) container = safe_xml_fromstring(stream.read())
if container.get('version', None) != '1.0': if container.get('version', None) != '1.0':
raise EPubException("unsupported version of OCF") raise EPubException("unsupported version of OCF")
rootfiles = container.xpath('./*[local-name()="rootfiles"]') rootfiles = container.xpath('./*[local-name()="rootfiles"]')
@ -70,8 +69,7 @@ class Encryption(object):
'http://www.idpf.org/2008/embedding']) 'http://www.idpf.org/2008/embedding'])
def __init__(self, raw): def __init__(self, raw):
from lxml import etree self.root = safe_xml_fromstring(raw) if raw else None
self.root = etree.fromstring(raw) if raw else None
self.entries = {} self.entries = {}
if self.root is not None: if self.root is not None:
for em in self.root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'): for em in self.root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):

View File

@ -15,6 +15,7 @@ from lxml import etree
from calibre.utils.date import parse_only_date from calibre.utils.date import parse_only_date
from calibre.utils.img import save_cover_data_to from calibre.utils.img import save_cover_data_to
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.utils.imghdr import identify from calibre.utils.imghdr import identify
from calibre import guess_type, guess_all_extensions, prints, force_unicode from calibre import guess_type, guess_all_extensions, prints, force_unicode
from calibre.ebooks.metadata import MetaInformation, check_isbn from calibre.ebooks.metadata import MetaInformation, check_isbn
@ -315,9 +316,8 @@ def _parse_language(root, mi, ctx):
def _get_fbroot(raw): def _get_fbroot(raw):
parser = etree.XMLParser(recover=True, no_network=True)
raw = xml_to_unicode(raw, strip_encoding_pats=True)[0] raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
root = etree.fromstring(raw, parser=parser) root = safe_xml_fromstring(raw)
return ensure_namespace(root) return ensure_namespace(root)
@ -452,5 +452,5 @@ def ensure_namespace(doc):
import re import re
raw = etree.tostring(doc, encoding='unicode') raw = etree.tostring(doc, encoding='unicode')
raw = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>', raw) raw = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>', raw)
doc = etree.fromstring(raw) doc = safe_xml_fromstring(raw)
return doc return doc

View File

@ -11,9 +11,9 @@ Read metadata from LRX files
import struct import struct
from zlib import decompress from zlib import decompress
from lxml import etree
from calibre.ebooks.metadata import MetaInformation, string_to_authors from calibre.ebooks.metadata import MetaInformation, string_to_authors
from calibre.utils.xml_parse import safe_xml_fromstring
def _read(f, at, amount): def _read(f, at, amount):
@ -66,7 +66,7 @@ def get_metadata(f):
info = decompress(f.read(compressed_size)) info = decompress(f.read(compressed_size))
if len(info) != uncompressed_size: if len(info) != uncompressed_size:
raise ValueError('LRX file has malformed metadata section') raise ValueError('LRX file has malformed metadata section')
root = etree.fromstring(info) root = safe_xml_fromstring(info)
bi = root.find('BookInfo') bi = root.find('BookInfo')
title = bi.find('Title') title = bi.find('Title')
title_sort = title.get('reading', None) title_sort = title.get('reading', None)

View File

@ -23,6 +23,7 @@ from calibre.utils.localization import get_lang, canonicalize_lang
from calibre import prints, guess_type from calibre import prints, guess_type
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
from calibre.utils.config import tweaks from calibre.utils.config import tweaks
from calibre.utils.xml_parse import safe_xml_fromstring
from polyglot.builtins import iteritems, unicode_type, getcwd, map from polyglot.builtins import iteritems, unicode_type, getcwd, map
from polyglot.urllib import unquote, urlparse from polyglot.urllib import unquote, urlparse
@ -1588,7 +1589,7 @@ def metadata_to_opf(mi, as_string=True, default_lang=None):
is None else default_lang) is None else default_lang)
mi.languages = [lang] mi.languages = [lang]
root = etree.fromstring(textwrap.dedent( root = safe_xml_fromstring(textwrap.dedent(
''' '''
<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="uuid_id" version="2.0"> <package xmlns="http://www.idpf.org/2007/opf" unique-identifier="uuid_id" version="2.0">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf"> <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">

View File

@ -7,9 +7,8 @@ from collections import defaultdict
from io import BytesIO from io import BytesIO
import unittest import unittest
from lxml import etree
from calibre.ebooks.metadata.book import ALL_METADATA_FIELDS from calibre.ebooks.metadata.book import ALL_METADATA_FIELDS
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.metadata.opf2 import OPF from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.metadata.opf3 import ( from calibre.ebooks.metadata.opf3 import (
parse_prefixes, reserved_prefixes, expand_prefix, read_identifiers, parse_prefixes, reserved_prefixes, expand_prefix, read_identifiers,
@ -37,7 +36,7 @@ class TestOPF3(unittest.TestCase):
ae = unittest.TestCase.assertEqual ae = unittest.TestCase.assertEqual
def get_opf(self, metadata='', manifest=''): def get_opf(self, metadata='', manifest=''):
return etree.fromstring(TEMPLATE.format(metadata=metadata, manifest=manifest)) return safe_xml_fromstring(TEMPLATE.format(metadata=metadata, manifest=manifest))
def test_prefix_parsing(self): # {{{ def test_prefix_parsing(self): # {{{
self.ae(parse_prefixes('foaf: http://xmlns.com/foaf/spec/\n dbp: http://dbpedia.org/ontology/'), self.ae(parse_prefixes('foaf: http://xmlns.com/foaf/spec/\n dbp: http://dbpedia.org/ontology/'),
@ -523,7 +522,7 @@ class TestOPF3(unittest.TestCase):
self.ae(v2, v3, '%s: %r != %r' % (field, v2, v3)) self.ae(v2, v3, '%s: %r != %r' % (field, v2, v3))
mi2 = OPF(BytesIO(raw.encode('utf-8'))).to_book_metadata() mi2 = OPF(BytesIO(raw.encode('utf-8'))).to_book_metadata()
root = etree.fromstring(raw) root = safe_xml_fromstring(raw)
root.set('version', '3.0') root.set('version', '3.0')
mi3, _, raster_cover, first_spine_item = read_metadata(root, return_extra_data=True) mi3, _, raster_cover, first_spine_item = read_metadata(root, return_extra_data=True)
self.assertIsNone(raster_cover) self.assertIsNone(raster_cover)

View File

@ -9,7 +9,7 @@ import os
import io import io
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.snb.snbfile import SNBFile from calibre.ebooks.snb.snbfile import SNBFile
from lxml import etree from calibre.utils.xml_parse import safe_xml_fromstring
def get_metadata(stream, extract_cover=True): def get_metadata(stream, extract_cover=True):
@ -27,7 +27,7 @@ def get_metadata(stream, extract_cover=True):
meta = snbFile.GetFileStream('snbf/book.snbf') meta = snbFile.GetFileStream('snbf/book.snbf')
if meta is not None: if meta is not None:
meta = etree.fromstring(meta) meta = safe_xml_fromstring(meta)
mi.title = meta.find('.//head/name').text mi.title = meta.find('.//head/name').text
mi.authors = [meta.find('.//head/author').text] mi.authors = [meta.find('.//head/author').text]
mi.language = meta.find('.//head/language').text.lower().replace('_', '-') mi.language = meta.find('.//head/language').text.lower().replace('_', '-')

View File

@ -49,7 +49,7 @@ class Douban(Source):
name = 'Douban Books' name = 'Douban Books'
author = 'Li Fanxi' author = 'Li Fanxi'
version = (2, 1, 1) version = (2, 1, 2)
minimum_calibre_version = (2, 80, 0) minimum_calibre_version = (2, 80, 0)
description = _('Downloads metadata and covers from Douban.com. ' description = _('Downloads metadata and covers from Douban.com. '
@ -119,8 +119,10 @@ class Douban(Source):
try: try:
log.info(id_url) log.info(id_url)
raw = get_details(browser, id_url, timeout) raw = get_details(browser, id_url, timeout)
feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), feed = etree.fromstring(
strip_encoding_pats=True)[0]) xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
)
extra = entry(feed)[0] extra = entry(feed)[0]
except: except:
log.exception('Failed to get additional details for', mi.title) log.exception('Failed to get additional details for', mi.title)

View File

@ -105,7 +105,8 @@ def to_metadata(browser, log, entry_, timeout): # {{{
try: try:
raw = get_details(browser, id_url, timeout) raw = get_details(browser, id_url, timeout)
feed = etree.fromstring( feed = etree.fromstring(
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0] xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
) )
extra = entry(feed)[0] extra = entry(feed)[0]
except: except:
@ -173,7 +174,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{
class GoogleBooks(Source): class GoogleBooks(Source):
name = 'Google' name = 'Google'
version = (1, 0, 0) version = (1, 0, 1)
minimum_calibre_version = (2, 80, 0) minimum_calibre_version = (2, 80, 0)
description = _('Downloads metadata and covers from Google Books') description = _('Downloads metadata and covers from Google Books')
@ -371,10 +372,9 @@ class GoogleBooks(Source):
return as_unicode(e) return as_unicode(e)
try: try:
parser = etree.XMLParser(recover=True, no_network=True)
feed = etree.fromstring( feed = etree.fromstring(
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0], xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
parser=parser parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
) )
entries = entry(feed) entries = entry(feed)
except Exception as e: except Exception as e:

View File

@ -12,6 +12,7 @@ from lxml.builder import ElementMaker
from calibre.constants import __appname__, __version__ from calibre.constants import __appname__, __version__
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.utils.cleantext import clean_xml_chars from calibre.utils.cleantext import clean_xml_chars
from polyglot.builtins import unicode_type, getcwd from polyglot.builtins import unicode_type, getcwd
from polyglot.urllib import unquote, urlparse from polyglot.urllib import unquote, urlparse
@ -177,8 +178,7 @@ class TOC(list):
with open(toc, 'rb') as f: with open(toc, 'rb') as f:
raw = xml_to_unicode(f.read(), assume_utf8=True, raw = xml_to_unicode(f.read(), assume_utf8=True,
strip_encoding_pats=True)[0] strip_encoding_pats=True)[0]
root = etree.fromstring(raw, parser=etree.XMLParser(recover=True, root = safe_xml_fromstring(raw)
no_network=True))
xpn = {'re': 'http://exslt.org/regular-expressions'} xpn = {'re': 'http://exslt.org/regular-expressions'}
XPath = functools.partial(etree.XPath, namespaces=xpn) XPath = functools.partial(etree.XPath, namespaces=xpn)

View File

@ -6,17 +6,15 @@ from __future__ import absolute_import, division, print_function, unicode_litera
from collections import namedtuple from collections import namedtuple
from polyglot.builtins import map from polyglot.builtins import map
from lxml import etree
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.base import OPF from calibre.ebooks.oeb.base import OPF
from calibre.ebooks.oeb.polish.utils import guess_type from calibre.ebooks.oeb.polish.utils import guess_type
from calibre.spell import parse_lang_code from calibre.spell import parse_lang_code
from calibre.utils.localization import lang_as_iso639_1 from calibre.utils.localization import lang_as_iso639_1
from calibre.utils.xml_parse import safe_xml_fromstring
from polyglot.builtins import filter from polyglot.builtins import filter
PARSER = etree.XMLParser(recover=True, no_network=True)
OPFVersion = namedtuple('OPFVersion', 'major minor patch') OPFVersion = namedtuple('OPFVersion', 'major minor patch')
@ -45,7 +43,7 @@ def parse_opf(stream_or_path):
raise ValueError('Empty file: '+getattr(stream, 'name', 'stream')) raise ValueError('Empty file: '+getattr(stream, 'name', 'stream'))
raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True) raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)
raw = raw[raw.find('<'):] raw = raw[raw.find('<'):]
root = etree.fromstring(raw, PARSER) root = safe_xml_fromstring(raw)
if root is None: if root is None:
raise ValueError('Not an OPF file') raise ValueError('Not an OPF file')
return root return root

View File

@ -14,6 +14,7 @@ from lxml.builder import ElementMaker
from calibre import prints from calibre import prints
from calibre.ebooks.metadata import check_isbn, check_doi from calibre.ebooks.metadata import check_isbn, check_doi
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.opf2 import dump_dict from calibre.ebooks.metadata.opf2 import dump_dict
from calibre.utils.date import parse_date, isoformat, now from calibre.utils.date import parse_date, isoformat, now
@ -74,9 +75,9 @@ def parse_xmp_packet(raw_bytes):
enc = emap.get(m.group(1), enc) enc = emap.get(m.group(1), enc)
break break
if enc is None: if enc is None:
return etree.fromstring(raw_bytes) return safe_xml_fromstring(raw_bytes)
raw = _xml_declaration.sub('', raw_bytes.decode(enc)) # lxml barfs if encoding declaration present in unicode string raw = _xml_declaration.sub('', raw_bytes.decode(enc)) # lxml barfs if encoding declaration present in unicode string
return etree.fromstring(raw) return safe_xml_fromstring(raw)
def serialize_xmp_packet(root, encoding='utf-8'): def serialize_xmp_packet(root, encoding='utf-8'):

View File

@ -6,8 +6,7 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
from lxml import etree from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.oeb.base import (urlnormalize, XPath, XHTML_NS, XHTML, from calibre.ebooks.oeb.base import (urlnormalize, XPath, XHTML_NS, XHTML,
XHTML_MIME, css_text) XHTML_MIME, css_text)
@ -88,7 +87,7 @@ class TOCAdder(object):
'body { font-family: %s }'%s.body_font_family] 'body { font-family: %s }'%s.body_font_family]
embed_css = '\n\n'.join(css) embed_css = '\n\n'.join(css)
root = etree.fromstring(TEMPLATE.format(xhtmlns=XHTML_NS, root = safe_xml_fromstring(TEMPLATE.format(xhtmlns=XHTML_NS,
title=self.title, embed_css=embed_css, title=self.title, embed_css=embed_css,
extra_css=(opts.extra_css or ''))) extra_css=(opts.extra_css or '')))
parent = XPath('//h:ul')(root)[0] parent = XPath('//h:ul')(root)[0]

View File

@ -19,6 +19,7 @@ from odf.namespaces import TEXTNS as odTEXTNS
from calibre import CurrentDir, walk from calibre import CurrentDir, walk
from calibre.ebooks.oeb.base import _css_logger from calibre.ebooks.oeb.base import _css_logger
from calibre.utils.xml_parse import safe_xml_fromstring
from polyglot.builtins import unicode_type, string_or_bytes, filter, getcwd, as_bytes from polyglot.builtins import unicode_type, string_or_bytes, filter, getcwd, as_bytes
@ -45,7 +46,7 @@ class Extract(ODF2XHTML):
ol.set('start', val) ol.set('start', val)
def fix_markup(self, html, log): def fix_markup(self, html, log):
root = etree.fromstring(html) root = safe_xml_fromstring(html)
self.filter_css(root, log) self.filter_css(root, log)
self.extract_css(root, log) self.extract_css(root, log)
self.epubify_markup(root, log) self.epubify_markup(root, log)

View File

@ -16,11 +16,11 @@ from lxml import etree, html
from calibre import force_unicode from calibre import force_unicode
from calibre.constants import filesystem_encoding, __version__, ispy3 from calibre.constants import filesystem_encoding, __version__, ispy3
from calibre.translations.dynamic import translate from calibre.translations.dynamic import translate
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.conversion.preprocess import CSSPreProcessor from calibre.ebooks.conversion.preprocess import CSSPreProcessor
from calibre import (isbytestring, as_unicode, get_types_map) from calibre import (isbytestring, as_unicode, get_types_map)
from calibre.ebooks.oeb.parse_utils import (barename, XHTML_NS, RECOVER_PARSER, from calibre.ebooks.oeb.parse_utils import barename, XHTML_NS, namespace, XHTML, parse_html, NotHTML
namespace, XHTML, parse_html, NotHTML)
from calibre.utils.cleantext import clean_xml_chars from calibre.utils.cleantext import clean_xml_chars
from calibre.utils.short_uuid import uuid4 from calibre.utils.short_uuid import uuid4
from polyglot.builtins import iteritems, unicode_type, string_or_bytes, range, itervalues, filter, codepoint_to_chr from polyglot.builtins import iteritems, unicode_type, string_or_bytes, range, itervalues, filter, codepoint_to_chr
@ -946,7 +946,7 @@ class Manifest(object):
return return
data = xml_to_unicode(data, strip_encoding_pats=True, data = xml_to_unicode(data, strip_encoding_pats=True,
assume_utf8=True, resolve_entities=True)[0] assume_utf8=True, resolve_entities=True)[0]
return etree.fromstring(data, parser=RECOVER_PARSER) return safe_xml_fromstring(data)
def _parse_xhtml(self, data): def _parse_xhtml(self, data):
orig_data = data orig_data = data

View File

@ -11,6 +11,7 @@ import re
from lxml import etree, html from lxml import etree, html
from calibre import xml_replace_entities, force_unicode from calibre import xml_replace_entities, force_unicode
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.constants import filesystem_encoding from calibre.constants import filesystem_encoding
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
from polyglot.builtins import iteritems, itervalues, unicode_type, string_or_bytes, map from polyglot.builtins import iteritems, itervalues, unicode_type, string_or_bytes, map
@ -114,12 +115,7 @@ def _html4_parse(data):
elem.text = elem.text.strip('-') elem.text = elem.text.strip('-')
data = etree.tostring(data, encoding='unicode') data = etree.tostring(data, encoding='unicode')
# Setting huge_tree=True causes crashes in windows with large files data = safe_xml_fromstring(data)
parser = etree.XMLParser(no_network=True)
try:
data = etree.fromstring(data, parser=parser)
except etree.XMLSyntaxError:
data = etree.fromstring(data, parser=RECOVER_PARSER)
return data return data
@ -210,19 +206,16 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
data = data.replace('\0', '') data = data.replace('\0', '')
data = raw = clean_word_doc(data, log) data = raw = clean_word_doc(data, log)
# Setting huge_tree=True causes crashes in windows with large files
parser = etree.XMLParser(no_network=True)
# Try with more & more drastic measures to parse # Try with more & more drastic measures to parse
try: try:
data = etree.fromstring(data, parser=parser) data = safe_xml_fromstring(data)
check_for_html5(pre, data) check_for_html5(pre, data)
except (HTML5Doc, etree.XMLSyntaxError): except (HTML5Doc, etree.XMLSyntaxError):
log.debug('Initial parse failed, using more' log.debug('Initial parse failed, using more'
' forgiving parsers') ' forgiving parsers')
raw = data = xml_replace_entities(raw) raw = data = xml_replace_entities(raw)
try: try:
data = etree.fromstring(data, parser=parser) data = safe_xml_fromstring(data)
check_for_html5(pre, data) check_for_html5(pre, data)
except (HTML5Doc, etree.XMLSyntaxError): except (HTML5Doc, etree.XMLSyntaxError):
log.debug('Parsing %s as HTML' % filename) log.debug('Parsing %s as HTML' % filename)
@ -251,7 +244,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
if barename(data.tag) in non_html_file_tags: if barename(data.tag) in non_html_file_tags:
raise NotHTML(data.tag) raise NotHTML(data.tag)
log.warn('File %r does not appear to be (X)HTML'%filename) log.warn('File %r does not appear to be (X)HTML'%filename)
nroot = etree.fromstring('<html></html>') nroot = safe_xml_fromstring('<html></html>')
has_body = False has_body = False
for child in list(data): for child in list(data):
if isinstance(child.tag, (unicode_type, bytes)) and barename(child.tag) == 'body': if isinstance(child.tag, (unicode_type, bytes)) and barename(child.tag) == 'body':
@ -260,7 +253,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
parent = nroot parent = nroot
if not has_body: if not has_body:
log.warn('File %r appears to be a HTML fragment'%filename) log.warn('File %r appears to be a HTML fragment'%filename)
nroot = etree.fromstring('<html><body/></html>') nroot = safe_xml_fromstring('<html><body/></html>')
parent = nroot[0] parent = nroot[0]
for child in list(data.iter()): for child in list(data.iter()):
oparent = child.getparent() oparent = child.getparent()
@ -276,12 +269,12 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
data = etree.tostring(data, encoding='unicode') data = etree.tostring(data, encoding='unicode')
try: try:
data = etree.fromstring(data, parser=parser) data = safe_xml_fromstring(data)
except: except:
data = data.replace(':=', '=').replace(':>', '>') data = data.replace(':=', '=').replace(':>', '>')
data = data.replace('<http:/>', '') data = data.replace('<http:/>', '')
try: try:
data = etree.fromstring(data, parser=parser) data = safe_xml_fromstring(data)
except etree.XMLSyntaxError: except etree.XMLSyntaxError:
log.warn('Stripping comments from %s'% log.warn('Stripping comments from %s'%
filename) filename)
@ -292,12 +285,11 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
'') '')
data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '') data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
try: try:
data = etree.fromstring(data, data = safe_xml_fromstring(data)
parser=RECOVER_PARSER)
except etree.XMLSyntaxError: except etree.XMLSyntaxError:
log.warn('Stripping meta tags from %s'% filename) log.warn('Stripping meta tags from %s'% filename)
data = re.sub(r'<meta\s+[^>]+?>', '', data) data = re.sub(r'<meta\s+[^>]+?>', '', data)
data = etree.fromstring(data, parser=RECOVER_PARSER) data = safe_xml_fromstring(data)
elif namespace(data.tag) != XHTML_NS: elif namespace(data.tag) != XHTML_NS:
# OEB_DOC_NS, but possibly others # OEB_DOC_NS, but possibly others
ns = namespace(data.tag) ns = namespace(data.tag)

View File

@ -7,11 +7,12 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re import re
from lxml.etree import XMLParser, fromstring, XMLSyntaxError from lxml.etree import XMLSyntaxError
import css_parser import css_parser
from calibre import force_unicode, human_readable, prepare_string_for_xml from calibre import force_unicode, human_readable, prepare_string_for_xml
from calibre.ebooks.chardet import replace_encoding_declarations, find_declared_encoding from calibre.ebooks.chardet import replace_encoding_declarations, find_declared_encoding
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.html_entities import html5_entities from calibre.ebooks.html_entities import html5_entities
from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag
from calibre.ebooks.oeb.polish.utils import PositionFinder, guess_type from calibre.ebooks.oeb.polish.utils import PositionFinder, guess_type
@ -276,7 +277,6 @@ def check_xml_parsing(name, mt, raw):
# Get rid of entities as named entities trip up the XML parser # Get rid of entities as named entities trip up the XML parser
eproc = EntitityProcessor(mt) eproc = EntitityProcessor(mt)
eraw = entity_pat.sub(eproc, raw) eraw = entity_pat.sub(eproc, raw)
parser = XMLParser(recover=False)
errcls = HTMLParseError if mt in OEB_DOCS else XMLParseError errcls = HTMLParseError if mt in OEB_DOCS else XMLParseError
errors = [] errors = []
if eproc.ok_named_entities: if eproc.ok_named_entities:
@ -288,7 +288,7 @@ def check_xml_parsing(name, mt, raw):
errors.append(BadEntity(ent, name, lnum, col)) errors.append(BadEntity(ent, name, lnum, col))
try: try:
root = fromstring(eraw, parser=parser) root = safe_xml_fromstring(eraw, recover=False)
except UnicodeDecodeError: except UnicodeDecodeError:
return errors + [DecodeError(name)] return errors + [DecodeError(name)]
except XMLSyntaxError as err: except XMLSyntaxError as err:

View File

@ -18,7 +18,6 @@ from io import BytesIO
from itertools import count from itertools import count
from css_parser import getUrls, replaceUrls from css_parser import getUrls, replaceUrls
from lxml import etree
from calibre import CurrentDir, walk from calibre import CurrentDir, walk
from calibre.constants import iswindows from calibre.constants import iswindows
@ -42,7 +41,7 @@ from calibre.ebooks.oeb.base import (
DC11_NS, OEB_DOCS, OEB_STYLES, OPF, OPF2_NS, Manifest, itercsslinks, iterlinks, DC11_NS, OEB_DOCS, OEB_STYLES, OPF, OPF2_NS, Manifest, itercsslinks, iterlinks,
rewrite_links, serialize, urlquote, urlunquote rewrite_links, serialize, urlquote, urlunquote
) )
from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER, NotHTML, parse_html from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html
from calibre.ebooks.oeb.polish.errors import DRMError, InvalidBook from calibre.ebooks.oeb.polish.errors import DRMError, InvalidBook
from calibre.ebooks.oeb.polish.parsing import parse as parse_html_tweak from calibre.ebooks.oeb.polish.parsing import parse as parse_html_tweak
from calibre.ebooks.oeb.polish.utils import ( from calibre.ebooks.oeb.polish.utils import (
@ -52,6 +51,7 @@ from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryF
from calibre.utils.filenames import hardlink_file, nlinks_file from calibre.utils.filenames import hardlink_file, nlinks_file
from calibre.utils.ipc.simple_worker import WorkerError, fork_job from calibre.utils.ipc.simple_worker import WorkerError, fork_job
from calibre.utils.logging import default_log from calibre.utils.logging import default_log
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.utils.zipfile import ZipFile from calibre.utils.zipfile import ZipFile
from polyglot.builtins import iteritems, map, unicode_type, zip from polyglot.builtins import iteritems, map, unicode_type, zip
from polyglot.urllib import urlparse from polyglot.urllib import urlparse
@ -201,7 +201,7 @@ class ContainerBase(object): # {{{
data, self.used_encoding = xml_to_unicode( data, self.used_encoding = xml_to_unicode(
data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True) data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)
data = unicodedata.normalize('NFC', data) data = unicodedata.normalize('NFC', data)
return etree.fromstring(data, parser=RECOVER_PARSER) return safe_xml_fromstring(data)
def parse_xhtml(self, data, fname='<string>', force_html5_parse=False): def parse_xhtml(self, data, fname='<string>', force_html5_parse=False):
if self.tweak_mode: if self.tweak_mode:
@ -1178,7 +1178,7 @@ class EpubContainer(Container):
container_path = join(self.root, 'META-INF', 'container.xml') container_path = join(self.root, 'META-INF', 'container.xml')
if not exists(container_path): if not exists(container_path):
raise InvalidEpub('No META-INF/container.xml in epub') raise InvalidEpub('No META-INF/container.xml in epub')
container = etree.fromstring(open(container_path, 'rb').read()) container = safe_xml_fromstring(open(container_path, 'rb').read())
opf_files = container.xpath(( opf_files = container.xpath((
r'child::ocf:rootfiles/ocf:rootfile' r'child::ocf:rootfiles/ocf:rootfile'
'[@media-type="%s" and @full-path]'%guess_type('a.opf') '[@media-type="%s" and @full-path]'%guess_type('a.opf')

View File

@ -7,10 +7,11 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re import re
from lxml.etree import XMLParser, fromstring, Element as LxmlElement from lxml.etree import Element as LxmlElement
import html5_parser import html5_parser
from calibre import xml_replace_entities from calibre import xml_replace_entities
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
from calibre.utils.cleantext import clean_xml_chars from calibre.utils.cleantext import clean_xml_chars
from polyglot.builtins import unicode_type from polyglot.builtins import unicode_type
@ -77,8 +78,7 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
if force_html5_parse: if force_html5_parse:
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False) return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
try: try:
parser = XMLParser(no_network=True) ans = safe_xml_fromstring(raw)
ans = fromstring(raw, parser=parser)
if ans.tag != '{%s}html' % XHTML_NS: if ans.tag != '{%s}html' % XHTML_NS:
raise ValueError('Root tag is not <html> in the XHTML namespace') raise ValueError('Root tag is not <html> in the XHTML namespace')
if linenumber_attribute: if linenumber_attribute:

View File

@ -21,6 +21,7 @@ from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \
urlnormalize, BINARY_MIME, \ urlnormalize, BINARY_MIME, \
OEBError, OEBBook, DirContainer OEBError, OEBBook, DirContainer
from calibre.ebooks.oeb.writer import OEBWriter from calibre.ebooks.oeb.writer import OEBWriter
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.utils.cleantext import clean_xml_chars from calibre.utils.cleantext import clean_xml_chars
from calibre.utils.localization import get_lang from calibre.utils.localization import get_lang
from calibre.ptempfile import TemporaryDirectory from calibre.ptempfile import TemporaryDirectory
@ -108,23 +109,18 @@ class OEBReader(object):
data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)', data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
OPF1_NS, data) OPF1_NS, data)
try: try:
opf = etree.fromstring(data) opf = safe_xml_fromstring(data)
except etree.XMLSyntaxError: except etree.XMLSyntaxError:
data = xml_replace_entities(clean_xml_chars(data), encoding=None) data = xml_replace_entities(clean_xml_chars(data), encoding=None)
try: try:
opf = etree.fromstring(data) opf = safe_xml_fromstring(data)
self.logger.warn('OPF contains invalid HTML named entities') self.logger.warn('OPF contains invalid HTML named entities')
except etree.XMLSyntaxError: except etree.XMLSyntaxError:
data = re.sub(r'(?is)<tours>.+</tours>', '', data) data = re.sub(r'(?is)<tours>.+</tours>', '', data)
data = data.replace('<dc-metadata>', data = data.replace('<dc-metadata>',
'<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">') '<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">')
try: opf = safe_xml_fromstring(data)
opf = etree.fromstring(data) self.logger.warn('OPF contains invalid tours section')
self.logger.warn('OPF contains invalid tours section')
except etree.XMLSyntaxError:
self.logger.warn('OPF contains invalid markup, trying to parse it anyway')
from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
opf = etree.fromstring(data, parser=RECOVER_PARSER)
ns = namespace(opf.tag) ns = namespace(opf.tag)
if ns not in ('', OPF1_NS, OPF2_NS): if ns not in ('', OPF1_NS, OPF2_NS):

View File

@ -8,9 +8,9 @@ __docformat__ = 'restructuredtext en'
import textwrap import textwrap
from lxml import etree
from calibre import guess_type from calibre import guess_type
from calibre.utils.imghdr import identify from calibre.utils.imghdr import identify
from calibre.utils.xml_parse import safe_xml_fromstring
from polyglot.builtins import unicode_type from polyglot.builtins import unicode_type
from polyglot.urllib import unquote from polyglot.urllib import unquote
@ -156,7 +156,7 @@ class CoverManager(object):
tp = templ%unquote(href) tp = templ%unquote(href)
id, href = m.generate('titlepage', 'titlepage.xhtml') id, href = m.generate('titlepage', 'titlepage.xhtml')
item = m.add(id, href, guess_type('t.xhtml')[0], item = m.add(id, href, guess_type('t.xhtml')[0],
data=etree.fromstring(tp)) data=safe_xml_fromstring(tp))
else: else:
item = self.oeb.manifest.hrefs[ item = self.oeb.manifest.hrefs[
urldefrag(self.oeb.guide['titlepage'].href)[0]] urldefrag(self.oeb.guide['titlepage'].href)[0]]

View File

@ -129,9 +129,9 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
def parse_outline(raw, output_dir): def parse_outline(raw, output_dir):
from lxml import etree from lxml import etree
from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER from calibre.utils.xml_parse import safe_xml_fromstring
raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]) raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
outline = etree.fromstring(raw, parser=RECOVER_PARSER).xpath('(//outline)[1]') outline = safe_xml_fromstring(raw).xpath('(//outline)[1]')
if outline: if outline:
from calibre.ebooks.oeb.polish.toc import TOC, create_ncx from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
outline = outline[0] outline = outline[0]

View File

@ -12,6 +12,7 @@ from itertools import count
from lxml import etree from lxml import etree
from polyglot.builtins import range, map from polyglot.builtins import range, map
from calibre.utils.xml_parse import safe_xml_fromstring
class Font(object): class Font(object):
@ -622,8 +623,7 @@ class PDFDocument(object):
def __init__(self, xml, opts, log): def __init__(self, xml, opts, log):
self.opts, self.log = opts, log self.opts, self.log = opts, log
parser = etree.XMLParser(recover=True) self.root = safe_xml_fromstring(xml)
self.root = etree.fromstring(xml, parser=parser)
idc = count() idc = count()
self.fonts = [] self.fonts = []

View File

@ -14,6 +14,7 @@ import re
from lxml import etree from lxml import etree
from calibre.ebooks.pdb.ereader import image_name from calibre.ebooks.pdb.ereader import image_name
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.pml import unipmlcode from calibre.ebooks.pml import unipmlcode
from polyglot.builtins import unicode_type, string_or_bytes from polyglot.builtins import unicode_type, string_or_bytes
@ -138,7 +139,7 @@ class PMLMLizer(object):
self.log.debug('Converting %s to PML markup...' % item.href) self.log.debug('Converting %s to PML markup...' % item.href)
content = etree.tostring(item.data, encoding='unicode') content = etree.tostring(item.data, encoding='unicode')
content = self.prepare_text(content) content = self.prepare_text(content)
content = etree.fromstring(content) content = safe_xml_fromstring(content)
stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile) stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
text.append(self.add_page_anchor(item)) text.append(self.add_page_anchor(item))
text += self.dump_text(content.find(XHTML('body')), stylizer, item) text += self.dump_text(content.find(XHTML('body')), stylizer, item)

View File

@ -109,6 +109,7 @@ class RTFMLizer(object):
def mlize_spine(self): def mlize_spine(self):
from calibre.ebooks.oeb.base import XHTML from calibre.ebooks.oeb.base import XHTML
from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.utils.xml_parse import safe_xml_fromstring
output = self.header() output = self.header()
if 'titlepage' in self.oeb_book.guide: if 'titlepage' in self.oeb_book.guide:
href = self.oeb_book.guide['titlepage'].href href = self.oeb_book.guide['titlepage'].href
@ -126,7 +127,7 @@ class RTFMLizer(object):
content = re.sub('<!--.*?-->', '', etree.tostring(item.data, encoding='unicode'), flags=re.DOTALL) content = re.sub('<!--.*?-->', '', etree.tostring(item.data, encoding='unicode'), flags=re.DOTALL)
content = self.remove_newlines(content) content = self.remove_newlines(content)
content = self.remove_tabs(content) content = self.remove_tabs(content)
content = etree.fromstring(content) content = safe_xml_fromstring(content)
stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile) stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
self.currently_dumping_item = item self.currently_dumping_item = item
output += self.dump_text(content.find(XHTML('body')), stylizer) output += self.dump_text(content.find(XHTML('body')), stylizer)

View File

@ -84,6 +84,7 @@ class SNBMLizer(object):
def mlize(self): def mlize(self):
from calibre.ebooks.oeb.base import XHTML from calibre.ebooks.oeb.base import XHTML
from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.utils.xml_parse import safe_xml_fromstring
output = [u''] output = [u'']
stylizer = Stylizer(self.item.data, self.item.href, self.oeb_book, self.opts, self.opts.output_profile) stylizer = Stylizer(self.item.data, self.item.href, self.oeb_book, self.opts, self.opts.output_profile)
content = etree.tostring(self.item.data.find(XHTML('body')), encoding='unicode') content = etree.tostring(self.item.data.find(XHTML('body')), encoding='unicode')
@ -98,7 +99,7 @@ class SNBMLizer(object):
etree.SubElement(snbcTree, "body") etree.SubElement(snbcTree, "body")
trees[subitem] = snbcTree trees[subitem] = snbcTree
output.append('%s%s\n\n' % (CALIBRE_SNB_BM_TAG, "")) output.append('%s%s\n\n' % (CALIBRE_SNB_BM_TAG, ""))
output += self.dump_text(self.subitems, etree.fromstring(content), stylizer)[0] output += self.dump_text(self.subitems, safe_xml_fromstring(content), stylizer)[0]
output = self.cleanup_text(''.join(output)) output = self.cleanup_text(''.join(output))
subitem = '' subitem = ''

View File

@ -67,6 +67,7 @@ class TXTMLizer(object):
def mlize_spine(self): def mlize_spine(self):
from calibre.ebooks.oeb.base import XHTML from calibre.ebooks.oeb.base import XHTML
from calibre.ebooks.oeb.stylizer import Stylizer from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.utils.xml_parse import safe_xml_fromstring
output = [u''] output = [u'']
output.append(self.get_toc()) output.append(self.get_toc())
for item in self.oeb_book.spine: for item in self.oeb_book.spine:
@ -76,7 +77,7 @@ class TXTMLizer(object):
x.text = x.text.replace('--', '__') x.text = x.text.replace('--', '__')
content = etree.tostring(item.data, encoding='unicode') content = etree.tostring(item.data, encoding='unicode')
content = self.remove_newlines(content) content = self.remove_newlines(content)
content = etree.fromstring(content) content = safe_xml_fromstring(content)
stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile) stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
output += self.dump_text(content.find(XHTML('body')), stylizer, item) output += self.dump_text(content.find(XHTML('body')), stylizer, item)
output += '\n\n\n\n\n\n' output += '\n\n\n\n\n\n'

View File

@ -15,6 +15,7 @@ from PyQt5.Qt import (
from lxml import etree from lxml import etree
from calibre.gui2 import choose_files, error_dialog from calibre.gui2 import choose_files, error_dialog
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.utils.icu import sort_key from calibre.utils.icu import sort_key
from polyglot.builtins import unicode_type from polyglot.builtins import unicode_type
@ -32,7 +33,7 @@ def uniq(vals, kmap=lambda x:x):
def import_opml(raw, preserve_groups=True): def import_opml(raw, preserve_groups=True):
root = etree.fromstring(raw) root = safe_xml_fromstring(raw)
groups = defaultdict(list) groups = defaultdict(list)
ax = etree.XPath('ancestor::outline[@title or @text]') ax = etree.XPath('ancestor::outline[@title or @text]')
for outline in root.xpath('//outline[@type="rss" and @xmlUrl]'): for outline in root.xpath('//outline[@type="rss" and @xmlUrl]'):

View File

@ -8,12 +8,11 @@ __docformat__ = 'restructuredtext en'
from contextlib import closing from contextlib import closing
from lxml import etree
from PyQt5.Qt import QUrl from PyQt5.Qt import QUrl
from calibre import (browser, guess_extension) from calibre import (browser, guess_extension)
from calibre.gui2 import open_url from calibre.gui2 import open_url
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.gui2.store import StorePlugin from calibre.gui2.store import StorePlugin
from calibre.gui2.store.search_result import SearchResult from calibre.gui2.store.search_result import SearchResult
from calibre.gui2.store.web_store_dialog import WebStoreDialog from calibre.gui2.store.web_store_dialog import WebStoreDialog
@ -36,7 +35,7 @@ def open_search(url, query, max_results=10, timeout=60):
counter = max_results counter = max_results
br = browser() br = browser()
with closing(br.open(url, timeout=timeout)) as f: with closing(br.open(url, timeout=timeout)) as f:
doc = etree.fromstring(f.read()) doc = safe_xml_fromstring(f.read())
for data in doc.xpath('//*[local-name() = "entry"]'): for data in doc.xpath('//*[local-name() = "entry"]'):
if counter <= 0: if counter <= 0:
break break

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
store_version = 5 # Needed for dynamic plugin loading store_version = 6 # Needed for dynamic plugin loading
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2011, 2013, John Schember <john@nachtimwald.com>' __copyright__ = '2011, 2013, John Schember <john@nachtimwald.com>'
@ -43,7 +43,7 @@ def search(query, max_results=10, timeout=60, write_raw_to=None):
if write_raw_to is not None: if write_raw_to is not None:
with open(write_raw_to, 'wb') as f: with open(write_raw_to, 'wb') as f:
f.write(raw) f.write(raw)
doc = etree.fromstring(raw) doc = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
for data in doc.xpath('//*[local-name() = "entry"]'): for data in doc.xpath('//*[local-name() = "entry"]'):
if counter <= 0: if counter <= 0:
break break
@ -63,7 +63,7 @@ def search(query, max_results=10, timeout=60, write_raw_to=None):
# Get the formats and direct download links. # Get the formats and direct download links.
with closing(br.open(id, timeout=timeout/4)) as nf: with closing(br.open(id, timeout=timeout/4)) as nf:
ndoc = etree.fromstring(nf.read()) ndoc = etree.fromstring(nf.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'): for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
type = link.get('type') type = link.get('type')
href = link.get('href') href = link.get('href')

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
store_version = 1 # Needed for dynamic plugin loading store_version = 2 # Needed for dynamic plugin loading
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>' __copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>'
@ -63,8 +63,7 @@ class LitResStore(BasicStoreConfig, StorePlugin):
ungzipResponse(r,br) ungzipResponse(r,br)
raw= xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0] raw= xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0]
parser = etree.XMLParser(recover=True, no_network=True) doc = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
doc = etree.fromstring(raw, parser=parser)
for data in doc.xpath('//*[local-name() = "fb2-book"]'): for data in doc.xpath('//*[local-name() = "fb2-book"]'):
if counter <= 0: if counter <= 0:
break break

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
store_version = 1 # Needed for dynamic plugin loading store_version = 2 # Needed for dynamic plugin loading
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>' __copyright__ = '2011, John Schember <john@nachtimwald.com>'
@ -46,7 +46,7 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://
with closing(br.open(url, timeout=timeout)) as f: with closing(br.open(url, timeout=timeout)) as f:
raw_data = f.read() raw_data = f.read()
raw_data = raw_data.decode('utf-8', 'replace') raw_data = raw_data.decode('utf-8', 'replace')
doc = etree.fromstring(raw_data) doc = etree.fromstring(raw_data, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
for data in doc.xpath('//*[local-name() = "entry"]'): for data in doc.xpath('//*[local-name() = "entry"]'):
if counter <= 0: if counter <= 0:
break break
@ -71,7 +71,7 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://
# Follow the detail link to get the rest of the info. # Follow the detail link to get the rest of the info.
with closing(br.open(detail_href, timeout=timeout/4)) as df: with closing(br.open(detail_href, timeout=timeout/4)) as df:
ddoc = etree.fromstring(df.read()) ddoc = etree.fromstring(df.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
ddata = ddoc.xpath('//*[local-name() = "entry"][1]') ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
if ddata: if ddata:
ddata = ddata[0] ddata = ddata[0]

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
store_version = 1 # Needed for dynamic plugin loading store_version = 2 # Needed for dynamic plugin loading
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>' __copyright__ = '2011, John Schember <john@nachtimwald.com>'
@ -47,7 +47,7 @@ class XinXiiStore(BasicStoreConfig, OpenSearchOPDSStore):
counter = max_results counter = max_results
br = browser() br = browser()
with closing(br.open(url, timeout=timeout)) as f: with closing(br.open(url, timeout=timeout)) as f:
doc = etree.fromstring(f.read()) doc = etree.fromstring(f.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
for data in doc.xpath('//*[local-name() = "entry"]'): for data in doc.xpath('//*[local-name() = "entry"]'):
if counter <= 0: if counter <= 0:
break break

View File

@ -28,6 +28,7 @@ from calibre.gui2.tweak_book.editor.text import PlainTextEdit, default_font_fami
from calibre.gui2.tweak_book.editor.themes import theme_color, get_theme from calibre.gui2.tweak_book.editor.themes import theme_color, get_theme
from calibre.gui2.tweak_book.diff import get_sequence_matcher from calibre.gui2.tweak_book.diff import get_sequence_matcher
from calibre.gui2.tweak_book.diff.highlight import get_highlighter from calibre.gui2.tweak_book.diff.highlight import get_highlighter
from calibre.utils.xml_parse import safe_xml_fromstring
Change = namedtuple('Change', 'ltop lbot rtop rbot kind') Change = namedtuple('Change', 'ltop lbot rtop rbot kind')
@ -47,7 +48,7 @@ def beautify_text(raw, syntax):
from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree, pretty_html_tree from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree, pretty_html_tree
from calibre.ebooks.chardet import strip_encoding_declarations from calibre.ebooks.chardet import strip_encoding_declarations
if syntax == 'xml': if syntax == 'xml':
root = etree.fromstring(strip_encoding_declarations(raw)) root = safe_xml_fromstring(strip_encoding_declarations(raw))
pretty_xml_tree(root) pretty_xml_tree(root)
elif syntax == 'css': elif syntax == 'css':
import logging import logging

View File

@ -21,6 +21,7 @@ from calibre import (
replace_entities, strftime, xml_replace_entities replace_entities, strftime, xml_replace_entities
) )
from calibre.constants import cache_dir, isosx from calibre.constants import cache_dir, isosx
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.customize.conversion import DummyReporter from calibre.customize.conversion import DummyReporter
from calibre.customize.ui import output_profiles from calibre.customize.ui import output_profiles
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, prettify from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, prettify
@ -2992,7 +2993,7 @@ class CatalogBuilder(object):
<navMap/> <navMap/>
</ncx> </ncx>
''' '''
root = self.ncx_root = etree.fromstring(header) root = self.ncx_root = safe_xml_fromstring(header)
navMapTag = root[0] navMapTag = root[0]
if self.generate_for_kindle_mobi: if self.generate_for_kindle_mobi:
@ -3668,7 +3669,7 @@ class CatalogBuilder(object):
lang=prepare_string_for_xml(lang), lang=prepare_string_for_xml(lang),
pt="periodical:default" if self.generate_for_kindle_mobi else "" pt="periodical:default" if self.generate_for_kindle_mobi else ""
) )
root = etree.fromstring(header) root = safe_xml_fromstring(header)
manifest = root.xpath('//*[local-name()="manifest"]')[0] manifest = root.xpath('//*[local-name()="manifest"]')[0]
spine = root.xpath('//*[local-name()="spine"]')[0] spine = root.xpath('//*[local-name()="spine"]')[0]
guide = root.xpath('//*[local-name()="guide"]')[0] guide = root.xpath('//*[local-name()="guide"]')[0]

View File

@ -10,6 +10,7 @@ import sys, glob, os, tempfile, re, codecs
from lxml import etree from lxml import etree
from calibre.constants import config_dir from calibre.constants import config_dir
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.utils.zipfile import ZipFile from calibre.utils.zipfile import ZipFile
from polyglot.builtins import iteritems from polyglot.builtins import iteritems
@ -26,7 +27,7 @@ BUILTIN_LOCALES = {'en-US', 'en-GB', 'es-ES'}
def parse_xcu(raw, origin='%origin%'): def parse_xcu(raw, origin='%origin%'):
' Get the dictionary and affix file names as well as supported locales for each dictionary ' ' Get the dictionary and affix file names as well as supported locales for each dictionary '
ans = {} ans = {}
root = etree.fromstring(raw) root = safe_xml_fromstring(raw)
for node in XPath('//prop[@oor:name="Format"]/value[text()="DICT_SPELL"]/../..')(root): for node in XPath('//prop[@oor:name="Format"]/value[text()="DICT_SPELL"]/../..')(root):
value = XPath('descendant::prop[@oor:name="Locations"]/value')(node) value = XPath('descendant::prop[@oor:name="Locations"]/value')(node)
@ -123,7 +124,7 @@ def import_from_oxt(source_path, name, dest_dir=None, prefix='dic-'):
key = key[3:] key = key[3:]
return zf.open(key.lstrip('/')).read() return zf.open(key.lstrip('/')).read()
root = etree.fromstring(zf.open('META-INF/manifest.xml').read()) root = safe_xml_fromstring(zf.open('META-INF/manifest.xml').read())
xcu = XPath('//manifest:file-entry[@manifest:media-type="application/vnd.sun.star.configuration-data"]')(root)[0].get( xcu = XPath('//manifest:file-entry[@manifest:media-type="application/vnd.sun.star.configuration-data"]')(root)[0].get(
'{%s}full-path' % NS_MAP['manifest']) '{%s}full-path' % NS_MAP['manifest'])
for (dic, aff), locales in iteritems(parse_xcu(zf.open(xcu).read(), origin='')): for (dic, aff), locales in iteritems(parse_xcu(zf.open(xcu).read(), origin='')):

View File

@ -15,6 +15,7 @@ from lxml.builder import ElementMaker
from calibre.constants import __appname__ from calibre.constants import __appname__
from calibre.db.view import sanitize_sort_field_name from calibre.db.view import sanitize_sort_field_name
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.metadata import fmt_sidx, authors_to_string, rating_to_stars from calibre.ebooks.metadata import fmt_sidx, authors_to_string, rating_to_stars
from calibre.library.comments import comments_to_html from calibre.library.comments import comments_to_html
from calibre import guess_type, prepare_string_for_xml as xml from calibre import guess_type, prepare_string_for_xml as xml
@ -123,7 +124,7 @@ def html_to_lxml(raw):
root.set('xmlns', "http://www.w3.org/1999/xhtml") root.set('xmlns', "http://www.w3.org/1999/xhtml")
raw = etree.tostring(root, encoding=None) raw = etree.tostring(root, encoding=None)
try: try:
return etree.fromstring(raw) return safe_xml_fromstring(raw)
except: except:
for x in root.iterdescendants(): for x in root.iterdescendants():
remove = [] remove = []
@ -134,7 +135,7 @@ def html_to_lxml(raw):
del x.attrib[a] del x.attrib[a]
raw = etree.tostring(root, encoding=None) raw = etree.tostring(root, encoding=None)
try: try:
return etree.fromstring(raw) return safe_xml_fromstring(raw)
except: except:
from calibre.ebooks.oeb.parse_utils import _html4_parse from calibre.ebooks.oeb.parse_utils import _html4_parse
return _html4_parse(raw) return _html4_parse(raw)

View File

@ -11,9 +11,8 @@ __docformat__ = 'restructuredtext en'
from contextlib import closing from contextlib import closing
from lxml import etree
from calibre import browser from calibre import browser
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.utils.opensearch.url import URL from calibre.utils.opensearch.url import URL
@ -38,7 +37,7 @@ class Description(object):
''' '''
br = browser() br = browser()
with closing(br.open(url, timeout=15)) as f: with closing(br.open(url, timeout=15)) as f:
doc = etree.fromstring(f.read()) doc = safe_xml_fromstring(f.read())
# version 1.1 has repeating Url elements. # version 1.1 has repeating Url elements.
self.urls = [] self.urls = []

View File

@ -0,0 +1,19 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals
from lxml import etree
# Shared, hardened lxml parser instances used by safe_xml_fromstring().
#
# resolve_entities is turned off as entities can cause
# reads of local files, for example:
# <!DOCTYPE foo [ <!ENTITY passwd SYSTEM "file:///etc/passwd" >]>
# no_network=True is also passed so the parser is not allowed to fetch
# related resources over the network.

# Lenient variant: recover=True asks lxml to try to parse through broken XML.
SAFE_XML_PARSER = etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
# Strict variant: same hardening options, but with recovery disabled.
SAFE_XML_PARSER_NO_RECOVER = etree.XMLParser(recover=False, no_network=True, resolve_entities=False)
# Module-level alias for etree.fromstring.
fs = etree.fromstring
def safe_xml_fromstring(string_or_bytes, recover=True):
    '''
    Parse XML from a string or bytes object using one of the module's
    hardened parsers (entity resolution and network access disabled).

    :param string_or_bytes: The XML document to parse.
    :param recover: When true (the default) the lenient SAFE_XML_PARSER is
        used; otherwise the strict SAFE_XML_PARSER_NO_RECOVER is used.
    :return: Whatever etree.fromstring returns for the chosen parser.
    '''
    if recover:
        parser = SAFE_XML_PARSER
    else:
        parser = SAFE_XML_PARSER_NO_RECOVER
    return fs(string_or_bytes, parser)

View File

@ -14,6 +14,7 @@ from lxml import etree
from lxml.builder import ElementMaker from lxml.builder import ElementMaker
from calibre import force_unicode from calibre import force_unicode
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.constants import numeric_version from calibre.constants import numeric_version
from calibre.utils.iso8601 import parse_iso8601 from calibre.utils.iso8601 import parse_iso8601
from calibre.utils.date import now as nowf, utcnow, local_tz, isoformat, EPOCH, UNDEFINED_DATE from calibre.utils.date import now as nowf, utcnow, local_tz, isoformat, EPOCH, UNDEFINED_DATE
@ -124,7 +125,7 @@ def get_custom_recipe_collection(*args):
import traceback import traceback
traceback.print_exc() traceback.print_exc()
continue continue
return etree.fromstring(serialize_collection(rmap)) return safe_xml_fromstring(serialize_collection(rmap))
def update_custom_recipe(id_, title, script): def update_custom_recipe(id_, title, script):
@ -287,7 +288,7 @@ class SchedulerConfig(object):
if os.access(self.conf_path, os.R_OK): if os.access(self.conf_path, os.R_OK):
with ExclusiveFile(self.conf_path) as f: with ExclusiveFile(self.conf_path) as f:
try: try:
self.root = etree.fromstring(f.read()) self.root = safe_xml_fromstring(f.read())
except: except:
print('Failed to read recipe scheduler config') print('Failed to read recipe scheduler config')
import traceback import traceback