Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Do not resolve entities when parsing XML
Resolving entities is dangerous since lxml will actually read file:// URLs in entity definitions. Fixes #1857800 [Private bug](https://bugs.launchpad.net/calibre/+bug/1857800)
This commit is contained in:
Parent: 589079c6aa
Commit: 68febe94ca
@ -92,7 +92,7 @@ def uuid():
|
|||||||
class XMLCache(object):
|
class XMLCache(object):
|
||||||
|
|
||||||
def __init__(self, paths, ext_paths, prefixes, use_author_sort):
|
def __init__(self, paths, ext_paths, prefixes, use_author_sort):
|
||||||
from lxml import etree
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
|
|
||||||
if DEBUG:
|
if DEBUG:
|
||||||
debug_print('Building XMLCache...', paths)
|
debug_print('Building XMLCache...', paths)
|
||||||
@ -101,7 +101,6 @@ class XMLCache(object):
|
|||||||
self.use_author_sort = use_author_sort
|
self.use_author_sort = use_author_sort
|
||||||
|
|
||||||
# Parse XML files {{{
|
# Parse XML files {{{
|
||||||
parser = etree.XMLParser(recover=True)
|
|
||||||
self.roots = {}
|
self.roots = {}
|
||||||
for source_id, path in paths.items():
|
for source_id, path in paths.items():
|
||||||
if source_id == 0:
|
if source_id == 0:
|
||||||
@ -116,10 +115,9 @@ class XMLCache(object):
|
|||||||
with lopen(path, 'rb') as f:
|
with lopen(path, 'rb') as f:
|
||||||
raw = f.read()
|
raw = f.read()
|
||||||
|
|
||||||
self.roots[source_id] = etree.fromstring(xml_to_unicode(
|
self.roots[source_id] = safe_xml_fromstring(
|
||||||
raw, strip_encoding_pats=True, assume_utf8=True,
|
xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True, verbose=DEBUG)[0]
|
||||||
verbose=DEBUG)[0],
|
)
|
||||||
parser=parser)
|
|
||||||
if self.roots[source_id] is None:
|
if self.roots[source_id] is None:
|
||||||
raise Exception(('The SONY database at %r is corrupted. Try '
|
raise Exception(('The SONY database at %r is corrupted. Try '
|
||||||
' disconnecting and reconnecting your reader.')%path)
|
' disconnecting and reconnecting your reader.')%path)
|
||||||
@ -136,10 +134,9 @@ class XMLCache(object):
|
|||||||
if os.access(path, os.W_OK):
|
if os.access(path, os.W_OK):
|
||||||
try:
|
try:
|
||||||
with lopen(path, 'rb') as f:
|
with lopen(path, 'rb') as f:
|
||||||
self.ext_roots[source_id] = etree.fromstring(
|
self.ext_roots[source_id] = safe_xml_fromstring(
|
||||||
xml_to_unicode(f.read(),
|
xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True, verbose=DEBUG)[0]
|
||||||
strip_encoding_pats=True, assume_utf8=True,
|
)
|
||||||
verbose=DEBUG)[0], parser=parser)
|
|
||||||
self.ext_paths[source_id] = path
|
self.ext_paths[source_id] = path
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
@ -51,9 +51,9 @@ def return_raster_image(path):
|
|||||||
|
|
||||||
|
|
||||||
def extract_cover_from_embedded_svg(html, base, log):
|
def extract_cover_from_embedded_svg(html, base, log):
|
||||||
from lxml import etree
|
|
||||||
from calibre.ebooks.oeb.base import XPath, SVG, XLINK
|
from calibre.ebooks.oeb.base import XPath, SVG, XLINK
|
||||||
root = etree.fromstring(html)
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
|
root = safe_xml_fromstring(html)
|
||||||
|
|
||||||
svg = XPath('//svg:svg')(root)
|
svg = XPath('//svg:svg')(root)
|
||||||
if len(svg) == 1 and len(svg[0]) == 1 and svg[0][0].tag == SVG('image'):
|
if len(svg) == 1 and len(svg[0]) == 1 and svg[0][0].tag == SVG('image'):
|
||||||
|
@ -231,7 +231,7 @@ class EPUBInput(InputFormatPlugin):
|
|||||||
return removed
|
return removed
|
||||||
|
|
||||||
def find_opf(self):
|
def find_opf(self):
|
||||||
from lxml import etree
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
|
|
||||||
def attr(n, attr):
|
def attr(n, attr):
|
||||||
for k, v in n.attrib.items():
|
for k, v in n.attrib.items():
|
||||||
@ -239,7 +239,7 @@ class EPUBInput(InputFormatPlugin):
|
|||||||
return v
|
return v
|
||||||
try:
|
try:
|
||||||
with lopen('META-INF/container.xml', 'rb') as f:
|
with lopen('META-INF/container.xml', 'rb') as f:
|
||||||
root = etree.fromstring(f.read())
|
root = safe_xml_fromstring(f.read())
|
||||||
for r in root.xpath('//*[local-name()="rootfile"]'):
|
for r in root.xpath('//*[local-name()="rootfile"]'):
|
||||||
if attr(r, 'media-type') != "application/oebps-package+xml":
|
if attr(r, 'media-type') != "application/oebps-package+xml":
|
||||||
continue
|
continue
|
||||||
@ -356,12 +356,13 @@ class EPUBInput(InputFormatPlugin):
|
|||||||
from calibre.ebooks.oeb.polish.parsing import parse
|
from calibre.ebooks.oeb.polish.parsing import parse
|
||||||
from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
|
from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
|
||||||
from calibre.ebooks.oeb.polish.toc import first_child
|
from calibre.ebooks.oeb.polish.toc import first_child
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from tempfile import NamedTemporaryFile
|
from tempfile import NamedTemporaryFile
|
||||||
with lopen(nav_path, 'rb') as f:
|
with lopen(nav_path, 'rb') as f:
|
||||||
raw = f.read()
|
raw = f.read()
|
||||||
raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
|
raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
|
||||||
root = parse(raw, log=log)
|
root = parse(raw, log=log)
|
||||||
ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
|
ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
|
||||||
navmap = ncx[0]
|
navmap = ncx[0]
|
||||||
et = '{%s}type' % EPUB_NS
|
et = '{%s}type' % EPUB_NS
|
||||||
bn = os.path.basename(nav_path)
|
bn = os.path.basename(nav_path)
|
||||||
|
@ -39,10 +39,11 @@ class FB2Input(InputFormatPlugin):
|
|||||||
def convert(self, stream, options, file_ext, log,
|
def convert(self, stream, options, file_ext, log,
|
||||||
accelerators):
|
accelerators):
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
|
from calibre.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
|
||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
from calibre.ebooks.metadata.meta import get_metadata
|
from calibre.ebooks.metadata.meta import get_metadata
|
||||||
from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER
|
from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
self.log = log
|
self.log = log
|
||||||
log.debug('Parsing XML...')
|
log.debug('Parsing XML...')
|
||||||
@ -51,15 +52,9 @@ class FB2Input(InputFormatPlugin):
|
|||||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||||
assume_utf8=True, resolve_entities=True)[0]
|
assume_utf8=True, resolve_entities=True)[0]
|
||||||
try:
|
try:
|
||||||
doc = etree.fromstring(raw)
|
doc = safe_xml_fromstring(raw)
|
||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
try:
|
doc = safe_xml_fromstring(raw.replace('& ', '&'))
|
||||||
doc = etree.fromstring(raw, parser=RECOVER_PARSER)
|
|
||||||
if doc is None:
|
|
||||||
raise Exception('parse failed')
|
|
||||||
except:
|
|
||||||
doc = etree.fromstring(raw.replace('& ', '&'),
|
|
||||||
parser=RECOVER_PARSER)
|
|
||||||
if doc is None:
|
if doc is None:
|
||||||
raise ValueError('The FB2 file is not valid XML')
|
raise ValueError('The FB2 file is not valid XML')
|
||||||
doc = ensure_namespace(doc)
|
doc = ensure_namespace(doc)
|
||||||
@ -99,7 +94,7 @@ class FB2Input(InputFormatPlugin):
|
|||||||
ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
|
ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
|
||||||
re.DOTALL).sub('', ss)
|
re.DOTALL).sub('', ss)
|
||||||
|
|
||||||
styledoc = etree.fromstring(ss)
|
styledoc = safe_xml_fromstring(ss)
|
||||||
|
|
||||||
transform = etree.XSLT(styledoc)
|
transform = etree.XSLT(styledoc)
|
||||||
result = transform(doc)
|
result = transform(doc)
|
||||||
|
@ -43,7 +43,7 @@ class LITInput(InputFormatPlugin):
|
|||||||
from calibre.ebooks.txt.processor import convert_basic, \
|
from calibre.ebooks.txt.processor import convert_basic, \
|
||||||
separate_paragraphs_single_line
|
separate_paragraphs_single_line
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from lxml import etree
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
import copy
|
import copy
|
||||||
self.log('LIT file with all text in singe <pre> tag detected')
|
self.log('LIT file with all text in singe <pre> tag detected')
|
||||||
html = separate_paragraphs_single_line(pre.text)
|
html = separate_paragraphs_single_line(pre.text)
|
||||||
@ -55,7 +55,7 @@ class LITInput(InputFormatPlugin):
|
|||||||
# SmartyPants skips text inside <pre> tags
|
# SmartyPants skips text inside <pre> tags
|
||||||
from calibre.ebooks.conversion.preprocess import smarten_punctuation
|
from calibre.ebooks.conversion.preprocess import smarten_punctuation
|
||||||
html = smarten_punctuation(html, self.log)
|
html = smarten_punctuation(html, self.log)
|
||||||
root = etree.fromstring(html)
|
root = safe_xml_fromstring(html)
|
||||||
body = XPath('//h:body')(root)
|
body = XPath('//h:body')(root)
|
||||||
pre.tag = XHTML('div')
|
pre.tag = XHTML('div')
|
||||||
pre.text = ''
|
pre.text = ''
|
||||||
|
@ -20,25 +20,19 @@ class LRFInput(InputFormatPlugin):
|
|||||||
|
|
||||||
def convert(self, stream, options, file_ext, log,
|
def convert(self, stream, options, file_ext, log,
|
||||||
accelerators):
|
accelerators):
|
||||||
from lxml import etree
|
|
||||||
from calibre.ebooks.lrf.input import (MediaType, Styles, TextBlock,
|
from calibre.ebooks.lrf.input import (MediaType, Styles, TextBlock,
|
||||||
Canvas, ImageBlock, RuledLine)
|
Canvas, ImageBlock, RuledLine)
|
||||||
self.log = log
|
self.log = log
|
||||||
self.log('Generating XML')
|
self.log('Generating XML')
|
||||||
from calibre.ebooks.lrf.lrfparser import LRFDocument
|
from calibre.ebooks.lrf.lrfparser import LRFDocument
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
|
from lxml import etree
|
||||||
d = LRFDocument(stream)
|
d = LRFDocument(stream)
|
||||||
d.parse()
|
d.parse()
|
||||||
xml = d.to_xml(write_files=True)
|
xml = d.to_xml(write_files=True)
|
||||||
if options.verbose > 2:
|
if options.verbose > 2:
|
||||||
open(u'lrs.xml', 'wb').write(xml.encode('utf-8'))
|
open(u'lrs.xml', 'wb').write(xml.encode('utf-8'))
|
||||||
parser = etree.XMLParser(no_network=True, huge_tree=True)
|
doc = safe_xml_fromstring(xml)
|
||||||
try:
|
|
||||||
doc = etree.fromstring(xml, parser=parser)
|
|
||||||
except:
|
|
||||||
self.log.warn('Failed to parse XML. Trying to recover')
|
|
||||||
parser = etree.XMLParser(no_network=True, huge_tree=True,
|
|
||||||
recover=True)
|
|
||||||
doc = etree.fromstring(xml, parser=parser)
|
|
||||||
|
|
||||||
char_button_map = {}
|
char_button_map = {}
|
||||||
for x in doc.xpath('//CharButton[@refobj]'):
|
for x in doc.xpath('//CharButton[@refobj]'):
|
||||||
@ -60,7 +54,7 @@ class LRFInput(InputFormatPlugin):
|
|||||||
plot_map[ro] = imgstr[0].get('file')
|
plot_map[ro] = imgstr[0].get('file')
|
||||||
|
|
||||||
self.log('Converting XML to HTML...')
|
self.log('Converting XML to HTML...')
|
||||||
styledoc = etree.fromstring(P('templates/lrf.xsl', data=True))
|
styledoc = safe_xml_fromstring(P('templates/lrf.xsl', data=True))
|
||||||
media_type = MediaType()
|
media_type = MediaType()
|
||||||
styles = Styles()
|
styles = Styles()
|
||||||
text_block = TextBlock(styles, char_button_map, plot_map, log)
|
text_block = TextBlock(styles, char_button_map, plot_map, log)
|
||||||
|
@ -251,6 +251,7 @@ class RTFInput(InputFormatPlugin):
|
|||||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||||
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
|
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
|
||||||
from calibre.ebooks.rtf.input import InlineClass
|
from calibre.ebooks.rtf.input import InlineClass
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
self.opts = options
|
self.opts = options
|
||||||
self.log = log
|
self.log = log
|
||||||
self.log('Converting RTF to XML...')
|
self.log('Converting RTF to XML...')
|
||||||
@ -270,8 +271,7 @@ class RTFInput(InputFormatPlugin):
|
|||||||
self.log.exception('Failed to extract images...')
|
self.log.exception('Failed to extract images...')
|
||||||
|
|
||||||
self.log('Parsing XML...')
|
self.log('Parsing XML...')
|
||||||
parser = etree.XMLParser(recover=True, no_network=True)
|
doc = safe_xml_fromstring(xml)
|
||||||
doc = etree.fromstring(xml, parser=parser)
|
|
||||||
border_styles = self.convert_borders(doc)
|
border_styles = self.convert_borders(doc)
|
||||||
for pict in doc.xpath('//rtf:pict[@num]',
|
for pict in doc.xpath('//rtf:pict[@num]',
|
||||||
namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}):
|
namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}):
|
||||||
@ -282,7 +282,7 @@ class RTFInput(InputFormatPlugin):
|
|||||||
|
|
||||||
self.log('Converting XML to HTML...')
|
self.log('Converting XML to HTML...')
|
||||||
inline_class = InlineClass(self.log)
|
inline_class = InlineClass(self.log)
|
||||||
styledoc = etree.fromstring(P('templates/rtf.xsl', data=True))
|
styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True))
|
||||||
extensions = {('calibre', 'inline-class') : inline_class}
|
extensions = {('calibre', 'inline-class') : inline_class}
|
||||||
transform = etree.XSLT(styledoc, extensions=extensions)
|
transform = etree.XSLT(styledoc, extensions=extensions)
|
||||||
result = transform(doc)
|
result = transform(doc)
|
||||||
|
@ -32,10 +32,10 @@ class SNBInput(InputFormatPlugin):
|
|||||||
def convert(self, stream, options, file_ext, log,
|
def convert(self, stream, options, file_ext, log,
|
||||||
accelerators):
|
accelerators):
|
||||||
import uuid
|
import uuid
|
||||||
from lxml import etree
|
|
||||||
|
|
||||||
from calibre.ebooks.oeb.base import DirContainer
|
from calibre.ebooks.oeb.base import DirContainer
|
||||||
from calibre.ebooks.snb.snbfile import SNBFile
|
from calibre.ebooks.snb.snbfile import SNBFile
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
|
|
||||||
log.debug("Parsing SNB file...")
|
log.debug("Parsing SNB file...")
|
||||||
snbFile = SNBFile()
|
snbFile = SNBFile()
|
||||||
@ -52,7 +52,7 @@ class SNBInput(InputFormatPlugin):
|
|||||||
encoding=options.input_encoding, populate=False)
|
encoding=options.input_encoding, populate=False)
|
||||||
meta = snbFile.GetFileStream('snbf/book.snbf')
|
meta = snbFile.GetFileStream('snbf/book.snbf')
|
||||||
if meta is not None:
|
if meta is not None:
|
||||||
meta = etree.fromstring(meta)
|
meta = safe_xml_fromstring(meta)
|
||||||
l = {'title' : './/head/name',
|
l = {'title' : './/head/name',
|
||||||
'creator' : './/head/author',
|
'creator' : './/head/author',
|
||||||
'language' : './/head/language',
|
'language' : './/head/language',
|
||||||
@ -87,7 +87,7 @@ class SNBInput(InputFormatPlugin):
|
|||||||
toc = snbFile.GetFileStream('snbf/toc.snbf')
|
toc = snbFile.GetFileStream('snbf/toc.snbf')
|
||||||
oeb.container = DirContainer(tdir, log)
|
oeb.container = DirContainer(tdir, log)
|
||||||
if toc is not None:
|
if toc is not None:
|
||||||
toc = etree.fromstring(toc)
|
toc = safe_xml_fromstring(toc)
|
||||||
i = 1
|
i = 1
|
||||||
for ch in toc.find('.//body'):
|
for ch in toc.find('.//body'):
|
||||||
chapterName = ch.text
|
chapterName = ch.text
|
||||||
@ -96,7 +96,7 @@ class SNBInput(InputFormatPlugin):
|
|||||||
data = snbFile.GetFileStream('snbc/' + chapterSrc)
|
data = snbFile.GetFileStream('snbc/' + chapterSrc)
|
||||||
if data is None:
|
if data is None:
|
||||||
continue
|
continue
|
||||||
snbc = etree.fromstring(data)
|
snbc = safe_xml_fromstring(data)
|
||||||
lines = []
|
lines = []
|
||||||
for line in snbc.find('.//body'):
|
for line in snbc.find('.//body'):
|
||||||
if line.tag == 'text':
|
if line.tag == 'text':
|
||||||
|
@ -18,11 +18,12 @@ from calibre.ptempfile import PersistentTemporaryDirectory
|
|||||||
from calibre.utils.localization import canonicalize_lang
|
from calibre.utils.localization import canonicalize_lang
|
||||||
from calibre.utils.logging import default_log
|
from calibre.utils.logging import default_log
|
||||||
from calibre.utils.zipfile import ZipFile
|
from calibre.utils.zipfile import ZipFile
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
|
from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
|
||||||
|
|
||||||
|
|
||||||
def fromstring(raw, parser=RECOVER_PARSER):
|
def fromstring(raw, parser=RECOVER_PARSER):
|
||||||
return etree.fromstring(raw, parser=parser)
|
return safe_xml_fromstring(raw)
|
||||||
|
|
||||||
# Read metadata {{{
|
# Read metadata {{{
|
||||||
|
|
||||||
|
@ -11,6 +11,7 @@ from lxml import etree
|
|||||||
|
|
||||||
from calibre import walk
|
from calibre import walk
|
||||||
from calibre.utils.zipfile import ZipFile
|
from calibre.utils.zipfile import ZipFile
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
|
|
||||||
|
|
||||||
def pretty_all_xml_in_dir(path):
|
def pretty_all_xml_in_dir(path):
|
||||||
@ -19,7 +20,7 @@ def pretty_all_xml_in_dir(path):
|
|||||||
with open(f, 'r+b') as stream:
|
with open(f, 'r+b') as stream:
|
||||||
raw = stream.read()
|
raw = stream.read()
|
||||||
if raw:
|
if raw:
|
||||||
root = etree.fromstring(raw)
|
root = safe_xml_fromstring(raw)
|
||||||
stream.seek(0)
|
stream.seek(0)
|
||||||
stream.truncate()
|
stream.truncate()
|
||||||
stream.write(etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True))
|
stream.write(etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True))
|
||||||
|
@ -17,6 +17,7 @@ from lxml import etree
|
|||||||
from calibre import prepare_string_for_xml
|
from calibre import prepare_string_for_xml
|
||||||
from calibre.constants import __appname__, __version__
|
from calibre.constants import __appname__, __version__
|
||||||
from calibre.utils.localization import lang_as_iso639_1
|
from calibre.utils.localization import lang_as_iso639_1
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.utils.img import save_cover_data_to
|
from calibre.utils.img import save_cover_data_to
|
||||||
from calibre.ebooks.oeb.base import urlnormalize
|
from calibre.ebooks.oeb.base import urlnormalize
|
||||||
from polyglot.builtins import unicode_type, string_or_bytes, range, filter
|
from polyglot.builtins import unicode_type, string_or_bytes, range, filter
|
||||||
@ -69,7 +70,7 @@ class FB2MLizer(object):
|
|||||||
output = self.clean_text('\n'.join(output))
|
output = self.clean_text('\n'.join(output))
|
||||||
|
|
||||||
if self.opts.pretty_print:
|
if self.opts.pretty_print:
|
||||||
output = etree.tostring(etree.fromstring(output), encoding='unicode', pretty_print=True)
|
output = etree.tostring(safe_xml_fromstring(output), encoding='unicode', pretty_print=True)
|
||||||
|
|
||||||
return '<?xml version="1.0" encoding="UTF-8"?>\n' + output
|
return '<?xml version="1.0" encoding="UTF-8"?>\n' + output
|
||||||
|
|
||||||
|
@ -8,9 +8,8 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
from lxml import etree
|
|
||||||
|
|
||||||
from calibre.ebooks.docx.container import DOCX
|
from calibre.ebooks.docx.container import DOCX
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.ebooks.docx.writer.container import update_doc_props, xml2str
|
from calibre.ebooks.docx.writer.container import update_doc_props, xml2str
|
||||||
from calibre.utils.imghdr import identify
|
from calibre.utils.imghdr import identify
|
||||||
|
|
||||||
@ -61,11 +60,11 @@ def set_metadata(stream, mi):
|
|||||||
ap_raw = c.read(ap_name)
|
ap_raw = c.read(ap_name)
|
||||||
except Exception:
|
except Exception:
|
||||||
ap_raw = None
|
ap_raw = None
|
||||||
cp = etree.fromstring(dp_raw)
|
cp = safe_xml_fromstring(dp_raw)
|
||||||
update_doc_props(cp, mi, c.namespace)
|
update_doc_props(cp, mi, c.namespace)
|
||||||
replacements = {}
|
replacements = {}
|
||||||
if ap_raw is not None:
|
if ap_raw is not None:
|
||||||
ap = etree.fromstring(ap_raw)
|
ap = safe_xml_fromstring(ap_raw)
|
||||||
comp = ap.makeelement('{%s}Company' % c.namespace.namespaces['ep'])
|
comp = ap.makeelement('{%s}Company' % c.namespace.namespaces['ep'])
|
||||||
for child in tuple(ap):
|
for child in tuple(ap):
|
||||||
if child.tag == comp.tag:
|
if child.tag == comp.tag:
|
||||||
|
@ -12,13 +12,12 @@ import os
|
|||||||
import posixpath
|
import posixpath
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
|
|
||||||
from lxml import etree
|
|
||||||
|
|
||||||
from calibre import CurrentDir
|
from calibre import CurrentDir
|
||||||
from calibre.ebooks.metadata.opf import (
|
from calibre.ebooks.metadata.opf import (
|
||||||
get_metadata as get_metadata_from_opf, set_metadata as set_metadata_opf
|
get_metadata as get_metadata_from_opf, set_metadata as set_metadata_opf
|
||||||
)
|
)
|
||||||
from calibre.ebooks.metadata.opf2 import OPF
|
from calibre.ebooks.metadata.opf2 import OPF
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.ptempfile import TemporaryDirectory
|
from calibre.ptempfile import TemporaryDirectory
|
||||||
from calibre.utils.localunzip import LocalZipFile
|
from calibre.utils.localunzip import LocalZipFile
|
||||||
from calibre.utils.zipfile import BadZipfile, ZipFile, safe_replace
|
from calibre.utils.zipfile import BadZipfile, ZipFile, safe_replace
|
||||||
@ -42,7 +41,7 @@ class Container(dict):
|
|||||||
def __init__(self, stream=None):
|
def __init__(self, stream=None):
|
||||||
if not stream:
|
if not stream:
|
||||||
return
|
return
|
||||||
container = etree.fromstring(stream.read())
|
container = safe_xml_fromstring(stream.read())
|
||||||
if container.get('version', None) != '1.0':
|
if container.get('version', None) != '1.0':
|
||||||
raise EPubException("unsupported version of OCF")
|
raise EPubException("unsupported version of OCF")
|
||||||
rootfiles = container.xpath('./*[local-name()="rootfiles"]')
|
rootfiles = container.xpath('./*[local-name()="rootfiles"]')
|
||||||
@ -70,8 +69,7 @@ class Encryption(object):
|
|||||||
'http://www.idpf.org/2008/embedding'])
|
'http://www.idpf.org/2008/embedding'])
|
||||||
|
|
||||||
def __init__(self, raw):
|
def __init__(self, raw):
|
||||||
from lxml import etree
|
self.root = safe_xml_fromstring(raw) if raw else None
|
||||||
self.root = etree.fromstring(raw) if raw else None
|
|
||||||
self.entries = {}
|
self.entries = {}
|
||||||
if self.root is not None:
|
if self.root is not None:
|
||||||
for em in self.root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
|
for em in self.root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
|
||||||
|
@ -15,6 +15,7 @@ from lxml import etree
|
|||||||
|
|
||||||
from calibre.utils.date import parse_only_date
|
from calibre.utils.date import parse_only_date
|
||||||
from calibre.utils.img import save_cover_data_to
|
from calibre.utils.img import save_cover_data_to
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.utils.imghdr import identify
|
from calibre.utils.imghdr import identify
|
||||||
from calibre import guess_type, guess_all_extensions, prints, force_unicode
|
from calibre import guess_type, guess_all_extensions, prints, force_unicode
|
||||||
from calibre.ebooks.metadata import MetaInformation, check_isbn
|
from calibre.ebooks.metadata import MetaInformation, check_isbn
|
||||||
@ -315,9 +316,8 @@ def _parse_language(root, mi, ctx):
|
|||||||
|
|
||||||
|
|
||||||
def _get_fbroot(raw):
|
def _get_fbroot(raw):
|
||||||
parser = etree.XMLParser(recover=True, no_network=True)
|
|
||||||
raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
|
raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
|
||||||
root = etree.fromstring(raw, parser=parser)
|
root = safe_xml_fromstring(raw)
|
||||||
return ensure_namespace(root)
|
return ensure_namespace(root)
|
||||||
|
|
||||||
|
|
||||||
@ -452,5 +452,5 @@ def ensure_namespace(doc):
|
|||||||
import re
|
import re
|
||||||
raw = etree.tostring(doc, encoding='unicode')
|
raw = etree.tostring(doc, encoding='unicode')
|
||||||
raw = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>', raw)
|
raw = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>', raw)
|
||||||
doc = etree.fromstring(raw)
|
doc = safe_xml_fromstring(raw)
|
||||||
return doc
|
return doc
|
||||||
|
@ -11,9 +11,9 @@ Read metadata from LRX files
|
|||||||
|
|
||||||
import struct
|
import struct
|
||||||
from zlib import decompress
|
from zlib import decompress
|
||||||
from lxml import etree
|
|
||||||
|
|
||||||
from calibre.ebooks.metadata import MetaInformation, string_to_authors
|
from calibre.ebooks.metadata import MetaInformation, string_to_authors
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
|
|
||||||
|
|
||||||
def _read(f, at, amount):
|
def _read(f, at, amount):
|
||||||
@ -66,7 +66,7 @@ def get_metadata(f):
|
|||||||
info = decompress(f.read(compressed_size))
|
info = decompress(f.read(compressed_size))
|
||||||
if len(info) != uncompressed_size:
|
if len(info) != uncompressed_size:
|
||||||
raise ValueError('LRX file has malformed metadata section')
|
raise ValueError('LRX file has malformed metadata section')
|
||||||
root = etree.fromstring(info)
|
root = safe_xml_fromstring(info)
|
||||||
bi = root.find('BookInfo')
|
bi = root.find('BookInfo')
|
||||||
title = bi.find('Title')
|
title = bi.find('Title')
|
||||||
title_sort = title.get('reading', None)
|
title_sort = title.get('reading', None)
|
||||||
|
@ -23,6 +23,7 @@ from calibre.utils.localization import get_lang, canonicalize_lang
|
|||||||
from calibre import prints, guess_type
|
from calibre import prints, guess_type
|
||||||
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
|
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
|
||||||
from calibre.utils.config import tweaks
|
from calibre.utils.config import tweaks
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from polyglot.builtins import iteritems, unicode_type, getcwd, map
|
from polyglot.builtins import iteritems, unicode_type, getcwd, map
|
||||||
from polyglot.urllib import unquote, urlparse
|
from polyglot.urllib import unquote, urlparse
|
||||||
|
|
||||||
@ -1588,7 +1589,7 @@ def metadata_to_opf(mi, as_string=True, default_lang=None):
|
|||||||
is None else default_lang)
|
is None else default_lang)
|
||||||
mi.languages = [lang]
|
mi.languages = [lang]
|
||||||
|
|
||||||
root = etree.fromstring(textwrap.dedent(
|
root = safe_xml_fromstring(textwrap.dedent(
|
||||||
'''
|
'''
|
||||||
<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="uuid_id" version="2.0">
|
<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="uuid_id" version="2.0">
|
||||||
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
|
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
|
||||||
|
@ -7,9 +7,8 @@ from collections import defaultdict
|
|||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from lxml import etree
|
|
||||||
|
|
||||||
from calibre.ebooks.metadata.book import ALL_METADATA_FIELDS
|
from calibre.ebooks.metadata.book import ALL_METADATA_FIELDS
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.ebooks.metadata.opf2 import OPF
|
from calibre.ebooks.metadata.opf2 import OPF
|
||||||
from calibre.ebooks.metadata.opf3 import (
|
from calibre.ebooks.metadata.opf3 import (
|
||||||
parse_prefixes, reserved_prefixes, expand_prefix, read_identifiers,
|
parse_prefixes, reserved_prefixes, expand_prefix, read_identifiers,
|
||||||
@ -37,7 +36,7 @@ class TestOPF3(unittest.TestCase):
|
|||||||
ae = unittest.TestCase.assertEqual
|
ae = unittest.TestCase.assertEqual
|
||||||
|
|
||||||
def get_opf(self, metadata='', manifest=''):
|
def get_opf(self, metadata='', manifest=''):
|
||||||
return etree.fromstring(TEMPLATE.format(metadata=metadata, manifest=manifest))
|
return safe_xml_fromstring(TEMPLATE.format(metadata=metadata, manifest=manifest))
|
||||||
|
|
||||||
def test_prefix_parsing(self): # {{{
|
def test_prefix_parsing(self): # {{{
|
||||||
self.ae(parse_prefixes('foaf: http://xmlns.com/foaf/spec/\n dbp: http://dbpedia.org/ontology/'),
|
self.ae(parse_prefixes('foaf: http://xmlns.com/foaf/spec/\n dbp: http://dbpedia.org/ontology/'),
|
||||||
@ -523,7 +522,7 @@ class TestOPF3(unittest.TestCase):
|
|||||||
self.ae(v2, v3, '%s: %r != %r' % (field, v2, v3))
|
self.ae(v2, v3, '%s: %r != %r' % (field, v2, v3))
|
||||||
|
|
||||||
mi2 = OPF(BytesIO(raw.encode('utf-8'))).to_book_metadata()
|
mi2 = OPF(BytesIO(raw.encode('utf-8'))).to_book_metadata()
|
||||||
root = etree.fromstring(raw)
|
root = safe_xml_fromstring(raw)
|
||||||
root.set('version', '3.0')
|
root.set('version', '3.0')
|
||||||
mi3, _, raster_cover, first_spine_item = read_metadata(root, return_extra_data=True)
|
mi3, _, raster_cover, first_spine_item = read_metadata(root, return_extra_data=True)
|
||||||
self.assertIsNone(raster_cover)
|
self.assertIsNone(raster_cover)
|
||||||
|
@ -9,7 +9,7 @@ import os
|
|||||||
import io
|
import io
|
||||||
from calibre.ebooks.metadata import MetaInformation
|
from calibre.ebooks.metadata import MetaInformation
|
||||||
from calibre.ebooks.snb.snbfile import SNBFile
|
from calibre.ebooks.snb.snbfile import SNBFile
|
||||||
from lxml import etree
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
|
|
||||||
|
|
||||||
def get_metadata(stream, extract_cover=True):
|
def get_metadata(stream, extract_cover=True):
|
||||||
@ -27,7 +27,7 @@ def get_metadata(stream, extract_cover=True):
|
|||||||
meta = snbFile.GetFileStream('snbf/book.snbf')
|
meta = snbFile.GetFileStream('snbf/book.snbf')
|
||||||
|
|
||||||
if meta is not None:
|
if meta is not None:
|
||||||
meta = etree.fromstring(meta)
|
meta = safe_xml_fromstring(meta)
|
||||||
mi.title = meta.find('.//head/name').text
|
mi.title = meta.find('.//head/name').text
|
||||||
mi.authors = [meta.find('.//head/author').text]
|
mi.authors = [meta.find('.//head/author').text]
|
||||||
mi.language = meta.find('.//head/language').text.lower().replace('_', '-')
|
mi.language = meta.find('.//head/language').text.lower().replace('_', '-')
|
||||||
|
@ -49,7 +49,7 @@ class Douban(Source):
|
|||||||
|
|
||||||
name = 'Douban Books'
|
name = 'Douban Books'
|
||||||
author = 'Li Fanxi'
|
author = 'Li Fanxi'
|
||||||
version = (2, 1, 1)
|
version = (2, 1, 2)
|
||||||
minimum_calibre_version = (2, 80, 0)
|
minimum_calibre_version = (2, 80, 0)
|
||||||
|
|
||||||
description = _('Downloads metadata and covers from Douban.com. '
|
description = _('Downloads metadata and covers from Douban.com. '
|
||||||
@ -119,8 +119,10 @@ class Douban(Source):
|
|||||||
try:
|
try:
|
||||||
log.info(id_url)
|
log.info(id_url)
|
||||||
raw = get_details(browser, id_url, timeout)
|
raw = get_details(browser, id_url, timeout)
|
||||||
feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
|
feed = etree.fromstring(
|
||||||
strip_encoding_pats=True)[0])
|
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
|
||||||
|
parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
|
||||||
|
)
|
||||||
extra = entry(feed)[0]
|
extra = entry(feed)[0]
|
||||||
except:
|
except:
|
||||||
log.exception('Failed to get additional details for', mi.title)
|
log.exception('Failed to get additional details for', mi.title)
|
||||||
|
@ -105,7 +105,8 @@ def to_metadata(browser, log, entry_, timeout): # {{{
|
|||||||
try:
|
try:
|
||||||
raw = get_details(browser, id_url, timeout)
|
raw = get_details(browser, id_url, timeout)
|
||||||
feed = etree.fromstring(
|
feed = etree.fromstring(
|
||||||
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0]
|
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
|
||||||
|
parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
|
||||||
)
|
)
|
||||||
extra = entry(feed)[0]
|
extra = entry(feed)[0]
|
||||||
except:
|
except:
|
||||||
@ -173,7 +174,7 @@ def to_metadata(browser, log, entry_, timeout): # {{{
|
|||||||
class GoogleBooks(Source):
|
class GoogleBooks(Source):
|
||||||
|
|
||||||
name = 'Google'
|
name = 'Google'
|
||||||
version = (1, 0, 0)
|
version = (1, 0, 1)
|
||||||
minimum_calibre_version = (2, 80, 0)
|
minimum_calibre_version = (2, 80, 0)
|
||||||
description = _('Downloads metadata and covers from Google Books')
|
description = _('Downloads metadata and covers from Google Books')
|
||||||
|
|
||||||
@ -371,10 +372,9 @@ class GoogleBooks(Source):
|
|||||||
return as_unicode(e)
|
return as_unicode(e)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
parser = etree.XMLParser(recover=True, no_network=True)
|
|
||||||
feed = etree.fromstring(
|
feed = etree.fromstring(
|
||||||
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
|
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
|
||||||
parser=parser
|
parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
|
||||||
)
|
)
|
||||||
entries = entry(feed)
|
entries = entry(feed)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -12,6 +12,7 @@ from lxml.builder import ElementMaker
|
|||||||
|
|
||||||
from calibre.constants import __appname__, __version__
|
from calibre.constants import __appname__, __version__
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.utils.cleantext import clean_xml_chars
|
from calibre.utils.cleantext import clean_xml_chars
|
||||||
from polyglot.builtins import unicode_type, getcwd
|
from polyglot.builtins import unicode_type, getcwd
|
||||||
from polyglot.urllib import unquote, urlparse
|
from polyglot.urllib import unquote, urlparse
|
||||||
@ -177,8 +178,7 @@ class TOC(list):
|
|||||||
with open(toc, 'rb') as f:
|
with open(toc, 'rb') as f:
|
||||||
raw = xml_to_unicode(f.read(), assume_utf8=True,
|
raw = xml_to_unicode(f.read(), assume_utf8=True,
|
||||||
strip_encoding_pats=True)[0]
|
strip_encoding_pats=True)[0]
|
||||||
root = etree.fromstring(raw, parser=etree.XMLParser(recover=True,
|
root = safe_xml_fromstring(raw)
|
||||||
no_network=True))
|
|
||||||
xpn = {'re': 'http://exslt.org/regular-expressions'}
|
xpn = {'re': 'http://exslt.org/regular-expressions'}
|
||||||
XPath = functools.partial(etree.XPath, namespaces=xpn)
|
XPath = functools.partial(etree.XPath, namespaces=xpn)
|
||||||
|
|
||||||
|
@ -6,17 +6,15 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
|||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
from polyglot.builtins import map
|
from polyglot.builtins import map
|
||||||
|
|
||||||
from lxml import etree
|
|
||||||
|
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from calibre.ebooks.oeb.base import OPF
|
from calibre.ebooks.oeb.base import OPF
|
||||||
from calibre.ebooks.oeb.polish.utils import guess_type
|
from calibre.ebooks.oeb.polish.utils import guess_type
|
||||||
from calibre.spell import parse_lang_code
|
from calibre.spell import parse_lang_code
|
||||||
from calibre.utils.localization import lang_as_iso639_1
|
from calibre.utils.localization import lang_as_iso639_1
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from polyglot.builtins import filter
|
from polyglot.builtins import filter
|
||||||
|
|
||||||
PARSER = etree.XMLParser(recover=True, no_network=True)
|
|
||||||
|
|
||||||
OPFVersion = namedtuple('OPFVersion', 'major minor patch')
|
OPFVersion = namedtuple('OPFVersion', 'major minor patch')
|
||||||
|
|
||||||
|
|
||||||
@ -45,7 +43,7 @@ def parse_opf(stream_or_path):
|
|||||||
raise ValueError('Empty file: '+getattr(stream, 'name', 'stream'))
|
raise ValueError('Empty file: '+getattr(stream, 'name', 'stream'))
|
||||||
raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)
|
raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)
|
||||||
raw = raw[raw.find('<'):]
|
raw = raw[raw.find('<'):]
|
||||||
root = etree.fromstring(raw, PARSER)
|
root = safe_xml_fromstring(raw)
|
||||||
if root is None:
|
if root is None:
|
||||||
raise ValueError('Not an OPF file')
|
raise ValueError('Not an OPF file')
|
||||||
return root
|
return root
|
||||||
|
@ -14,6 +14,7 @@ from lxml.builder import ElementMaker
|
|||||||
|
|
||||||
from calibre import prints
|
from calibre import prints
|
||||||
from calibre.ebooks.metadata import check_isbn, check_doi
|
from calibre.ebooks.metadata import check_isbn, check_doi
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.ebooks.metadata.book.base import Metadata
|
from calibre.ebooks.metadata.book.base import Metadata
|
||||||
from calibre.ebooks.metadata.opf2 import dump_dict
|
from calibre.ebooks.metadata.opf2 import dump_dict
|
||||||
from calibre.utils.date import parse_date, isoformat, now
|
from calibre.utils.date import parse_date, isoformat, now
|
||||||
@ -74,9 +75,9 @@ def parse_xmp_packet(raw_bytes):
|
|||||||
enc = emap.get(m.group(1), enc)
|
enc = emap.get(m.group(1), enc)
|
||||||
break
|
break
|
||||||
if enc is None:
|
if enc is None:
|
||||||
return etree.fromstring(raw_bytes)
|
return safe_xml_fromstring(raw_bytes)
|
||||||
raw = _xml_declaration.sub('', raw_bytes.decode(enc)) # lxml barfs if encoding declaration present in unicode string
|
raw = _xml_declaration.sub('', raw_bytes.decode(enc)) # lxml barfs if encoding declaration present in unicode string
|
||||||
return etree.fromstring(raw)
|
return safe_xml_fromstring(raw)
|
||||||
|
|
||||||
|
|
||||||
def serialize_xmp_packet(root, encoding='utf-8'):
|
def serialize_xmp_packet(root, encoding='utf-8'):
|
||||||
|
@ -6,8 +6,7 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
from lxml import etree
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
|
|
||||||
from calibre.ebooks.oeb.base import (urlnormalize, XPath, XHTML_NS, XHTML,
|
from calibre.ebooks.oeb.base import (urlnormalize, XPath, XHTML_NS, XHTML,
|
||||||
XHTML_MIME, css_text)
|
XHTML_MIME, css_text)
|
||||||
|
|
||||||
@ -88,7 +87,7 @@ class TOCAdder(object):
|
|||||||
'body { font-family: %s }'%s.body_font_family]
|
'body { font-family: %s }'%s.body_font_family]
|
||||||
embed_css = '\n\n'.join(css)
|
embed_css = '\n\n'.join(css)
|
||||||
|
|
||||||
root = etree.fromstring(TEMPLATE.format(xhtmlns=XHTML_NS,
|
root = safe_xml_fromstring(TEMPLATE.format(xhtmlns=XHTML_NS,
|
||||||
title=self.title, embed_css=embed_css,
|
title=self.title, embed_css=embed_css,
|
||||||
extra_css=(opts.extra_css or '')))
|
extra_css=(opts.extra_css or '')))
|
||||||
parent = XPath('//h:ul')(root)[0]
|
parent = XPath('//h:ul')(root)[0]
|
||||||
|
@ -19,6 +19,7 @@ from odf.namespaces import TEXTNS as odTEXTNS
|
|||||||
|
|
||||||
from calibre import CurrentDir, walk
|
from calibre import CurrentDir, walk
|
||||||
from calibre.ebooks.oeb.base import _css_logger
|
from calibre.ebooks.oeb.base import _css_logger
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from polyglot.builtins import unicode_type, string_or_bytes, filter, getcwd, as_bytes
|
from polyglot.builtins import unicode_type, string_or_bytes, filter, getcwd, as_bytes
|
||||||
|
|
||||||
|
|
||||||
@ -45,7 +46,7 @@ class Extract(ODF2XHTML):
|
|||||||
ol.set('start', val)
|
ol.set('start', val)
|
||||||
|
|
||||||
def fix_markup(self, html, log):
|
def fix_markup(self, html, log):
|
||||||
root = etree.fromstring(html)
|
root = safe_xml_fromstring(html)
|
||||||
self.filter_css(root, log)
|
self.filter_css(root, log)
|
||||||
self.extract_css(root, log)
|
self.extract_css(root, log)
|
||||||
self.epubify_markup(root, log)
|
self.epubify_markup(root, log)
|
||||||
|
@ -16,11 +16,11 @@ from lxml import etree, html
|
|||||||
from calibre import force_unicode
|
from calibre import force_unicode
|
||||||
from calibre.constants import filesystem_encoding, __version__, ispy3
|
from calibre.constants import filesystem_encoding, __version__, ispy3
|
||||||
from calibre.translations.dynamic import translate
|
from calibre.translations.dynamic import translate
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from calibre.ebooks.conversion.preprocess import CSSPreProcessor
|
from calibre.ebooks.conversion.preprocess import CSSPreProcessor
|
||||||
from calibre import (isbytestring, as_unicode, get_types_map)
|
from calibre import (isbytestring, as_unicode, get_types_map)
|
||||||
from calibre.ebooks.oeb.parse_utils import (barename, XHTML_NS, RECOVER_PARSER,
|
from calibre.ebooks.oeb.parse_utils import barename, XHTML_NS, namespace, XHTML, parse_html, NotHTML
|
||||||
namespace, XHTML, parse_html, NotHTML)
|
|
||||||
from calibre.utils.cleantext import clean_xml_chars
|
from calibre.utils.cleantext import clean_xml_chars
|
||||||
from calibre.utils.short_uuid import uuid4
|
from calibre.utils.short_uuid import uuid4
|
||||||
from polyglot.builtins import iteritems, unicode_type, string_or_bytes, range, itervalues, filter, codepoint_to_chr
|
from polyglot.builtins import iteritems, unicode_type, string_or_bytes, range, itervalues, filter, codepoint_to_chr
|
||||||
@ -946,7 +946,7 @@ class Manifest(object):
|
|||||||
return
|
return
|
||||||
data = xml_to_unicode(data, strip_encoding_pats=True,
|
data = xml_to_unicode(data, strip_encoding_pats=True,
|
||||||
assume_utf8=True, resolve_entities=True)[0]
|
assume_utf8=True, resolve_entities=True)[0]
|
||||||
return etree.fromstring(data, parser=RECOVER_PARSER)
|
return safe_xml_fromstring(data)
|
||||||
|
|
||||||
def _parse_xhtml(self, data):
|
def _parse_xhtml(self, data):
|
||||||
orig_data = data
|
orig_data = data
|
||||||
|
@ -11,6 +11,7 @@ import re
|
|||||||
from lxml import etree, html
|
from lxml import etree, html
|
||||||
|
|
||||||
from calibre import xml_replace_entities, force_unicode
|
from calibre import xml_replace_entities, force_unicode
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.constants import filesystem_encoding
|
from calibre.constants import filesystem_encoding
|
||||||
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
|
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
|
||||||
from polyglot.builtins import iteritems, itervalues, unicode_type, string_or_bytes, map
|
from polyglot.builtins import iteritems, itervalues, unicode_type, string_or_bytes, map
|
||||||
@ -114,12 +115,7 @@ def _html4_parse(data):
|
|||||||
elem.text = elem.text.strip('-')
|
elem.text = elem.text.strip('-')
|
||||||
data = etree.tostring(data, encoding='unicode')
|
data = etree.tostring(data, encoding='unicode')
|
||||||
|
|
||||||
# Setting huge_tree=True causes crashes in windows with large files
|
data = safe_xml_fromstring(data)
|
||||||
parser = etree.XMLParser(no_network=True)
|
|
||||||
try:
|
|
||||||
data = etree.fromstring(data, parser=parser)
|
|
||||||
except etree.XMLSyntaxError:
|
|
||||||
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
@ -210,19 +206,16 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
data = data.replace('\0', '')
|
data = data.replace('\0', '')
|
||||||
data = raw = clean_word_doc(data, log)
|
data = raw = clean_word_doc(data, log)
|
||||||
|
|
||||||
# Setting huge_tree=True causes crashes in windows with large files
|
|
||||||
parser = etree.XMLParser(no_network=True)
|
|
||||||
|
|
||||||
# Try with more & more drastic measures to parse
|
# Try with more & more drastic measures to parse
|
||||||
try:
|
try:
|
||||||
data = etree.fromstring(data, parser=parser)
|
data = safe_xml_fromstring(data)
|
||||||
check_for_html5(pre, data)
|
check_for_html5(pre, data)
|
||||||
except (HTML5Doc, etree.XMLSyntaxError):
|
except (HTML5Doc, etree.XMLSyntaxError):
|
||||||
log.debug('Initial parse failed, using more'
|
log.debug('Initial parse failed, using more'
|
||||||
' forgiving parsers')
|
' forgiving parsers')
|
||||||
raw = data = xml_replace_entities(raw)
|
raw = data = xml_replace_entities(raw)
|
||||||
try:
|
try:
|
||||||
data = etree.fromstring(data, parser=parser)
|
data = safe_xml_fromstring(data)
|
||||||
check_for_html5(pre, data)
|
check_for_html5(pre, data)
|
||||||
except (HTML5Doc, etree.XMLSyntaxError):
|
except (HTML5Doc, etree.XMLSyntaxError):
|
||||||
log.debug('Parsing %s as HTML' % filename)
|
log.debug('Parsing %s as HTML' % filename)
|
||||||
@ -251,7 +244,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
if barename(data.tag) in non_html_file_tags:
|
if barename(data.tag) in non_html_file_tags:
|
||||||
raise NotHTML(data.tag)
|
raise NotHTML(data.tag)
|
||||||
log.warn('File %r does not appear to be (X)HTML'%filename)
|
log.warn('File %r does not appear to be (X)HTML'%filename)
|
||||||
nroot = etree.fromstring('<html></html>')
|
nroot = safe_xml_fromstring('<html></html>')
|
||||||
has_body = False
|
has_body = False
|
||||||
for child in list(data):
|
for child in list(data):
|
||||||
if isinstance(child.tag, (unicode_type, bytes)) and barename(child.tag) == 'body':
|
if isinstance(child.tag, (unicode_type, bytes)) and barename(child.tag) == 'body':
|
||||||
@ -260,7 +253,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
parent = nroot
|
parent = nroot
|
||||||
if not has_body:
|
if not has_body:
|
||||||
log.warn('File %r appears to be a HTML fragment'%filename)
|
log.warn('File %r appears to be a HTML fragment'%filename)
|
||||||
nroot = etree.fromstring('<html><body/></html>')
|
nroot = safe_xml_fromstring('<html><body/></html>')
|
||||||
parent = nroot[0]
|
parent = nroot[0]
|
||||||
for child in list(data.iter()):
|
for child in list(data.iter()):
|
||||||
oparent = child.getparent()
|
oparent = child.getparent()
|
||||||
@ -276,12 +269,12 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
data = etree.tostring(data, encoding='unicode')
|
data = etree.tostring(data, encoding='unicode')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
data = etree.fromstring(data, parser=parser)
|
data = safe_xml_fromstring(data)
|
||||||
except:
|
except:
|
||||||
data = data.replace(':=', '=').replace(':>', '>')
|
data = data.replace(':=', '=').replace(':>', '>')
|
||||||
data = data.replace('<http:/>', '')
|
data = data.replace('<http:/>', '')
|
||||||
try:
|
try:
|
||||||
data = etree.fromstring(data, parser=parser)
|
data = safe_xml_fromstring(data)
|
||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
log.warn('Stripping comments from %s'%
|
log.warn('Stripping comments from %s'%
|
||||||
filename)
|
filename)
|
||||||
@ -292,12 +285,11 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
|
|||||||
'')
|
'')
|
||||||
data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
|
data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
|
||||||
try:
|
try:
|
||||||
data = etree.fromstring(data,
|
data = safe_xml_fromstring(data)
|
||||||
parser=RECOVER_PARSER)
|
|
||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
log.warn('Stripping meta tags from %s'% filename)
|
log.warn('Stripping meta tags from %s'% filename)
|
||||||
data = re.sub(r'<meta\s+[^>]+?>', '', data)
|
data = re.sub(r'<meta\s+[^>]+?>', '', data)
|
||||||
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
data = safe_xml_fromstring(data)
|
||||||
elif namespace(data.tag) != XHTML_NS:
|
elif namespace(data.tag) != XHTML_NS:
|
||||||
# OEB_DOC_NS, but possibly others
|
# OEB_DOC_NS, but possibly others
|
||||||
ns = namespace(data.tag)
|
ns = namespace(data.tag)
|
||||||
|
@ -7,11 +7,12 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from lxml.etree import XMLParser, fromstring, XMLSyntaxError
|
from lxml.etree import XMLSyntaxError
|
||||||
import css_parser
|
import css_parser
|
||||||
|
|
||||||
from calibre import force_unicode, human_readable, prepare_string_for_xml
|
from calibre import force_unicode, human_readable, prepare_string_for_xml
|
||||||
from calibre.ebooks.chardet import replace_encoding_declarations, find_declared_encoding
|
from calibre.ebooks.chardet import replace_encoding_declarations, find_declared_encoding
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.ebooks.html_entities import html5_entities
|
from calibre.ebooks.html_entities import html5_entities
|
||||||
from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag
|
from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style as fix_style_tag
|
||||||
from calibre.ebooks.oeb.polish.utils import PositionFinder, guess_type
|
from calibre.ebooks.oeb.polish.utils import PositionFinder, guess_type
|
||||||
@ -276,7 +277,6 @@ def check_xml_parsing(name, mt, raw):
|
|||||||
# Get rid of entities as named entities trip up the XML parser
|
# Get rid of entities as named entities trip up the XML parser
|
||||||
eproc = EntitityProcessor(mt)
|
eproc = EntitityProcessor(mt)
|
||||||
eraw = entity_pat.sub(eproc, raw)
|
eraw = entity_pat.sub(eproc, raw)
|
||||||
parser = XMLParser(recover=False)
|
|
||||||
errcls = HTMLParseError if mt in OEB_DOCS else XMLParseError
|
errcls = HTMLParseError if mt in OEB_DOCS else XMLParseError
|
||||||
errors = []
|
errors = []
|
||||||
if eproc.ok_named_entities:
|
if eproc.ok_named_entities:
|
||||||
@ -288,7 +288,7 @@ def check_xml_parsing(name, mt, raw):
|
|||||||
errors.append(BadEntity(ent, name, lnum, col))
|
errors.append(BadEntity(ent, name, lnum, col))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
root = fromstring(eraw, parser=parser)
|
root = safe_xml_fromstring(eraw, recover=False)
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
return errors + [DecodeError(name)]
|
return errors + [DecodeError(name)]
|
||||||
except XMLSyntaxError as err:
|
except XMLSyntaxError as err:
|
||||||
|
@ -18,7 +18,6 @@ from io import BytesIO
|
|||||||
from itertools import count
|
from itertools import count
|
||||||
|
|
||||||
from css_parser import getUrls, replaceUrls
|
from css_parser import getUrls, replaceUrls
|
||||||
from lxml import etree
|
|
||||||
|
|
||||||
from calibre import CurrentDir, walk
|
from calibre import CurrentDir, walk
|
||||||
from calibre.constants import iswindows
|
from calibre.constants import iswindows
|
||||||
@ -42,7 +41,7 @@ from calibre.ebooks.oeb.base import (
|
|||||||
DC11_NS, OEB_DOCS, OEB_STYLES, OPF, OPF2_NS, Manifest, itercsslinks, iterlinks,
|
DC11_NS, OEB_DOCS, OEB_STYLES, OPF, OPF2_NS, Manifest, itercsslinks, iterlinks,
|
||||||
rewrite_links, serialize, urlquote, urlunquote
|
rewrite_links, serialize, urlquote, urlunquote
|
||||||
)
|
)
|
||||||
from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER, NotHTML, parse_html
|
from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html
|
||||||
from calibre.ebooks.oeb.polish.errors import DRMError, InvalidBook
|
from calibre.ebooks.oeb.polish.errors import DRMError, InvalidBook
|
||||||
from calibre.ebooks.oeb.polish.parsing import parse as parse_html_tweak
|
from calibre.ebooks.oeb.polish.parsing import parse as parse_html_tweak
|
||||||
from calibre.ebooks.oeb.polish.utils import (
|
from calibre.ebooks.oeb.polish.utils import (
|
||||||
@ -52,6 +51,7 @@ from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryF
|
|||||||
from calibre.utils.filenames import hardlink_file, nlinks_file
|
from calibre.utils.filenames import hardlink_file, nlinks_file
|
||||||
from calibre.utils.ipc.simple_worker import WorkerError, fork_job
|
from calibre.utils.ipc.simple_worker import WorkerError, fork_job
|
||||||
from calibre.utils.logging import default_log
|
from calibre.utils.logging import default_log
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.utils.zipfile import ZipFile
|
from calibre.utils.zipfile import ZipFile
|
||||||
from polyglot.builtins import iteritems, map, unicode_type, zip
|
from polyglot.builtins import iteritems, map, unicode_type, zip
|
||||||
from polyglot.urllib import urlparse
|
from polyglot.urllib import urlparse
|
||||||
@ -201,7 +201,7 @@ class ContainerBase(object): # {{{
|
|||||||
data, self.used_encoding = xml_to_unicode(
|
data, self.used_encoding = xml_to_unicode(
|
||||||
data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)
|
data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)
|
||||||
data = unicodedata.normalize('NFC', data)
|
data = unicodedata.normalize('NFC', data)
|
||||||
return etree.fromstring(data, parser=RECOVER_PARSER)
|
return safe_xml_fromstring(data)
|
||||||
|
|
||||||
def parse_xhtml(self, data, fname='<string>', force_html5_parse=False):
|
def parse_xhtml(self, data, fname='<string>', force_html5_parse=False):
|
||||||
if self.tweak_mode:
|
if self.tweak_mode:
|
||||||
@ -1178,7 +1178,7 @@ class EpubContainer(Container):
|
|||||||
container_path = join(self.root, 'META-INF', 'container.xml')
|
container_path = join(self.root, 'META-INF', 'container.xml')
|
||||||
if not exists(container_path):
|
if not exists(container_path):
|
||||||
raise InvalidEpub('No META-INF/container.xml in epub')
|
raise InvalidEpub('No META-INF/container.xml in epub')
|
||||||
container = etree.fromstring(open(container_path, 'rb').read())
|
container = safe_xml_fromstring(open(container_path, 'rb').read())
|
||||||
opf_files = container.xpath((
|
opf_files = container.xpath((
|
||||||
r'child::ocf:rootfiles/ocf:rootfile'
|
r'child::ocf:rootfiles/ocf:rootfile'
|
||||||
'[@media-type="%s" and @full-path]'%guess_type('a.opf')
|
'[@media-type="%s" and @full-path]'%guess_type('a.opf')
|
||||||
|
@ -7,10 +7,11 @@ __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from lxml.etree import XMLParser, fromstring, Element as LxmlElement
|
from lxml.etree import Element as LxmlElement
|
||||||
import html5_parser
|
import html5_parser
|
||||||
|
|
||||||
from calibre import xml_replace_entities
|
from calibre import xml_replace_entities
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
|
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
|
||||||
from calibre.utils.cleantext import clean_xml_chars
|
from calibre.utils.cleantext import clean_xml_chars
|
||||||
from polyglot.builtins import unicode_type
|
from polyglot.builtins import unicode_type
|
||||||
@ -77,8 +78,7 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
|
|||||||
if force_html5_parse:
|
if force_html5_parse:
|
||||||
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
|
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
|
||||||
try:
|
try:
|
||||||
parser = XMLParser(no_network=True)
|
ans = safe_xml_fromstring(raw)
|
||||||
ans = fromstring(raw, parser=parser)
|
|
||||||
if ans.tag != '{%s}html' % XHTML_NS:
|
if ans.tag != '{%s}html' % XHTML_NS:
|
||||||
raise ValueError('Root tag is not <html> in the XHTML namespace')
|
raise ValueError('Root tag is not <html> in the XHTML namespace')
|
||||||
if linenumber_attribute:
|
if linenumber_attribute:
|
||||||
|
@ -21,6 +21,7 @@ from calibre.ebooks.oeb.base import namespace, barename, XPath, xpath, \
|
|||||||
urlnormalize, BINARY_MIME, \
|
urlnormalize, BINARY_MIME, \
|
||||||
OEBError, OEBBook, DirContainer
|
OEBError, OEBBook, DirContainer
|
||||||
from calibre.ebooks.oeb.writer import OEBWriter
|
from calibre.ebooks.oeb.writer import OEBWriter
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.utils.cleantext import clean_xml_chars
|
from calibre.utils.cleantext import clean_xml_chars
|
||||||
from calibre.utils.localization import get_lang
|
from calibre.utils.localization import get_lang
|
||||||
from calibre.ptempfile import TemporaryDirectory
|
from calibre.ptempfile import TemporaryDirectory
|
||||||
@ -108,23 +109,18 @@ class OEBReader(object):
|
|||||||
data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
|
data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
|
||||||
OPF1_NS, data)
|
OPF1_NS, data)
|
||||||
try:
|
try:
|
||||||
opf = etree.fromstring(data)
|
opf = safe_xml_fromstring(data)
|
||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
data = xml_replace_entities(clean_xml_chars(data), encoding=None)
|
data = xml_replace_entities(clean_xml_chars(data), encoding=None)
|
||||||
try:
|
try:
|
||||||
opf = etree.fromstring(data)
|
opf = safe_xml_fromstring(data)
|
||||||
self.logger.warn('OPF contains invalid HTML named entities')
|
self.logger.warn('OPF contains invalid HTML named entities')
|
||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
data = re.sub(r'(?is)<tours>.+</tours>', '', data)
|
data = re.sub(r'(?is)<tours>.+</tours>', '', data)
|
||||||
data = data.replace('<dc-metadata>',
|
data = data.replace('<dc-metadata>',
|
||||||
'<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">')
|
'<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">')
|
||||||
try:
|
opf = safe_xml_fromstring(data)
|
||||||
opf = etree.fromstring(data)
|
|
||||||
self.logger.warn('OPF contains invalid tours section')
|
self.logger.warn('OPF contains invalid tours section')
|
||||||
except etree.XMLSyntaxError:
|
|
||||||
self.logger.warn('OPF contains invalid markup, trying to parse it anyway')
|
|
||||||
from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
|
|
||||||
opf = etree.fromstring(data, parser=RECOVER_PARSER)
|
|
||||||
|
|
||||||
ns = namespace(opf.tag)
|
ns = namespace(opf.tag)
|
||||||
if ns not in ('', OPF1_NS, OPF2_NS):
|
if ns not in ('', OPF1_NS, OPF2_NS):
|
||||||
|
@ -8,9 +8,9 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
import textwrap
|
import textwrap
|
||||||
|
|
||||||
from lxml import etree
|
|
||||||
from calibre import guess_type
|
from calibre import guess_type
|
||||||
from calibre.utils.imghdr import identify
|
from calibre.utils.imghdr import identify
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from polyglot.builtins import unicode_type
|
from polyglot.builtins import unicode_type
|
||||||
from polyglot.urllib import unquote
|
from polyglot.urllib import unquote
|
||||||
|
|
||||||
@ -156,7 +156,7 @@ class CoverManager(object):
|
|||||||
tp = templ%unquote(href)
|
tp = templ%unquote(href)
|
||||||
id, href = m.generate('titlepage', 'titlepage.xhtml')
|
id, href = m.generate('titlepage', 'titlepage.xhtml')
|
||||||
item = m.add(id, href, guess_type('t.xhtml')[0],
|
item = m.add(id, href, guess_type('t.xhtml')[0],
|
||||||
data=etree.fromstring(tp))
|
data=safe_xml_fromstring(tp))
|
||||||
else:
|
else:
|
||||||
item = self.oeb.manifest.hrefs[
|
item = self.oeb.manifest.hrefs[
|
||||||
urldefrag(self.oeb.guide['titlepage'].href)[0]]
|
urldefrag(self.oeb.guide['titlepage'].href)[0]]
|
||||||
|
@ -129,9 +129,9 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
|
|||||||
|
|
||||||
def parse_outline(raw, output_dir):
|
def parse_outline(raw, output_dir):
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
|
raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
|
||||||
outline = etree.fromstring(raw, parser=RECOVER_PARSER).xpath('(//outline)[1]')
|
outline = safe_xml_fromstring(raw).xpath('(//outline)[1]')
|
||||||
if outline:
|
if outline:
|
||||||
from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
|
from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
|
||||||
outline = outline[0]
|
outline = outline[0]
|
||||||
|
@ -12,6 +12,7 @@ from itertools import count
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from polyglot.builtins import range, map
|
from polyglot.builtins import range, map
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
|
|
||||||
|
|
||||||
class Font(object):
|
class Font(object):
|
||||||
@ -622,8 +623,7 @@ class PDFDocument(object):
|
|||||||
|
|
||||||
def __init__(self, xml, opts, log):
|
def __init__(self, xml, opts, log):
|
||||||
self.opts, self.log = opts, log
|
self.opts, self.log = opts, log
|
||||||
parser = etree.XMLParser(recover=True)
|
self.root = safe_xml_fromstring(xml)
|
||||||
self.root = etree.fromstring(xml, parser=parser)
|
|
||||||
idc = count()
|
idc = count()
|
||||||
|
|
||||||
self.fonts = []
|
self.fonts = []
|
||||||
|
@ -14,6 +14,7 @@ import re
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre.ebooks.pdb.ereader import image_name
|
from calibre.ebooks.pdb.ereader import image_name
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.ebooks.pml import unipmlcode
|
from calibre.ebooks.pml import unipmlcode
|
||||||
from polyglot.builtins import unicode_type, string_or_bytes
|
from polyglot.builtins import unicode_type, string_or_bytes
|
||||||
|
|
||||||
@ -138,7 +139,7 @@ class PMLMLizer(object):
|
|||||||
self.log.debug('Converting %s to PML markup...' % item.href)
|
self.log.debug('Converting %s to PML markup...' % item.href)
|
||||||
content = etree.tostring(item.data, encoding='unicode')
|
content = etree.tostring(item.data, encoding='unicode')
|
||||||
content = self.prepare_text(content)
|
content = self.prepare_text(content)
|
||||||
content = etree.fromstring(content)
|
content = safe_xml_fromstring(content)
|
||||||
stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
|
stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
|
||||||
text.append(self.add_page_anchor(item))
|
text.append(self.add_page_anchor(item))
|
||||||
text += self.dump_text(content.find(XHTML('body')), stylizer, item)
|
text += self.dump_text(content.find(XHTML('body')), stylizer, item)
|
||||||
|
@ -109,6 +109,7 @@ class RTFMLizer(object):
|
|||||||
def mlize_spine(self):
|
def mlize_spine(self):
|
||||||
from calibre.ebooks.oeb.base import XHTML
|
from calibre.ebooks.oeb.base import XHTML
|
||||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
output = self.header()
|
output = self.header()
|
||||||
if 'titlepage' in self.oeb_book.guide:
|
if 'titlepage' in self.oeb_book.guide:
|
||||||
href = self.oeb_book.guide['titlepage'].href
|
href = self.oeb_book.guide['titlepage'].href
|
||||||
@ -126,7 +127,7 @@ class RTFMLizer(object):
|
|||||||
content = re.sub('<!--.*?-->', '', etree.tostring(item.data, encoding='unicode'), flags=re.DOTALL)
|
content = re.sub('<!--.*?-->', '', etree.tostring(item.data, encoding='unicode'), flags=re.DOTALL)
|
||||||
content = self.remove_newlines(content)
|
content = self.remove_newlines(content)
|
||||||
content = self.remove_tabs(content)
|
content = self.remove_tabs(content)
|
||||||
content = etree.fromstring(content)
|
content = safe_xml_fromstring(content)
|
||||||
stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
|
stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
|
||||||
self.currently_dumping_item = item
|
self.currently_dumping_item = item
|
||||||
output += self.dump_text(content.find(XHTML('body')), stylizer)
|
output += self.dump_text(content.find(XHTML('body')), stylizer)
|
||||||
|
@ -84,6 +84,7 @@ class SNBMLizer(object):
|
|||||||
def mlize(self):
|
def mlize(self):
|
||||||
from calibre.ebooks.oeb.base import XHTML
|
from calibre.ebooks.oeb.base import XHTML
|
||||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
output = [u'']
|
output = [u'']
|
||||||
stylizer = Stylizer(self.item.data, self.item.href, self.oeb_book, self.opts, self.opts.output_profile)
|
stylizer = Stylizer(self.item.data, self.item.href, self.oeb_book, self.opts, self.opts.output_profile)
|
||||||
content = etree.tostring(self.item.data.find(XHTML('body')), encoding='unicode')
|
content = etree.tostring(self.item.data.find(XHTML('body')), encoding='unicode')
|
||||||
@ -98,7 +99,7 @@ class SNBMLizer(object):
|
|||||||
etree.SubElement(snbcTree, "body")
|
etree.SubElement(snbcTree, "body")
|
||||||
trees[subitem] = snbcTree
|
trees[subitem] = snbcTree
|
||||||
output.append('%s%s\n\n' % (CALIBRE_SNB_BM_TAG, ""))
|
output.append('%s%s\n\n' % (CALIBRE_SNB_BM_TAG, ""))
|
||||||
output += self.dump_text(self.subitems, etree.fromstring(content), stylizer)[0]
|
output += self.dump_text(self.subitems, safe_xml_fromstring(content), stylizer)[0]
|
||||||
output = self.cleanup_text(''.join(output))
|
output = self.cleanup_text(''.join(output))
|
||||||
|
|
||||||
subitem = ''
|
subitem = ''
|
||||||
|
@ -67,6 +67,7 @@ class TXTMLizer(object):
|
|||||||
def mlize_spine(self):
|
def mlize_spine(self):
|
||||||
from calibre.ebooks.oeb.base import XHTML
|
from calibre.ebooks.oeb.base import XHTML
|
||||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
output = [u'']
|
output = [u'']
|
||||||
output.append(self.get_toc())
|
output.append(self.get_toc())
|
||||||
for item in self.oeb_book.spine:
|
for item in self.oeb_book.spine:
|
||||||
@ -76,7 +77,7 @@ class TXTMLizer(object):
|
|||||||
x.text = x.text.replace('--', '__')
|
x.text = x.text.replace('--', '__')
|
||||||
content = etree.tostring(item.data, encoding='unicode')
|
content = etree.tostring(item.data, encoding='unicode')
|
||||||
content = self.remove_newlines(content)
|
content = self.remove_newlines(content)
|
||||||
content = etree.fromstring(content)
|
content = safe_xml_fromstring(content)
|
||||||
stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
|
stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
|
||||||
output += self.dump_text(content.find(XHTML('body')), stylizer, item)
|
output += self.dump_text(content.find(XHTML('body')), stylizer, item)
|
||||||
output += '\n\n\n\n\n\n'
|
output += '\n\n\n\n\n\n'
|
||||||
|
@ -15,6 +15,7 @@ from PyQt5.Qt import (
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre.gui2 import choose_files, error_dialog
|
from calibre.gui2 import choose_files, error_dialog
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.utils.icu import sort_key
|
from calibre.utils.icu import sort_key
|
||||||
from polyglot.builtins import unicode_type
|
from polyglot.builtins import unicode_type
|
||||||
|
|
||||||
@ -32,7 +33,7 @@ def uniq(vals, kmap=lambda x:x):
|
|||||||
|
|
||||||
|
|
||||||
def import_opml(raw, preserve_groups=True):
|
def import_opml(raw, preserve_groups=True):
|
||||||
root = etree.fromstring(raw)
|
root = safe_xml_fromstring(raw)
|
||||||
groups = defaultdict(list)
|
groups = defaultdict(list)
|
||||||
ax = etree.XPath('ancestor::outline[@title or @text]')
|
ax = etree.XPath('ancestor::outline[@title or @text]')
|
||||||
for outline in root.xpath('//outline[@type="rss" and @xmlUrl]'):
|
for outline in root.xpath('//outline[@type="rss" and @xmlUrl]'):
|
||||||
|
@ -8,12 +8,11 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
|
|
||||||
from lxml import etree
|
|
||||||
|
|
||||||
from PyQt5.Qt import QUrl
|
from PyQt5.Qt import QUrl
|
||||||
|
|
||||||
from calibre import (browser, guess_extension)
|
from calibre import (browser, guess_extension)
|
||||||
from calibre.gui2 import open_url
|
from calibre.gui2 import open_url
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.gui2.store import StorePlugin
|
from calibre.gui2.store import StorePlugin
|
||||||
from calibre.gui2.store.search_result import SearchResult
|
from calibre.gui2.store.search_result import SearchResult
|
||||||
from calibre.gui2.store.web_store_dialog import WebStoreDialog
|
from calibre.gui2.store.web_store_dialog import WebStoreDialog
|
||||||
@ -36,7 +35,7 @@ def open_search(url, query, max_results=10, timeout=60):
|
|||||||
counter = max_results
|
counter = max_results
|
||||||
br = browser()
|
br = browser()
|
||||||
with closing(br.open(url, timeout=timeout)) as f:
|
with closing(br.open(url, timeout=timeout)) as f:
|
||||||
doc = etree.fromstring(f.read())
|
doc = safe_xml_fromstring(f.read())
|
||||||
for data in doc.xpath('//*[local-name() = "entry"]'):
|
for data in doc.xpath('//*[local-name() = "entry"]'):
|
||||||
if counter <= 0:
|
if counter <= 0:
|
||||||
break
|
break
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
store_version = 5 # Needed for dynamic plugin loading
|
store_version = 6 # Needed for dynamic plugin loading
|
||||||
|
|
||||||
__license__ = 'GPL 3'
|
__license__ = 'GPL 3'
|
||||||
__copyright__ = '2011, 2013, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2011, 2013, John Schember <john@nachtimwald.com>'
|
||||||
@ -43,7 +43,7 @@ def search(query, max_results=10, timeout=60, write_raw_to=None):
|
|||||||
if write_raw_to is not None:
|
if write_raw_to is not None:
|
||||||
with open(write_raw_to, 'wb') as f:
|
with open(write_raw_to, 'wb') as f:
|
||||||
f.write(raw)
|
f.write(raw)
|
||||||
doc = etree.fromstring(raw)
|
doc = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
|
||||||
for data in doc.xpath('//*[local-name() = "entry"]'):
|
for data in doc.xpath('//*[local-name() = "entry"]'):
|
||||||
if counter <= 0:
|
if counter <= 0:
|
||||||
break
|
break
|
||||||
@ -63,7 +63,7 @@ def search(query, max_results=10, timeout=60, write_raw_to=None):
|
|||||||
|
|
||||||
# Get the formats and direct download links.
|
# Get the formats and direct download links.
|
||||||
with closing(br.open(id, timeout=timeout/4)) as nf:
|
with closing(br.open(id, timeout=timeout/4)) as nf:
|
||||||
ndoc = etree.fromstring(nf.read())
|
ndoc = etree.fromstring(nf.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
|
||||||
for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
|
for link in ndoc.xpath('//*[local-name() = "link" and @rel = "http://opds-spec.org/acquisition"]'):
|
||||||
type = link.get('type')
|
type = link.get('type')
|
||||||
href = link.get('href')
|
href = link.get('href')
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
store_version = 1 # Needed for dynamic plugin loading
|
store_version = 2 # Needed for dynamic plugin loading
|
||||||
|
|
||||||
__license__ = 'GPL 3'
|
__license__ = 'GPL 3'
|
||||||
__copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>'
|
__copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>'
|
||||||
@ -63,8 +63,7 @@ class LitResStore(BasicStoreConfig, StorePlugin):
|
|||||||
ungzipResponse(r,br)
|
ungzipResponse(r,br)
|
||||||
raw= xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0]
|
raw= xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0]
|
||||||
|
|
||||||
parser = etree.XMLParser(recover=True, no_network=True)
|
doc = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
|
||||||
doc = etree.fromstring(raw, parser=parser)
|
|
||||||
for data in doc.xpath('//*[local-name() = "fb2-book"]'):
|
for data in doc.xpath('//*[local-name() = "fb2-book"]'):
|
||||||
if counter <= 0:
|
if counter <= 0:
|
||||||
break
|
break
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
store_version = 1 # Needed for dynamic plugin loading
|
store_version = 2 # Needed for dynamic plugin loading
|
||||||
|
|
||||||
__license__ = 'GPL 3'
|
__license__ = 'GPL 3'
|
||||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||||
@ -46,7 +46,7 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://
|
|||||||
with closing(br.open(url, timeout=timeout)) as f:
|
with closing(br.open(url, timeout=timeout)) as f:
|
||||||
raw_data = f.read()
|
raw_data = f.read()
|
||||||
raw_data = raw_data.decode('utf-8', 'replace')
|
raw_data = raw_data.decode('utf-8', 'replace')
|
||||||
doc = etree.fromstring(raw_data)
|
doc = etree.fromstring(raw_data, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
|
||||||
for data in doc.xpath('//*[local-name() = "entry"]'):
|
for data in doc.xpath('//*[local-name() = "entry"]'):
|
||||||
if counter <= 0:
|
if counter <= 0:
|
||||||
break
|
break
|
||||||
@ -71,7 +71,7 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://
|
|||||||
|
|
||||||
# Follow the detail link to get the rest of the info.
|
# Follow the detail link to get the rest of the info.
|
||||||
with closing(br.open(detail_href, timeout=timeout/4)) as df:
|
with closing(br.open(detail_href, timeout=timeout/4)) as df:
|
||||||
ddoc = etree.fromstring(df.read())
|
ddoc = etree.fromstring(df.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
|
||||||
ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
|
ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
|
||||||
if ddata:
|
if ddata:
|
||||||
ddata = ddata[0]
|
ddata = ddata[0]
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
store_version = 1 # Needed for dynamic plugin loading
|
store_version = 2 # Needed for dynamic plugin loading
|
||||||
|
|
||||||
__license__ = 'GPL 3'
|
__license__ = 'GPL 3'
|
||||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||||
@ -47,7 +47,7 @@ class XinXiiStore(BasicStoreConfig, OpenSearchOPDSStore):
|
|||||||
counter = max_results
|
counter = max_results
|
||||||
br = browser()
|
br = browser()
|
||||||
with closing(br.open(url, timeout=timeout)) as f:
|
with closing(br.open(url, timeout=timeout)) as f:
|
||||||
doc = etree.fromstring(f.read())
|
doc = etree.fromstring(f.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
|
||||||
for data in doc.xpath('//*[local-name() = "entry"]'):
|
for data in doc.xpath('//*[local-name() = "entry"]'):
|
||||||
if counter <= 0:
|
if counter <= 0:
|
||||||
break
|
break
|
||||||
|
@ -28,6 +28,7 @@ from calibre.gui2.tweak_book.editor.text import PlainTextEdit, default_font_fami
|
|||||||
from calibre.gui2.tweak_book.editor.themes import theme_color, get_theme
|
from calibre.gui2.tweak_book.editor.themes import theme_color, get_theme
|
||||||
from calibre.gui2.tweak_book.diff import get_sequence_matcher
|
from calibre.gui2.tweak_book.diff import get_sequence_matcher
|
||||||
from calibre.gui2.tweak_book.diff.highlight import get_highlighter
|
from calibre.gui2.tweak_book.diff.highlight import get_highlighter
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
|
|
||||||
Change = namedtuple('Change', 'ltop lbot rtop rbot kind')
|
Change = namedtuple('Change', 'ltop lbot rtop rbot kind')
|
||||||
|
|
||||||
@ -47,7 +48,7 @@ def beautify_text(raw, syntax):
|
|||||||
from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree, pretty_html_tree
|
from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree, pretty_html_tree
|
||||||
from calibre.ebooks.chardet import strip_encoding_declarations
|
from calibre.ebooks.chardet import strip_encoding_declarations
|
||||||
if syntax == 'xml':
|
if syntax == 'xml':
|
||||||
root = etree.fromstring(strip_encoding_declarations(raw))
|
root = safe_xml_fromstring(strip_encoding_declarations(raw))
|
||||||
pretty_xml_tree(root)
|
pretty_xml_tree(root)
|
||||||
elif syntax == 'css':
|
elif syntax == 'css':
|
||||||
import logging
|
import logging
|
||||||
|
@ -21,6 +21,7 @@ from calibre import (
|
|||||||
replace_entities, strftime, xml_replace_entities
|
replace_entities, strftime, xml_replace_entities
|
||||||
)
|
)
|
||||||
from calibre.constants import cache_dir, isosx
|
from calibre.constants import cache_dir, isosx
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.customize.conversion import DummyReporter
|
from calibre.customize.conversion import DummyReporter
|
||||||
from calibre.customize.ui import output_profiles
|
from calibre.customize.ui import output_profiles
|
||||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, prettify
|
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, prettify
|
||||||
@ -2992,7 +2993,7 @@ class CatalogBuilder(object):
|
|||||||
<navMap/>
|
<navMap/>
|
||||||
</ncx>
|
</ncx>
|
||||||
'''
|
'''
|
||||||
root = self.ncx_root = etree.fromstring(header)
|
root = self.ncx_root = safe_xml_fromstring(header)
|
||||||
navMapTag = root[0]
|
navMapTag = root[0]
|
||||||
|
|
||||||
if self.generate_for_kindle_mobi:
|
if self.generate_for_kindle_mobi:
|
||||||
@ -3668,7 +3669,7 @@ class CatalogBuilder(object):
|
|||||||
lang=prepare_string_for_xml(lang),
|
lang=prepare_string_for_xml(lang),
|
||||||
pt="periodical:default" if self.generate_for_kindle_mobi else ""
|
pt="periodical:default" if self.generate_for_kindle_mobi else ""
|
||||||
)
|
)
|
||||||
root = etree.fromstring(header)
|
root = safe_xml_fromstring(header)
|
||||||
manifest = root.xpath('//*[local-name()="manifest"]')[0]
|
manifest = root.xpath('//*[local-name()="manifest"]')[0]
|
||||||
spine = root.xpath('//*[local-name()="spine"]')[0]
|
spine = root.xpath('//*[local-name()="spine"]')[0]
|
||||||
guide = root.xpath('//*[local-name()="guide"]')[0]
|
guide = root.xpath('//*[local-name()="guide"]')[0]
|
||||||
|
@ -10,6 +10,7 @@ import sys, glob, os, tempfile, re, codecs
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
from calibre.constants import config_dir
|
from calibre.constants import config_dir
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.utils.zipfile import ZipFile
|
from calibre.utils.zipfile import ZipFile
|
||||||
from polyglot.builtins import iteritems
|
from polyglot.builtins import iteritems
|
||||||
|
|
||||||
@ -26,7 +27,7 @@ BUILTIN_LOCALES = {'en-US', 'en-GB', 'es-ES'}
|
|||||||
def parse_xcu(raw, origin='%origin%'):
|
def parse_xcu(raw, origin='%origin%'):
|
||||||
' Get the dictionary and affix file names as well as supported locales for each dictionary '
|
' Get the dictionary and affix file names as well as supported locales for each dictionary '
|
||||||
ans = {}
|
ans = {}
|
||||||
root = etree.fromstring(raw)
|
root = safe_xml_fromstring(raw)
|
||||||
|
|
||||||
for node in XPath('//prop[@oor:name="Format"]/value[text()="DICT_SPELL"]/../..')(root):
|
for node in XPath('//prop[@oor:name="Format"]/value[text()="DICT_SPELL"]/../..')(root):
|
||||||
value = XPath('descendant::prop[@oor:name="Locations"]/value')(node)
|
value = XPath('descendant::prop[@oor:name="Locations"]/value')(node)
|
||||||
@ -123,7 +124,7 @@ def import_from_oxt(source_path, name, dest_dir=None, prefix='dic-'):
|
|||||||
key = key[3:]
|
key = key[3:]
|
||||||
return zf.open(key.lstrip('/')).read()
|
return zf.open(key.lstrip('/')).read()
|
||||||
|
|
||||||
root = etree.fromstring(zf.open('META-INF/manifest.xml').read())
|
root = safe_xml_fromstring(zf.open('META-INF/manifest.xml').read())
|
||||||
xcu = XPath('//manifest:file-entry[@manifest:media-type="application/vnd.sun.star.configuration-data"]')(root)[0].get(
|
xcu = XPath('//manifest:file-entry[@manifest:media-type="application/vnd.sun.star.configuration-data"]')(root)[0].get(
|
||||||
'{%s}full-path' % NS_MAP['manifest'])
|
'{%s}full-path' % NS_MAP['manifest'])
|
||||||
for (dic, aff), locales in iteritems(parse_xcu(zf.open(xcu).read(), origin='')):
|
for (dic, aff), locales in iteritems(parse_xcu(zf.open(xcu).read(), origin='')):
|
||||||
|
@ -15,6 +15,7 @@ from lxml.builder import ElementMaker
|
|||||||
|
|
||||||
from calibre.constants import __appname__
|
from calibre.constants import __appname__
|
||||||
from calibre.db.view import sanitize_sort_field_name
|
from calibre.db.view import sanitize_sort_field_name
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.ebooks.metadata import fmt_sidx, authors_to_string, rating_to_stars
|
from calibre.ebooks.metadata import fmt_sidx, authors_to_string, rating_to_stars
|
||||||
from calibre.library.comments import comments_to_html
|
from calibre.library.comments import comments_to_html
|
||||||
from calibre import guess_type, prepare_string_for_xml as xml
|
from calibre import guess_type, prepare_string_for_xml as xml
|
||||||
@ -123,7 +124,7 @@ def html_to_lxml(raw):
|
|||||||
root.set('xmlns', "http://www.w3.org/1999/xhtml")
|
root.set('xmlns', "http://www.w3.org/1999/xhtml")
|
||||||
raw = etree.tostring(root, encoding=None)
|
raw = etree.tostring(root, encoding=None)
|
||||||
try:
|
try:
|
||||||
return etree.fromstring(raw)
|
return safe_xml_fromstring(raw)
|
||||||
except:
|
except:
|
||||||
for x in root.iterdescendants():
|
for x in root.iterdescendants():
|
||||||
remove = []
|
remove = []
|
||||||
@ -134,7 +135,7 @@ def html_to_lxml(raw):
|
|||||||
del x.attrib[a]
|
del x.attrib[a]
|
||||||
raw = etree.tostring(root, encoding=None)
|
raw = etree.tostring(root, encoding=None)
|
||||||
try:
|
try:
|
||||||
return etree.fromstring(raw)
|
return safe_xml_fromstring(raw)
|
||||||
except:
|
except:
|
||||||
from calibre.ebooks.oeb.parse_utils import _html4_parse
|
from calibre.ebooks.oeb.parse_utils import _html4_parse
|
||||||
return _html4_parse(raw)
|
return _html4_parse(raw)
|
||||||
|
@ -11,9 +11,8 @@ __docformat__ = 'restructuredtext en'
|
|||||||
|
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
|
|
||||||
from lxml import etree
|
|
||||||
|
|
||||||
from calibre import browser
|
from calibre import browser
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.utils.opensearch.url import URL
|
from calibre.utils.opensearch.url import URL
|
||||||
|
|
||||||
|
|
||||||
@ -38,7 +37,7 @@ class Description(object):
|
|||||||
'''
|
'''
|
||||||
br = browser()
|
br = browser()
|
||||||
with closing(br.open(url, timeout=15)) as f:
|
with closing(br.open(url, timeout=15)) as f:
|
||||||
doc = etree.fromstring(f.read())
|
doc = safe_xml_fromstring(f.read())
|
||||||
|
|
||||||
# version 1.1 has repeating Url elements.
|
# version 1.1 has repeating Url elements.
|
||||||
self.urls = []
|
self.urls = []
|
||||||
|
19
src/calibre/utils/xml_parse.py
Normal file
19
src/calibre/utils/xml_parse.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
# vim:fileencoding=utf-8
|
||||||
|
# License: GPL v3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
# resolve_entities is turned off as entities can cause
|
||||||
|
# reads of local files, for example:
|
||||||
|
# <!DOCTYPE foo [ <!ENTITY passwd SYSTEM "file:///etc/passwd" >]>
|
||||||
|
SAFE_XML_PARSER = etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
|
||||||
|
SAFE_XML_PARSER_NO_RECOVER = etree.XMLParser(recover=False, no_network=True, resolve_entities=False)
|
||||||
|
fs = etree.fromstring
|
||||||
|
|
||||||
|
|
||||||
|
def safe_xml_fromstring(string_or_bytes, recover=True):
|
||||||
|
return fs(string_or_bytes, SAFE_XML_PARSER if recover else SAFE_XML_PARSER_NO_RECOVER)
|
@ -14,6 +14,7 @@ from lxml import etree
|
|||||||
from lxml.builder import ElementMaker
|
from lxml.builder import ElementMaker
|
||||||
|
|
||||||
from calibre import force_unicode
|
from calibre import force_unicode
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.constants import numeric_version
|
from calibre.constants import numeric_version
|
||||||
from calibre.utils.iso8601 import parse_iso8601
|
from calibre.utils.iso8601 import parse_iso8601
|
||||||
from calibre.utils.date import now as nowf, utcnow, local_tz, isoformat, EPOCH, UNDEFINED_DATE
|
from calibre.utils.date import now as nowf, utcnow, local_tz, isoformat, EPOCH, UNDEFINED_DATE
|
||||||
@ -124,7 +125,7 @@ def get_custom_recipe_collection(*args):
|
|||||||
import traceback
|
import traceback
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
continue
|
continue
|
||||||
return etree.fromstring(serialize_collection(rmap))
|
return safe_xml_fromstring(serialize_collection(rmap))
|
||||||
|
|
||||||
|
|
||||||
def update_custom_recipe(id_, title, script):
|
def update_custom_recipe(id_, title, script):
|
||||||
@ -287,7 +288,7 @@ class SchedulerConfig(object):
|
|||||||
if os.access(self.conf_path, os.R_OK):
|
if os.access(self.conf_path, os.R_OK):
|
||||||
with ExclusiveFile(self.conf_path) as f:
|
with ExclusiveFile(self.conf_path) as f:
|
||||||
try:
|
try:
|
||||||
self.root = etree.fromstring(f.read())
|
self.root = safe_xml_fromstring(f.read())
|
||||||
except:
|
except:
|
||||||
print('Failed to read recipe scheduler config')
|
print('Failed to read recipe scheduler config')
|
||||||
import traceback
|
import traceback
|
||||||
|
Loading…
x
Reference in New Issue
Block a user