mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-09-29 15:31:08 -04:00
Replace various other uses of etree.fromstring
etree.fromstring is terminally broken on Windows when passed unicode objects containing non-BMP characters.
This commit is contained in:
parent
a4c0f08a0d
commit
07068b3049
@ -164,11 +164,11 @@ class TXTInput(InputFormatPlugin):
|
|||||||
with open(x, 'rb') as tf:
|
with open(x, 'rb') as tf:
|
||||||
txt += tf.read() + b'\n\n'
|
txt += tf.read() + b'\n\n'
|
||||||
if os.path.exists('metadata.opf'):
|
if os.path.exists('metadata.opf'):
|
||||||
from lxml import etree
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
with open('metadata.opf', 'rb') as mf:
|
with open('metadata.opf', 'rb') as mf:
|
||||||
raw = mf.read()
|
raw = mf.read()
|
||||||
try:
|
try:
|
||||||
root = etree.fromstring(raw)
|
root = safe_xml_fromstring(raw)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
|
@ -24,7 +24,7 @@ import json
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from lxml.etree import fromstring, tostring
|
from lxml.etree import tostring
|
||||||
from odf.draw import Frame as odFrame
|
from odf.draw import Frame as odFrame
|
||||||
from odf.draw import Image as odImage
|
from odf.draw import Image as odImage
|
||||||
from odf.namespaces import DCNS, METANS, OFFICENS
|
from odf.namespaces import DCNS, METANS, OFFICENS
|
||||||
@ -34,6 +34,7 @@ from calibre.ebooks.metadata import MetaInformation, authors_to_string, check_is
|
|||||||
from calibre.utils.date import isoformat, parse_date
|
from calibre.utils.date import isoformat, parse_date
|
||||||
from calibre.utils.imghdr import identify
|
from calibre.utils.imghdr import identify
|
||||||
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
|
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
from calibre.utils.zipfile import ZipFile, safe_replace
|
from calibre.utils.zipfile import ZipFile, safe_replace
|
||||||
from polyglot.builtins import as_unicode
|
from polyglot.builtins import as_unicode
|
||||||
|
|
||||||
@ -74,7 +75,7 @@ def get_metadata(stream, extract_cover=True):
|
|||||||
|
|
||||||
with ZipFile(stream) as zf:
|
with ZipFile(stream) as zf:
|
||||||
meta = zf.read('meta.xml')
|
meta = zf.read('meta.xml')
|
||||||
root = fromstring(meta)
|
root = safe_xml_fromstring(meta)
|
||||||
|
|
||||||
def find(field):
|
def find(field):
|
||||||
ns, tag = fields[field]
|
ns, tag = fields[field]
|
||||||
@ -175,7 +176,7 @@ def set_metadata(stream, mi):
|
|||||||
|
|
||||||
|
|
||||||
def _set_metadata(raw, mi):
|
def _set_metadata(raw, mi):
|
||||||
root = fromstring(raw)
|
root = safe_xml_fromstring(raw)
|
||||||
namespaces = {'office': OFFICENS, 'meta': METANS, 'dc': DCNS}
|
namespaces = {'office': OFFICENS, 'meta': METANS, 'dc': DCNS}
|
||||||
nsrmap = {v: k for k, v in namespaces.items()}
|
nsrmap = {v: k for k, v in namespaces.items()}
|
||||||
|
|
||||||
|
@ -75,8 +75,7 @@ def XPath(x):
|
|||||||
|
|
||||||
|
|
||||||
def to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{
|
def to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{
|
||||||
from lxml import etree
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
|
|
||||||
# total_results = XPath('//openSearch:totalResults')
|
# total_results = XPath('//openSearch:totalResults')
|
||||||
# start_index = XPath('//openSearch:startIndex')
|
# start_index = XPath('//openSearch:startIndex')
|
||||||
# items_per_page = XPath('//openSearch:itemsPerPage')
|
# items_per_page = XPath('//openSearch:itemsPerPage')
|
||||||
@ -111,10 +110,7 @@ def to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{
|
|||||||
with open(os.path.join(tempfile.gettempdir(), 'Google-' + details_url.split('/')[-1] + '.xml'), 'wb') as f:
|
with open(os.path.join(tempfile.gettempdir(), 'Google-' + details_url.split('/')[-1] + '.xml'), 'wb') as f:
|
||||||
f.write(raw)
|
f.write(raw)
|
||||||
print('Book details saved to:', f.name, file=sys.stderr)
|
print('Book details saved to:', f.name, file=sys.stderr)
|
||||||
feed = etree.fromstring(
|
feed = safe_xml_fromstring(xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
|
||||||
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
|
|
||||||
parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
|
|
||||||
)
|
|
||||||
return entry(feed)[0]
|
return entry(feed)[0]
|
||||||
|
|
||||||
if isinstance(entry_, str):
|
if isinstance(entry_, str):
|
||||||
@ -494,7 +490,7 @@ class GoogleBooks(Source):
|
|||||||
identifiers={},
|
identifiers={},
|
||||||
timeout=30
|
timeout=30
|
||||||
):
|
):
|
||||||
from lxml import etree
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
entry = XPath('//atom:entry')
|
entry = XPath('//atom:entry')
|
||||||
identifiers = identifiers.copy()
|
identifiers = identifiers.copy()
|
||||||
br = self.browser
|
br = self.browser
|
||||||
@ -525,10 +521,7 @@ class GoogleBooks(Source):
|
|||||||
return False, as_unicode(e)
|
return False, as_unicode(e)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
feed = etree.fromstring(
|
feed = safe_xml_fromstring(xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
|
||||||
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
|
|
||||||
parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
|
|
||||||
)
|
|
||||||
return True, entry(feed)
|
return True, entry(feed)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.exception('Failed to parse identify results')
|
log.exception('Failed to parse identify results')
|
||||||
|
@ -1425,6 +1425,7 @@ class Page:
|
|||||||
class PDFDocument:
|
class PDFDocument:
|
||||||
|
|
||||||
def __init__(self, xml, opts, log):
|
def __init__(self, xml, opts, log):
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
# from calibre.rpdb import set_trace; set_trace()
|
# from calibre.rpdb import set_trace; set_trace()
|
||||||
|
|
||||||
self.opts, self.log = opts, log
|
self.opts, self.log = opts, log
|
||||||
@ -1435,8 +1436,7 @@ class PDFDocument:
|
|||||||
if self.opts.pdf_footer_regex is None:
|
if self.opts.pdf_footer_regex is None:
|
||||||
self.opts.pdf_footer_regex = '' # Do nothing
|
self.opts.pdf_footer_regex = '' # Do nothing
|
||||||
|
|
||||||
parser = etree.XMLParser(recover=True)
|
self.root = safe_xml_fromstring(xml)
|
||||||
self.root = etree.fromstring(xml, parser=parser)
|
|
||||||
idc = iter(range(sys.maxsize))
|
idc = iter(range(sys.maxsize))
|
||||||
self.stats = DocStats()
|
self.stats = DocStats()
|
||||||
|
|
||||||
|
@ -17,7 +17,6 @@ except ImportError:
|
|||||||
|
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
|
|
||||||
from lxml import etree
|
|
||||||
from qt.core import QUrl
|
from qt.core import QUrl
|
||||||
|
|
||||||
from calibre import browser, prints, url_slash_cleaner
|
from calibre import browser, prints, url_slash_cleaner
|
||||||
@ -27,6 +26,7 @@ from calibre.gui2.store import StorePlugin
|
|||||||
from calibre.gui2.store.basic_config import BasicStoreConfig
|
from calibre.gui2.store.basic_config import BasicStoreConfig
|
||||||
from calibre.gui2.store.search_result import SearchResult
|
from calibre.gui2.store.search_result import SearchResult
|
||||||
from calibre.gui2.store.web_store_dialog import WebStoreDialog
|
from calibre.gui2.store.web_store_dialog import WebStoreDialog
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
|
|
||||||
|
|
||||||
class LitResStore(BasicStoreConfig, StorePlugin):
|
class LitResStore(BasicStoreConfig, StorePlugin):
|
||||||
@ -65,7 +65,7 @@ class LitResStore(BasicStoreConfig, StorePlugin):
|
|||||||
ungzipResponse(r, br)
|
ungzipResponse(r, br)
|
||||||
raw= xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0]
|
raw= xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0]
|
||||||
|
|
||||||
doc = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
|
doc = safe_xml_fromstring(raw)
|
||||||
for data in doc.xpath('//*[local-name() = "fb2-book"]'):
|
for data in doc.xpath('//*[local-name() = "fb2-book"]'):
|
||||||
if counter <= 0:
|
if counter <= 0:
|
||||||
break
|
break
|
||||||
|
@ -10,14 +10,13 @@ __docformat__ = 'restructuredtext en'
|
|||||||
import mimetypes
|
import mimetypes
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
|
|
||||||
from lxml import etree
|
|
||||||
|
|
||||||
from calibre import browser
|
from calibre import browser
|
||||||
from calibre.gui2.store.basic_config import BasicStoreConfig
|
from calibre.gui2.store.basic_config import BasicStoreConfig
|
||||||
from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
|
from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
|
||||||
from calibre.gui2.store.search_result import SearchResult
|
from calibre.gui2.store.search_result import SearchResult
|
||||||
from calibre.utils.opensearch.description import Description
|
from calibre.utils.opensearch.description import Description
|
||||||
from calibre.utils.opensearch.query import Query
|
from calibre.utils.opensearch.query import Query
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
|
|
||||||
|
|
||||||
def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://www.manybooks.net/opds/'):
|
def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://www.manybooks.net/opds/'):
|
||||||
@ -45,8 +44,7 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://
|
|||||||
br = browser()
|
br = browser()
|
||||||
with closing(br.open(url, timeout=timeout)) as f:
|
with closing(br.open(url, timeout=timeout)) as f:
|
||||||
raw_data = f.read()
|
raw_data = f.read()
|
||||||
raw_data = raw_data.decode('utf-8', 'replace')
|
doc = safe_xml_fromstring(raw_data)
|
||||||
doc = etree.fromstring(raw_data, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
|
|
||||||
for data in doc.xpath('//*[local-name() = "entry"]'):
|
for data in doc.xpath('//*[local-name() = "entry"]'):
|
||||||
if counter <= 0:
|
if counter <= 0:
|
||||||
break
|
break
|
||||||
@ -71,7 +69,7 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://
|
|||||||
|
|
||||||
# Follow the detail link to get the rest of the info.
|
# Follow the detail link to get the rest of the info.
|
||||||
with closing(br.open(detail_href, timeout=timeout/4)) as df:
|
with closing(br.open(detail_href, timeout=timeout/4)) as df:
|
||||||
ddoc = etree.fromstring(df.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
|
ddoc = safe_xml_fromstring(df.read())
|
||||||
ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
|
ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
|
||||||
if ddata:
|
if ddata:
|
||||||
ddata = ddata[0]
|
ddata = ddata[0]
|
||||||
|
@ -763,8 +763,10 @@ def read_text_from_container(container, target_lang=''):
|
|||||||
|
|
||||||
def read_alt_text_from_xmp(xmp, target_lang='') -> str:
|
def read_alt_text_from_xmp(xmp, target_lang='') -> str:
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
|
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||||
try:
|
try:
|
||||||
root = etree.fromstring(xmp)
|
root = safe_xml_fromstring(xmp)
|
||||||
except Exception:
|
except Exception:
|
||||||
return ''
|
return ''
|
||||||
# print(etree.tostring(root, encoding='utf-8', pretty_print=True).decode())
|
# print(etree.tostring(root, encoding='utf-8', pretty_print=True).decode())
|
||||||
|
Loading…
x
Reference in New Issue
Block a user