mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-09-29 15:31:08 -04:00
Replace various other uses of etree.fromstring with safe_xml_fromstring
etree.fromstring is terminally broken on Windows when passed unicode objects that contain non-BMP characters.
This commit is contained in:
parent
a4c0f08a0d
commit
07068b3049
@ -164,11 +164,11 @@ class TXTInput(InputFormatPlugin):
|
||||
with open(x, 'rb') as tf:
|
||||
txt += tf.read() + b'\n\n'
|
||||
if os.path.exists('metadata.opf'):
|
||||
from lxml import etree
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
with open('metadata.opf', 'rb') as mf:
|
||||
raw = mf.read()
|
||||
try:
|
||||
root = etree.fromstring(raw)
|
||||
root = safe_xml_fromstring(raw)
|
||||
except Exception:
|
||||
pass
|
||||
else:
|
||||
|
@ -24,7 +24,7 @@ import json
|
||||
import os
|
||||
import re
|
||||
|
||||
from lxml.etree import fromstring, tostring
|
||||
from lxml.etree import tostring
|
||||
from odf.draw import Frame as odFrame
|
||||
from odf.draw import Image as odImage
|
||||
from odf.namespaces import DCNS, METANS, OFFICENS
|
||||
@ -34,6 +34,7 @@ from calibre.ebooks.metadata import MetaInformation, authors_to_string, check_is
|
||||
from calibre.utils.date import isoformat, parse_date
|
||||
from calibre.utils.imghdr import identify
|
||||
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
from calibre.utils.zipfile import ZipFile, safe_replace
|
||||
from polyglot.builtins import as_unicode
|
||||
|
||||
@ -74,7 +75,7 @@ def get_metadata(stream, extract_cover=True):
|
||||
|
||||
with ZipFile(stream) as zf:
|
||||
meta = zf.read('meta.xml')
|
||||
root = fromstring(meta)
|
||||
root = safe_xml_fromstring(meta)
|
||||
|
||||
def find(field):
|
||||
ns, tag = fields[field]
|
||||
@ -175,7 +176,7 @@ def set_metadata(stream, mi):
|
||||
|
||||
|
||||
def _set_metadata(raw, mi):
|
||||
root = fromstring(raw)
|
||||
root = safe_xml_fromstring(raw)
|
||||
namespaces = {'office': OFFICENS, 'meta': METANS, 'dc': DCNS}
|
||||
nsrmap = {v: k for k, v in namespaces.items()}
|
||||
|
||||
|
@ -75,8 +75,7 @@ def XPath(x):
|
||||
|
||||
|
||||
def to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{
|
||||
from lxml import etree
|
||||
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
# total_results = XPath('//openSearch:totalResults')
|
||||
# start_index = XPath('//openSearch:startIndex')
|
||||
# items_per_page = XPath('//openSearch:itemsPerPage')
|
||||
@ -111,10 +110,7 @@ def to_metadata(browser, log, entry_, timeout, running_a_test=False): # {{{
|
||||
with open(os.path.join(tempfile.gettempdir(), 'Google-' + details_url.split('/')[-1] + '.xml'), 'wb') as f:
|
||||
f.write(raw)
|
||||
print('Book details saved to:', f.name, file=sys.stderr)
|
||||
feed = etree.fromstring(
|
||||
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
|
||||
parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
|
||||
)
|
||||
feed = safe_xml_fromstring(xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
|
||||
return entry(feed)[0]
|
||||
|
||||
if isinstance(entry_, str):
|
||||
@ -494,7 +490,7 @@ class GoogleBooks(Source):
|
||||
identifiers={},
|
||||
timeout=30
|
||||
):
|
||||
from lxml import etree
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
entry = XPath('//atom:entry')
|
||||
identifiers = identifiers.copy()
|
||||
br = self.browser
|
||||
@ -525,10 +521,7 @@ class GoogleBooks(Source):
|
||||
return False, as_unicode(e)
|
||||
|
||||
try:
|
||||
feed = etree.fromstring(
|
||||
xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
|
||||
parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
|
||||
)
|
||||
feed = safe_xml_fromstring(xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
|
||||
return True, entry(feed)
|
||||
except Exception as e:
|
||||
log.exception('Failed to parse identify results')
|
||||
|
@ -1425,6 +1425,7 @@ class Page:
|
||||
class PDFDocument:
|
||||
|
||||
def __init__(self, xml, opts, log):
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
# from calibre.rpdb import set_trace; set_trace()
|
||||
|
||||
self.opts, self.log = opts, log
|
||||
@ -1435,8 +1436,7 @@ class PDFDocument:
|
||||
if self.opts.pdf_footer_regex is None:
|
||||
self.opts.pdf_footer_regex = '' # Do nothing
|
||||
|
||||
parser = etree.XMLParser(recover=True)
|
||||
self.root = etree.fromstring(xml, parser=parser)
|
||||
self.root = safe_xml_fromstring(xml)
|
||||
idc = iter(range(sys.maxsize))
|
||||
self.stats = DocStats()
|
||||
|
||||
|
@ -17,7 +17,6 @@ except ImportError:
|
||||
|
||||
from contextlib import closing
|
||||
|
||||
from lxml import etree
|
||||
from qt.core import QUrl
|
||||
|
||||
from calibre import browser, prints, url_slash_cleaner
|
||||
@ -27,6 +26,7 @@ from calibre.gui2.store import StorePlugin
|
||||
from calibre.gui2.store.basic_config import BasicStoreConfig
|
||||
from calibre.gui2.store.search_result import SearchResult
|
||||
from calibre.gui2.store.web_store_dialog import WebStoreDialog
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
|
||||
|
||||
class LitResStore(BasicStoreConfig, StorePlugin):
|
||||
@ -65,7 +65,7 @@ class LitResStore(BasicStoreConfig, StorePlugin):
|
||||
ungzipResponse(r, br)
|
||||
raw= xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0]
|
||||
|
||||
doc = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
|
||||
doc = safe_xml_fromstring(raw)
|
||||
for data in doc.xpath('//*[local-name() = "fb2-book"]'):
|
||||
if counter <= 0:
|
||||
break
|
||||
|
@ -10,14 +10,13 @@ __docformat__ = 'restructuredtext en'
|
||||
import mimetypes
|
||||
from contextlib import closing
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre import browser
|
||||
from calibre.gui2.store.basic_config import BasicStoreConfig
|
||||
from calibre.gui2.store.opensearch_store import OpenSearchOPDSStore
|
||||
from calibre.gui2.store.search_result import SearchResult
|
||||
from calibre.utils.opensearch.description import Description
|
||||
from calibre.utils.opensearch.query import Query
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
|
||||
|
||||
def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://www.manybooks.net/opds/'):
|
||||
@ -45,8 +44,7 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://
|
||||
br = browser()
|
||||
with closing(br.open(url, timeout=timeout)) as f:
|
||||
raw_data = f.read()
|
||||
raw_data = raw_data.decode('utf-8', 'replace')
|
||||
doc = etree.fromstring(raw_data, parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
|
||||
doc = safe_xml_fromstring(raw_data)
|
||||
for data in doc.xpath('//*[local-name() = "entry"]'):
|
||||
if counter <= 0:
|
||||
break
|
||||
@ -71,7 +69,7 @@ def search_manybooks(query, max_results=10, timeout=60, open_search_url='http://
|
||||
|
||||
# Follow the detail link to get the rest of the info.
|
||||
with closing(br.open(detail_href, timeout=timeout/4)) as df:
|
||||
ddoc = etree.fromstring(df.read(), parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False))
|
||||
ddoc = safe_xml_fromstring(df.read())
|
||||
ddata = ddoc.xpath('//*[local-name() = "entry"][1]')
|
||||
if ddata:
|
||||
ddata = ddata[0]
|
||||
|
@ -763,8 +763,10 @@ def read_text_from_container(container, target_lang=''):
|
||||
|
||||
def read_alt_text_from_xmp(xmp, target_lang='') -> str:
|
||||
from lxml import etree
|
||||
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
try:
|
||||
root = etree.fromstring(xmp)
|
||||
root = safe_xml_fromstring(xmp)
|
||||
except Exception:
|
||||
return ''
|
||||
# print(etree.tostring(root, encoding='utf-8', pretty_print=True).decode())
|
||||
|
Loading…
x
Reference in New Issue
Block a user