Content server OPDS feeds: Fix error if the metadata for a book contains particular Unicode characters. Fixes #1897410 [Private bug](https://bugs.launchpad.net/calibre/+bug/1897410)

libxml's parser chokes on such characters on Windows. Use my html5-parser instead.
This commit is contained in:
Kovid Goyal 2020-10-03 15:07:25 +05:30
parent 0e25387a9f
commit e258ae93bd
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -7,31 +7,29 @@ __copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import hashlib import hashlib
from functools import partial
from collections import OrderedDict, namedtuple from collections import OrderedDict, namedtuple
from functools import partial
from lxml import etree, html from html5_parser import parse
from lxml import etree
from lxml.builder import ElementMaker from lxml.builder import ElementMaker
from calibre import force_unicode, guess_type, prepare_string_for_xml as xml
from calibre.constants import __appname__ from calibre.constants import __appname__
from calibre.db.view import sanitize_sort_field_name from calibre.db.view import sanitize_sort_field_name
from calibre.utils.xml_parse import safe_xml_fromstring from calibre.ebooks.metadata import authors_to_string, fmt_sidx, rating_to_stars
from calibre.ebooks.metadata import fmt_sidx, authors_to_string, rating_to_stars
from calibre.library.comments import comments_to_html from calibre.library.comments import comments_to_html
from calibre import guess_type, prepare_string_for_xml as xml from calibre.srv.errors import HTTPInternalServerError, HTTPNotFound
from calibre.utils.icu import sort_key
from calibre.utils.date import as_utc, timestampfromdt, is_date_undefined
from calibre.utils.search_query_parser import ParseException
from calibre.utils.config import prefs
from calibre import force_unicode
from calibre.srv.errors import HTTPNotFound, HTTPInternalServerError
from calibre.srv.routes import endpoint
from calibre.srv.http_request import parse_uri from calibre.srv.http_request import parse_uri
from calibre.srv.utils import get_library_data, http_date, Offsets from calibre.srv.routes import endpoint
from polyglot.builtins import iteritems, unicode_type, filter, as_bytes from calibre.srv.utils import Offsets, get_library_data, http_date
from polyglot.urllib import urlencode, unquote_plus from calibre.utils.config import prefs
from calibre.utils.date import as_utc, is_date_undefined, timestampfromdt
from calibre.utils.icu import sort_key
from calibre.utils.search_query_parser import ParseException
from calibre.utils.xml_parse import safe_xml_fromstring
from polyglot.binary import as_hex_unicode, from_hex_unicode from polyglot.binary import as_hex_unicode, from_hex_unicode
from polyglot.builtins import as_bytes, filter, iteritems, unicode_type
from polyglot.urllib import unquote_plus, urlencode
def atom(ctx, rd, endpoint, output): def atom(ctx, rd, endpoint, output):
@ -42,7 +40,6 @@ def atom(ctx, rd, endpoint, output):
elif isinstance(output, unicode_type): elif isinstance(output, unicode_type):
ans = output.encode('utf-8') ans = output.encode('utf-8')
else: else:
from lxml import etree
ans = etree.tostring(output, encoding='utf-8', xml_declaration=True, pretty_print=True) ans = etree.tostring(output, encoding='utf-8', xml_declaration=True, pretty_print=True)
return ans return ans
@ -120,7 +117,8 @@ PREVIOUS_LINK = partial(NAVLINK, rel='previous')
def html_to_lxml(raw): def html_to_lxml(raw):
raw = '<div>%s</div>'%raw raw = '<div>%s</div>'%raw
root = html.fragment_fromstring(raw) root = parse(raw, keep_doctype=False, namespace_elements=False, maybe_xhtml=False, sanitize_names=True)
root = next(root.iterdescendants('div'))
root.set('xmlns', "http://www.w3.org/1999/xhtml") root.set('xmlns', "http://www.w3.org/1999/xhtml")
raw = etree.tostring(root, encoding=None) raw = etree.tostring(root, encoding=None)
try: try: