Do not use BeautifulSoup to read HTML tocs

This commit is contained in:
Kovid Goyal 2019-03-24 13:15:50 +05:30
parent a3c0ce3b24
commit 296eda53ea
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -10,7 +10,6 @@ from lxml import etree
from lxml.builder import ElementMaker from lxml.builder import ElementMaker
from calibre.constants import __appname__, __version__ from calibre.constants import __appname__, __version__
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.cleantext import clean_xml_chars from calibre.utils.cleantext import clean_xml_chars
from polyglot.builtins import unicode_type from polyglot.builtins import unicode_type
@ -29,6 +28,26 @@ E = ElementMaker(namespace=NCX_NS, nsmap=NSMAP)
C = ElementMaker(namespace=CALIBRE_NS, nsmap=NSMAP) C = ElementMaker(namespace=CALIBRE_NS, nsmap=NSMAP)
def parse_html_toc(data):
from html5_parser import parse
from calibre.utils.cleantext import clean_xml_chars
from lxml import etree
if isinstance(data, bytes):
data = xml_to_unicode(data, strip_encoding_pats=True, resolve_entities=True)[0]
root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True)
for a in root.xpath('//*[@href and local-name()="a"]'):
purl = urlparse(unquote(a.get('href')))
href, fragment = purl[2], purl[5]
if not fragment:
fragment = None
else:
fragment = fragment.strip()
href = href.strip()
txt = etree.tostring(a, method='text', encoding='unicode')
yield href, fragment, txt
class TOC(list): class TOC(list):
def __init__(self, href=None, fragment=None, text=None, parent=None, def __init__(self, href=None, fragment=None, text=None, parent=None,
@ -217,19 +236,7 @@ class TOC(list):
def read_html_toc(self, toc): def read_html_toc(self, toc):
self.base_path = os.path.dirname(toc) self.base_path = os.path.dirname(toc)
soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES) for href, fragment, txt in parse_html_toc(lopen(toc, 'rb').read()):
for a in soup.findAll('a'):
if not a.has_key('href'): # noqa
continue
purl = urlparse(unquote(a['href']))
href, fragment = purl[2], purl[5]
if not fragment:
fragment = None
else:
fragment = fragment.strip()
href = href.strip()
txt = ''.join([unicode_type(s).strip() for s in a.findAll(text=True)])
add = True add = True
for i in self.flat(): for i in self.flat():
if i.href == href and i.fragment == fragment: if i.href == href and i.fragment == fragment: