mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Do not use BeautifulSoup to read HTML tocs
This commit is contained in:
parent
a3c0ce3b24
commit
296eda53ea
@ -10,7 +10,6 @@ from lxml import etree
|
||||
from lxml.builder import ElementMaker
|
||||
|
||||
from calibre.constants import __appname__, __version__
|
||||
from calibre.ebooks.BeautifulSoup import BeautifulSoup
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.utils.cleantext import clean_xml_chars
|
||||
from polyglot.builtins import unicode_type
|
||||
@ -29,6 +28,26 @@ E = ElementMaker(namespace=NCX_NS, nsmap=NSMAP)
|
||||
C = ElementMaker(namespace=CALIBRE_NS, nsmap=NSMAP)
|
||||
|
||||
|
||||
def parse_html_toc(data):
    '''
    Yield one ``(href, fragment, text)`` tuple per link in an HTML Table of
    Contents.

    :param data: The HTML markup. ``bytes`` input is first converted to text
                 with ``xml_to_unicode``; anything else is passed on as is.
    :return: A generator. For every element whose local name is ``a`` and
             that has an ``href`` attribute it yields the stripped path
             component of the unquoted href, the stripped fragment (``None``
             when the href has no fragment) and the text contained in the
             link element.
    '''
    # Imported lazily so that html5_parser is only loaded when an HTML ToC
    # is actually read.
    from html5_parser import parse
    from calibre.utils.cleantext import clean_xml_chars
    from lxml import etree
    if isinstance(data, bytes):
        data = xml_to_unicode(data, strip_encoding_pats=True, resolve_entities=True)[0]
    root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True)
    # local-name() is used so that the match works whether or not the parsed
    # elements are in a namespace.
    for a in root.xpath('//*[@href and local-name()="a"]'):
        purl = urlparse(unquote(a.get('href')))
        # Indices 2 and 5 of the urlparse() result are the path and the
        # fragment respectively.
        href = purl[2].strip()
        fragment = purl[5]
        fragment = fragment.strip() if fragment else None

        # with_tail=False: lxml serializes an element's tail text by
        # default, which would append whatever follows </a> in the document
        # to the title of the ToC entry.
        txt = etree.tostring(a, method='text', encoding='unicode', with_tail=False)
        yield href, fragment, txt
|
||||
|
||||
|
||||
class TOC(list):
|
||||
|
||||
def __init__(self, href=None, fragment=None, text=None, parent=None,
|
||||
@ -217,19 +236,7 @@ class TOC(list):
|
||||
|
||||
def read_html_toc(self, toc):
|
||||
self.base_path = os.path.dirname(toc)
|
||||
soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
|
||||
for a in soup.findAll('a'):
|
||||
if not a.has_key('href'): # noqa
|
||||
continue
|
||||
purl = urlparse(unquote(a['href']))
|
||||
href, fragment = purl[2], purl[5]
|
||||
if not fragment:
|
||||
fragment = None
|
||||
else:
|
||||
fragment = fragment.strip()
|
||||
href = href.strip()
|
||||
|
||||
txt = ''.join([unicode_type(s).strip() for s in a.findAll(text=True)])
|
||||
for href, fragment, txt in parse_html_toc(lopen(toc, 'rb').read()):
|
||||
add = True
|
||||
for i in self.flat():
|
||||
if i.href == href and i.fragment == fragment:
|
||||
|
Loading…
x
Reference in New Issue
Block a user