mirror of https://github.com/kovidgoyal/calibre.git
EPUB Input: Make parsing of toc.ncx more robust. Fixes #7170 (Some epub hierarchical tables of contents are not interpreted correctly)
commit c2d0a57a41
parent 1022ddd101
@@ -2,7 +2,7 @@
 __license__ = 'GPL v3'
 __copyright__ = '2010, Kovid Goyal <kovid at kovidgoyal.net>'
 
-import os, glob, re
+import os, glob, re, functools
 from urlparse import urlparse
 from urllib import unquote
 from uuid import uuid4
@@ -11,7 +11,7 @@ from lxml import etree
 from lxml.builder import ElementMaker
 
 from calibre.constants import __appname__, __version__
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.ebooks.chardet import xml_to_unicode
 
 NCX_NS = "http://www.daisy.org/z3986/2005/ncx/"
@@ -26,14 +26,6 @@ E = ElementMaker(namespace=NCX_NS, nsmap=NSMAP)
 
 C = ElementMaker(namespace=CALIBRE_NS, nsmap=NSMAP)
 
-class NCXSoup(BeautifulStoneSoup):
-
-    NESTABLE_TAGS = {'navpoint':[]}
-
-    def __init__(self, raw):
-        BeautifulStoneSoup.__init__(self, raw,
-                convertEntities=BeautifulSoup.HTML_ENTITIES,
-                selfClosingTags=['meta', 'content'])
 
 class TOC(list):
 
@@ -166,40 +158,60 @@ class TOC(list):
 
     def read_ncx_toc(self, toc):
         self.base_path = os.path.dirname(toc)
-        raw = xml_to_unicode(open(toc, 'rb').read(), assume_utf8=True)[0]
-        soup = NCXSoup(raw)
+        raw = xml_to_unicode(open(toc, 'rb').read(), assume_utf8=True,
+                strip_encoding_pats=True)[0]
+        root = etree.fromstring(raw, parser=etree.XMLParser(recover=True,
+            no_network=True))
+        xpn = {'re': 'http://exslt.org/regular-expressions'}
+        XPath = functools.partial(etree.XPath, namespaces=xpn)
+
+        def get_attr(node, default=None, attr='playorder'):
+            for name, val in node.attrib.items():
+                if name and val and name.lower().endswith(attr):
+                    return val
+            return default
+
+        nl_path = XPath('./*[re:match(local-name(), "navlabel$", "i")]')
+        txt_path = XPath('./*[re:match(local-name(), "text$", "i")]')
+        content_path = XPath('./*[re:match(local-name(), "content$", "i")]')
+        np_path = XPath('./*[re:match(local-name(), "navpoint$", "i")]')
 
         def process_navpoint(np, dest):
-            play_order = np.get('playOrder', None)
-            if play_order is None:
-                play_order = int(np.get('playorder', 1))
+            try:
+                play_order = int(get_attr(np, 1))
+            except:
+                play_order = 1
             href = fragment = text = None
-            nl = np.find(re.compile('navlabel'))
-            if nl is not None:
+            nl = nl_path(np)
+            if nl:
+                nl = nl[0]
                 text = u''
-                for txt in nl.findAll(re.compile('text')):
-                    text += u''.join([unicode(s) for s in txt.findAll(text=True)])
-                content = np.find(re.compile('content'))
-                if content is None or not content.has_key('src') or not txt:
+                for txt in txt_path(nl):
+                    text += etree.tostring(txt, method='text',
+                            encoding=unicode, with_tail=False)
+                content = content_path(np)
+                if not content or not text:
+                    return
+                content = content[0]
+                src = get_attr(content, attr='src')
+                if src is None:
                     return
-                purl = urlparse(unquote(content['src']))
+
+                purl = urlparse(unquote(content.get('src')))
                 href, fragment = purl[2], purl[5]
                 nd = dest.add_item(href, fragment, text)
                 nd.play_order = play_order
 
-                for c in np:
-                    if 'navpoint' in getattr(c, 'name', ''):
-                        process_navpoint(c, nd)
+                for c in np_path(np):
+                    process_navpoint(c, nd)
 
-        nm = soup.find(re.compile('navmap'))
-        if nm is None:
+        nm = XPath('//*[re:match(local-name(), "navmap$", "i")]')(root)
+        if not nm:
             raise ValueError('NCX files must have a <navmap> element.')
+        nm = nm[0]
 
-        for elem in nm:
-            if 'navpoint' in getattr(elem, 'name', ''):
-                process_navpoint(elem, self)
+        for child in np_path(nm):
+            process_navpoint(child, self)
 
 
     def read_html_toc(self, toc):
         self.base_path = os.path.dirname(toc)