Implement support for parsing ToC from NCX files

2025-07-09 03:04:10 -04:00 · 2008-02-14 04:49:17 +00:00 · 2008-02-14 04:49:17 +00:00 · 2bc40380db
commit 2bc40380db
parent be22c9a3b0
1 changed files with 72 additions and 27 deletions
--- a/src/libprs500/ebooks/metadata/opf.py
+++ b/src/libprs500/ebooks/metadata/opf.py
@ -14,7 +14,7 @@
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 '''Read/Write metadata from Open Packaging Format (.opf) files.'''
-import sys, re, os
+import sys, re, os, glob
 from urllib import unquote
 from urlparse import urlparse
 import xml.dom.minidom as dom
@ -85,35 +85,80 @@ class TOC(list):
    def __init__(self, opfreader, cwd):
        self.toc = toc = None
-        try:
+        toc = opfreader.soup.find('spine', toc=True)
            toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
        except:
            for item in opfreader.manifest:
                if 'toc' in item.href.lower():
                    toc = item.href
                    break
        if toc is not None:
-            toc = urlparse(unquote(toc))[2]
+            toc = toc['toc']
-            if not os.path.isabs(toc):
+        if toc is None:
                toc = os.path.join(cwd, toc)
            try:
-                if not os.path.exists(toc):
+                toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
                    bn  = os.path.basename(toc)
                    bn  = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
                    toc = os.path.join(os.path.dirname(toc), bn) 
                soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
                for a in soup.findAll('a'):
                    if not a.has_key('href'):
                        continue
                    purl = urlparse(unquote(a['href']))
                    href, fragment = purl[2], purl[5]
                    if not os.path.isabs(href):
                        href = os.path.join(cwd, href)
                    txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
                    self.append((href, fragment, txt))
                self.toc = toc
            except:
-                pass
+                for item in opfreader.manifest:
                    if 'toc' in item.href.lower():
                        toc = item.href
                        break
        if toc is not None:
            if toc.lower() != 'ncx':
                toc = urlparse(unquote(toc))[2]
                if not os.path.isabs(toc):
                    toc = os.path.join(cwd, toc)
                try:
                    if not os.path.exists(toc):
                        bn  = os.path.basename(toc)
                        bn  = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
                        toc = os.path.join(os.path.dirname(toc), bn)
                    self.read_html_toc(toc, cwd)
                    self.toc = toc
                except:
                    pass
            else:
                cwd = os.path.abspath(cwd)
                m = glob.glob(os.path.join(cwd, '*.ncx'))
                if m:
                    toc = m[0]
                    try:
                        self.read_ncx_toc(toc)
                        self.toc = toc
                    except:
                        raise
                        pass
    def read_ncx_toc(self, toc):
        bdir = os.path.dirname(toc)
        soup = BeautifulStoneSoup(open(toc, 'rb').read(),
                                  convertEntities=BeautifulSoup.HTML_ENTITIES)
        elems = soup.findAll('navpoint')
        elems.sort(cmp=lambda x, y: cmp(int(x['playorder']), int(y['playorder'])))
        for elem in elems:
            txt = u''
            for nl in elem.findAll('navlabel'):
                for text in nl.findAll('text'):
                    txt += ''.join([unicode(s) for s in text.findAll(text=True)])
            content = elem.find('content')
            if content is None or not content.has_key('src') or not txt:
                continue
            purl = urlparse(unquote(content['src']))
            href, fragment = purl[2], purl[5]
            if not os.path.isabs(href):
                href = os.path.join(bdir, href)
            self.append((href, fragment, txt))
    def read_html_toc(self, toc, cwd):
        soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
        for a in soup.findAll('a'):
            if not a.has_key('href'):
                continue
            purl = urlparse(unquote(a['href']))
            href, fragment = purl[2], purl[5]
            if not os.path.isabs(href):
                href = os.path.join(cwd, href)
            txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
            self.append((href, fragment, txt))
 class standard_field(object):