Implement support for parsing ToC from NCX files

2025-07-09 03:04:10 -04:00 · 2008-02-14 04:49:17 +00:00 · 2008-02-14 04:49:17 +00:00 · 2bc40380db
commit 2bc40380db
parent be22c9a3b0
1 changed files with 72 additions and 27 deletions
--- a/src/libprs500/ebooks/metadata/opf.py
+++ b/src/libprs500/ebooks/metadata/opf.py
@ -14,7 +14,7 @@
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 '''Read/Write metadata from Open Packaging Format (.opf) files.'''

-import sys, re, os
+import sys, re, os, glob
 from urllib import unquote
 from urlparse import urlparse
 import xml.dom.minidom as dom
@ -85,6 +85,10 @@ class TOC(list):
    
    def __init__(self, opfreader, cwd):
        self.toc = toc = None
+        toc = opfreader.soup.find('spine', toc=True)
+        if toc is not None:
+            toc = toc['toc']
+        if toc is None:
            try:
                toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
            except:
@ -92,7 +96,9 @@ class TOC(list):
                    if 'toc' in item.href.lower():
                        toc = item.href
                        break
+                            
        if toc is not None:
+            if toc.lower() != 'ncx':
                toc = urlparse(unquote(toc))[2]
                if not os.path.isabs(toc):
                    toc = os.path.join(cwd, toc)
@ -101,6 +107,48 @@ class TOC(list):
                        bn  = os.path.basename(toc)
                        bn  = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
                        toc = os.path.join(os.path.dirname(toc), bn)
+                    
+                    self.read_html_toc(toc, cwd)
+                    self.toc = toc
+                except:
+                    pass
+            else:
+                cwd = os.path.abspath(cwd)
+                m = glob.glob(os.path.join(cwd, '*.ncx'))
+                if m:
+                    toc = m[0]
+                    try:
+                        self.read_ncx_toc(toc)
+                        self.toc = toc
+                    except:
+                        raise
+                        pass
+            
+    def read_ncx_toc(self, toc):  # Parse the NCX file at path `toc`; append one (href, fragment, text) tuple to self per usable navpoint.
+        bdir = os.path.dirname(toc)  # directory of the NCX file; relative content paths are joined onto it below
+        soup = BeautifulStoneSoup(open(toc, 'rb').read(),  # NOTE(review): the file handle is never explicitly closed
+                                  convertEntities=BeautifulSoup.HTML_ENTITIES)
+        elems = soup.findAll('navpoint')
+        elems.sort(cmp=lambda x, y: cmp(int(x['playorder']), int(y['playorder'])))  # order by integer playorder; NOTE(review): presumably raises if the attribute is missing or non-numeric - confirm
+        # Build each entry's label from the text nodes under its navlabel/text elements. NOTE(review): findAll presumably searches all descendants, so a parent navpoint may also collect its nested navpoints' labels - confirm.
+        for elem in elems:
+            txt = u''
+            for nl in elem.findAll('navlabel'):
+                for text in nl.findAll('text'):
+                    txt += ''.join([unicode(s) for s in text.findAll(text=True)])
+            # Skip navpoints that have no content element, no src attribute on it, or no label text.
+            content = elem.find('content')
+            if content is None or not content.has_key('src') or not txt:
+                continue
+            # Unquote and parse src: index 2 of the urlparse result is the path, index 5 is the fragment.
+            purl = urlparse(unquote(content['src']))
+            href, fragment = purl[2], purl[5]
+            if not os.path.isabs(href):
+                href = os.path.join(bdir, href)
+            self.append((href, fragment, txt))
+        
+    
+    def read_html_toc(self, toc, cwd):
        soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
        for a in soup.findAll('a'):
            if not a.has_key('href'):
@ -111,9 +159,6 @@ class TOC(list):
                href = os.path.join(cwd, href)
            txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
            self.append((href, fragment, txt))
-                self.toc = toc
-            except:
-                pass
            

 class standard_field(object):