Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-08 02:34:06 -04:00)
Fix #2177 (Google ePub TOC)
parent 551e07031a
commit 11d33c0c8b
@@ -508,6 +508,7 @@ class OPF(object):
+                toc.partition('#')[0], toc.partition('#')[-1]
                 self.toc.read_html_toc(toc)
             except:
-                raise
+                pass
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 __license__ = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-import os, glob
+import os, glob, re
 from urlparse import urlparse
 from urllib import unquote
 
@@ -10,17 +10,17 @@ from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
 from calibre.ebooks.chardet import xml_to_unicode
 
 class NCXSoup(BeautifulStoneSoup):
 
     NESTABLE_TAGS = {'navpoint':[]}
 
     def __init__(self, raw):
-        BeautifulStoneSoup.__init__(self, raw,
+        BeautifulStoneSoup.__init__(self, raw,
                 convertEntities=BeautifulSoup.HTML_ENTITIES,
                 selfClosingTags=['meta', 'content'])
 
 class TOC(list):
 
-    def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=0,
+    def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=0,
                  base_path=os.getcwd(), type='unknown'):
         self.href = href
         self.fragment = fragment
@@ -31,7 +31,7 @@ class TOC(list):
         self.base_path = base_path
         self.play_order = play_order
         self.type = type
 
     def __str__(self):
         lines = ['TOC: %s#%s'%(self.href, self.fragment)]
         for child in self:
@@ -39,10 +39,10 @@ class TOC(list):
             for l in c:
                 lines.append('\t'+l)
         return '\n'.join(lines)
 
     def count(self, type):
         return len([i for i in self.flat() if i.type == type])
 
     def purge(self, types, max=0):
         remove = []
         for entry in self.flat():
@@ -54,23 +54,23 @@ class TOC(list):
                 continue
             entry.parent.remove(entry)
         return remove
 
     def remove(self, entry):
         list.remove(self, entry)
         entry.parent = None
 
     def add_item(self, href, fragment, text, play_order=None, type='unknown'):
         if play_order is None:
             play_order = (self[-1].play_order if len(self) else self.play_order) + 1
         self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
                         base_path=self.base_path, play_order=play_order, type=type))
         return self[-1]
 
     def top_level_items(self):
         for item in self:
             if item.text is not None:
                 yield item
 
     def depth(self):
         depth = 1
         for obj in self:
@@ -78,14 +78,14 @@ class TOC(list):
             if c > depth - 1:
                 depth = c + 1
         return depth
 
     def flat(self):
         'Depth first iteration over the tree rooted at self'
         yield self
         for obj in self:
             for i in obj.flat():
                 yield i
 
     @apply
     def abspath():
         doc='Return the file this toc entry points to as a absolute path to a file on the system.'
@@ -96,9 +96,9 @@ class TOC(list):
             if not os.path.isabs(path):
                 path = os.path.join(self.base_path, path)
             return path
 
-        return property(fget=fget, doc=doc)
+        return property(fget=fget, doc=doc)
 
     def read_from_opf(self, opfreader):
         toc = opfreader.soup.find('spine', toc=True)
         if toc is not None:
@@ -111,7 +111,7 @@ class TOC(list):
                     if 'toc' in item.href().lower():
                         toc = item.href()
                         break
 
         if toc is not None:
             if toc.lower() not in ('ncx', 'ncxtoc'):
                 toc = urlparse(unquote(toc))[2]
@@ -123,7 +123,7 @@ class TOC(list):
                     bn  = os.path.basename(toc)
                     bn  = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
                     toc = os.path.join(os.path.dirname(toc), bn)
 
                 self.read_html_toc(toc)
             except:
                 print 'WARNING: Could not read Table of Contents. Continuing anyway.'
@@ -141,43 +141,43 @@ class TOC(list):
             if m:
                 toc = m[0]
                 self.read_ncx_toc(toc)
 
     def read_ncx_toc(self, toc):
         self.base_path = os.path.dirname(toc)
         soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0])
 
         def process_navpoint(np, dest):
             play_order = np.get('playOrder', None)
             if play_order is None:
                 play_order = int(np.get('playorder', 1))
             href = fragment = text = None
-            nl = np.find('navlabel')
+            nl = np.find(re.compile('navlabel'))
             if nl is not None:
                 text = u''
-                for txt in nl.findAll('text'):
+                for txt in nl.findAll(re.compile('text')):
                     text += ''.join([unicode(s) for s in txt.findAll(text=True)])
-            content = np.find('content')
+            content = np.find(re.compile('content'))
             if content is None or not content.has_key('src') or not txt:
                 return
 
             purl = urlparse(unquote(content['src']))
             href, fragment = purl[2], purl[5]
             nd = dest.add_item(href, fragment, text)
             nd.play_order = play_order
 
             for c in np:
-                if getattr(c, 'name', None) == 'navpoint':
+                if 'navpoint' in getattr(c, 'name', ''):
                     process_navpoint(c, nd)
 
-        nm = soup.find('navmap')
+        nm = soup.find(re.compile('navmap'))
         if nm is None:
             raise ValueError('NCX files must have a <navmap> element.')
 
         for elem in nm:
-            if getattr(elem, 'name', None) == 'navpoint':
+            if 'navpoint' in getattr(elem, 'name', ''):
                 process_navpoint(elem, self)
 
 
     def read_html_toc(self, toc):
         self.base_path = os.path.dirname(toc)
         soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
@@ -191,13 +191,13 @@ class TOC(list):
             else:
                 fragment = fragment.strip()
             href = href.strip()
 
             txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
             add = True
             for i in self.flat():
                 if i.href == href and i.fragment == fragment:
                     add = False
-                    break
+                    break
             if add:
                 self.add_item(href, fragment, txt)