diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py index 0e6bf0b4bf..952978fc36 100644 --- a/src/calibre/ebooks/metadata/opf2.py +++ b/src/calibre/ebooks/metadata/opf2.py @@ -508,6 +508,7 @@ class OPF(object): toc.partition('#')[0], toc.partition('#')[-1] self.toc.read_html_toc(toc) except: + raise pass diff --git a/src/calibre/ebooks/metadata/toc.py b/src/calibre/ebooks/metadata/toc.py index e4f9161ebb..839c966271 100644 --- a/src/calibre/ebooks/metadata/toc.py +++ b/src/calibre/ebooks/metadata/toc.py @@ -1,7 +1,7 @@ #!/usr/bin/env python __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal ' -import os, glob +import os, glob, re from urlparse import urlparse from urllib import unquote @@ -10,17 +10,17 @@ from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup from calibre.ebooks.chardet import xml_to_unicode class NCXSoup(BeautifulStoneSoup): - + NESTABLE_TAGS = {'navpoint':[]} - + def __init__(self, raw): - BeautifulStoneSoup.__init__(self, raw, + BeautifulStoneSoup.__init__(self, raw, convertEntities=BeautifulSoup.HTML_ENTITIES, selfClosingTags=['meta', 'content']) class TOC(list): - - def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=0, + + def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=0, base_path=os.getcwd(), type='unknown'): self.href = href self.fragment = fragment @@ -31,7 +31,7 @@ class TOC(list): self.base_path = base_path self.play_order = play_order self.type = type - + def __str__(self): lines = ['TOC: %s#%s'%(self.href, self.fragment)] for child in self: @@ -39,10 +39,10 @@ class TOC(list): for l in c: lines.append('\t'+l) return '\n'.join(lines) - + def count(self, type): return len([i for i in self.flat() if i.type == type]) - + def purge(self, types, max=0): remove = [] for entry in self.flat(): @@ -54,23 +54,23 @@ class TOC(list): continue entry.parent.remove(entry) return remove - + def remove(self, entry): list.remove(self, entry) entry.parent = None - + def add_item(self, href, fragment, text, play_order=None, type='unknown'): if play_order is None: play_order = (self[-1].play_order if len(self) else self.play_order) + 1 self.append(TOC(href=href, fragment=fragment, text=text, parent=self, base_path=self.base_path, play_order=play_order, type=type)) return self[-1] - + def top_level_items(self): for item in self: if item.text is not None: yield item - + def depth(self): depth = 1 for obj in self: @@ -78,14 +78,14 @@ class TOC(list): if c > depth - 1: depth = c + 1 return depth - + def flat(self): 'Depth first iteration over the tree rooted at self' yield self for obj in self: for i in obj.flat(): yield i - + @apply def abspath(): doc='Return the file this toc entry points to as a absolute path to a file on the system.' @@ -96,9 +96,9 @@ class TOC(list): if not os.path.isabs(path): path = os.path.join(self.base_path, path) return path - - return property(fget=fget, doc=doc) - + + return property(fget=fget, doc=doc) + def read_from_opf(self, opfreader): toc = opfreader.soup.find('spine', toc=True) if toc is not None: @@ -111,7 +111,7 @@ class TOC(list): if 'toc' in item.href().lower(): toc = item.href() break - + if toc is not None: if toc.lower() not in ('ncx', 'ncxtoc'): toc = urlparse(unquote(toc))[2] @@ -123,7 +123,7 @@ class TOC(list): bn = os.path.basename(toc) bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files toc = os.path.join(os.path.dirname(toc), bn) - + self.read_html_toc(toc) except: print 'WARNING: Could not read Table of Contents. Continuing anyway.' @@ -141,43 +141,43 @@ class TOC(list): if m: toc = m[0] self.read_ncx_toc(toc) - + def read_ncx_toc(self, toc): self.base_path = os.path.dirname(toc) soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0]) - + def process_navpoint(np, dest): play_order = np.get('playOrder', None) if play_order is None: play_order = int(np.get('playorder', 1)) href = fragment = text = None - nl = np.find('navlabel') + nl = np.find(re.compile('navlabel')) if nl is not None: text = u'' - for txt in nl.findAll('text'): + for txt in nl.findAll(re.compile('text')): text += ''.join([unicode(s) for s in txt.findAll(text=True)]) - content = np.find('content') + content = np.find(re.compile('content')) if content is None or not content.has_key('src') or not txt: return - + purl = urlparse(unquote(content['src'])) href, fragment = purl[2], purl[5] nd = dest.add_item(href, fragment, text) nd.play_order = play_order - + for c in np: - if getattr(c, 'name', None) == 'navpoint': + if 'navpoint' in getattr(c, 'name', ''): process_navpoint(c, nd) - - nm = soup.find('navmap') + + nm = soup.find(re.compile('navmap')) if nm is None: raise ValueError('NCX files must have a element.') - + for elem in nm: - if getattr(elem, 'name', None) == 'navpoint': + if 'navpoint' in getattr(elem, 'name', ''): process_navpoint(elem, self) - - + + def read_html_toc(self, toc): self.base_path = os.path.dirname(toc) soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES) @@ -191,13 +191,13 @@ class TOC(list): else: fragment = fragment.strip() href = href.strip() - + txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)]) add = True for i in self.flat(): if i.href == href and i.fragment == fragment: add = False - break + break if add: self.add_item(href, fragment, txt)