Fix #2177 (Google ePub TOC)

This commit is contained in:
Kovid Goyal 2009-03-29 15:41:35 -07:00
parent 551e07031a
commit 11d33c0c8b
2 changed files with 37 additions and 36 deletions

View File

@ -508,6 +508,7 @@ class OPF(object):
toc.partition('#')[0], toc.partition('#')[-1] toc.partition('#')[0], toc.partition('#')[-1]
self.toc.read_html_toc(toc) self.toc.read_html_toc(toc)
except: except:
raise
pass pass

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os, glob import os, glob, re
from urlparse import urlparse from urlparse import urlparse
from urllib import unquote from urllib import unquote
@ -10,17 +10,17 @@ from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
class NCXSoup(BeautifulStoneSoup): class NCXSoup(BeautifulStoneSoup):
NESTABLE_TAGS = {'navpoint':[]} NESTABLE_TAGS = {'navpoint':[]}
def __init__(self, raw): def __init__(self, raw):
BeautifulStoneSoup.__init__(self, raw, BeautifulStoneSoup.__init__(self, raw,
convertEntities=BeautifulSoup.HTML_ENTITIES, convertEntities=BeautifulSoup.HTML_ENTITIES,
selfClosingTags=['meta', 'content']) selfClosingTags=['meta', 'content'])
class TOC(list): class TOC(list):
def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=0, def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=0,
base_path=os.getcwd(), type='unknown'): base_path=os.getcwd(), type='unknown'):
self.href = href self.href = href
self.fragment = fragment self.fragment = fragment
@ -31,7 +31,7 @@ class TOC(list):
self.base_path = base_path self.base_path = base_path
self.play_order = play_order self.play_order = play_order
self.type = type self.type = type
def __str__(self): def __str__(self):
lines = ['TOC: %s#%s'%(self.href, self.fragment)] lines = ['TOC: %s#%s'%(self.href, self.fragment)]
for child in self: for child in self:
@ -39,10 +39,10 @@ class TOC(list):
for l in c: for l in c:
lines.append('\t'+l) lines.append('\t'+l)
return '\n'.join(lines) return '\n'.join(lines)
def count(self, type): def count(self, type):
return len([i for i in self.flat() if i.type == type]) return len([i for i in self.flat() if i.type == type])
def purge(self, types, max=0): def purge(self, types, max=0):
remove = [] remove = []
for entry in self.flat(): for entry in self.flat():
@ -54,23 +54,23 @@ class TOC(list):
continue continue
entry.parent.remove(entry) entry.parent.remove(entry)
return remove return remove
def remove(self, entry): def remove(self, entry):
list.remove(self, entry) list.remove(self, entry)
entry.parent = None entry.parent = None
def add_item(self, href, fragment, text, play_order=None, type='unknown'): def add_item(self, href, fragment, text, play_order=None, type='unknown'):
if play_order is None: if play_order is None:
play_order = (self[-1].play_order if len(self) else self.play_order) + 1 play_order = (self[-1].play_order if len(self) else self.play_order) + 1
self.append(TOC(href=href, fragment=fragment, text=text, parent=self, self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
base_path=self.base_path, play_order=play_order, type=type)) base_path=self.base_path, play_order=play_order, type=type))
return self[-1] return self[-1]
def top_level_items(self): def top_level_items(self):
for item in self: for item in self:
if item.text is not None: if item.text is not None:
yield item yield item
def depth(self): def depth(self):
depth = 1 depth = 1
for obj in self: for obj in self:
@ -78,14 +78,14 @@ class TOC(list):
if c > depth - 1: if c > depth - 1:
depth = c + 1 depth = c + 1
return depth return depth
def flat(self): def flat(self):
'Depth first iteration over the tree rooted at self' 'Depth first iteration over the tree rooted at self'
yield self yield self
for obj in self: for obj in self:
for i in obj.flat(): for i in obj.flat():
yield i yield i
@apply @apply
def abspath(): def abspath():
doc='Return the file this toc entry points to as a absolute path to a file on the system.' doc='Return the file this toc entry points to as a absolute path to a file on the system.'
@ -96,9 +96,9 @@ class TOC(list):
if not os.path.isabs(path): if not os.path.isabs(path):
path = os.path.join(self.base_path, path) path = os.path.join(self.base_path, path)
return path return path
return property(fget=fget, doc=doc) return property(fget=fget, doc=doc)
def read_from_opf(self, opfreader): def read_from_opf(self, opfreader):
toc = opfreader.soup.find('spine', toc=True) toc = opfreader.soup.find('spine', toc=True)
if toc is not None: if toc is not None:
@ -111,7 +111,7 @@ class TOC(list):
if 'toc' in item.href().lower(): if 'toc' in item.href().lower():
toc = item.href() toc = item.href()
break break
if toc is not None: if toc is not None:
if toc.lower() not in ('ncx', 'ncxtoc'): if toc.lower() not in ('ncx', 'ncxtoc'):
toc = urlparse(unquote(toc))[2] toc = urlparse(unquote(toc))[2]
@ -123,7 +123,7 @@ class TOC(list):
bn = os.path.basename(toc) bn = os.path.basename(toc)
bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
toc = os.path.join(os.path.dirname(toc), bn) toc = os.path.join(os.path.dirname(toc), bn)
self.read_html_toc(toc) self.read_html_toc(toc)
except: except:
print 'WARNING: Could not read Table of Contents. Continuing anyway.' print 'WARNING: Could not read Table of Contents. Continuing anyway.'
@ -141,43 +141,43 @@ class TOC(list):
if m: if m:
toc = m[0] toc = m[0]
self.read_ncx_toc(toc) self.read_ncx_toc(toc)
def read_ncx_toc(self, toc): def read_ncx_toc(self, toc):
self.base_path = os.path.dirname(toc) self.base_path = os.path.dirname(toc)
soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0]) soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0])
def process_navpoint(np, dest): def process_navpoint(np, dest):
play_order = np.get('playOrder', None) play_order = np.get('playOrder', None)
if play_order is None: if play_order is None:
play_order = int(np.get('playorder', 1)) play_order = int(np.get('playorder', 1))
href = fragment = text = None href = fragment = text = None
nl = np.find('navlabel') nl = np.find(re.compile('navlabel'))
if nl is not None: if nl is not None:
text = u'' text = u''
for txt in nl.findAll('text'): for txt in nl.findAll(re.compile('text')):
text += ''.join([unicode(s) for s in txt.findAll(text=True)]) text += ''.join([unicode(s) for s in txt.findAll(text=True)])
content = np.find('content') content = np.find(re.compile('content'))
if content is None or not content.has_key('src') or not txt: if content is None or not content.has_key('src') or not txt:
return return
purl = urlparse(unquote(content['src'])) purl = urlparse(unquote(content['src']))
href, fragment = purl[2], purl[5] href, fragment = purl[2], purl[5]
nd = dest.add_item(href, fragment, text) nd = dest.add_item(href, fragment, text)
nd.play_order = play_order nd.play_order = play_order
for c in np: for c in np:
if getattr(c, 'name', None) == 'navpoint': if 'navpoint' in getattr(c, 'name', ''):
process_navpoint(c, nd) process_navpoint(c, nd)
nm = soup.find('navmap') nm = soup.find(re.compile('navmap'))
if nm is None: if nm is None:
raise ValueError('NCX files must have a <navmap> element.') raise ValueError('NCX files must have a <navmap> element.')
for elem in nm: for elem in nm:
if getattr(elem, 'name', None) == 'navpoint': if 'navpoint' in getattr(elem, 'name', ''):
process_navpoint(elem, self) process_navpoint(elem, self)
def read_html_toc(self, toc): def read_html_toc(self, toc):
self.base_path = os.path.dirname(toc) self.base_path = os.path.dirname(toc)
soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES) soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
@ -191,13 +191,13 @@ class TOC(list):
else: else:
fragment = fragment.strip() fragment = fragment.strip()
href = href.strip() href = href.strip()
txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)]) txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
add = True add = True
for i in self.flat(): for i in self.flat():
if i.href == href and i.fragment == fragment: if i.href == href and i.fragment == fragment:
add = False add = False
break break
if add: if add:
self.add_item(href, fragment, txt) self.add_item(href, fragment, txt)