mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Fix #2177 (Google ePub TOC)
This commit is contained in:
parent
551e07031a
commit
11d33c0c8b
@ -508,6 +508,7 @@ class OPF(object):
|
|||||||
toc.partition('#')[0], toc.partition('#')[-1]
|
toc.partition('#')[0], toc.partition('#')[-1]
|
||||||
self.toc.read_html_toc(toc)
|
self.toc.read_html_toc(toc)
|
||||||
except:
|
except:
|
||||||
|
raise
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||||
import os, glob
|
import os, glob, re
|
||||||
from urlparse import urlparse
|
from urlparse import urlparse
|
||||||
from urllib import unquote
|
from urllib import unquote
|
||||||
|
|
||||||
@ -10,17 +10,17 @@ from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
|
|||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
|
|
||||||
class NCXSoup(BeautifulStoneSoup):
|
class NCXSoup(BeautifulStoneSoup):
|
||||||
|
|
||||||
NESTABLE_TAGS = {'navpoint':[]}
|
NESTABLE_TAGS = {'navpoint':[]}
|
||||||
|
|
||||||
def __init__(self, raw):
|
def __init__(self, raw):
|
||||||
BeautifulStoneSoup.__init__(self, raw,
|
BeautifulStoneSoup.__init__(self, raw,
|
||||||
convertEntities=BeautifulSoup.HTML_ENTITIES,
|
convertEntities=BeautifulSoup.HTML_ENTITIES,
|
||||||
selfClosingTags=['meta', 'content'])
|
selfClosingTags=['meta', 'content'])
|
||||||
|
|
||||||
class TOC(list):
|
class TOC(list):
|
||||||
|
|
||||||
def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=0,
|
def __init__(self, href=None, fragment=None, text=None, parent=None, play_order=0,
|
||||||
base_path=os.getcwd(), type='unknown'):
|
base_path=os.getcwd(), type='unknown'):
|
||||||
self.href = href
|
self.href = href
|
||||||
self.fragment = fragment
|
self.fragment = fragment
|
||||||
@ -31,7 +31,7 @@ class TOC(list):
|
|||||||
self.base_path = base_path
|
self.base_path = base_path
|
||||||
self.play_order = play_order
|
self.play_order = play_order
|
||||||
self.type = type
|
self.type = type
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
lines = ['TOC: %s#%s'%(self.href, self.fragment)]
|
lines = ['TOC: %s#%s'%(self.href, self.fragment)]
|
||||||
for child in self:
|
for child in self:
|
||||||
@ -39,10 +39,10 @@ class TOC(list):
|
|||||||
for l in c:
|
for l in c:
|
||||||
lines.append('\t'+l)
|
lines.append('\t'+l)
|
||||||
return '\n'.join(lines)
|
return '\n'.join(lines)
|
||||||
|
|
||||||
def count(self, type):
|
def count(self, type):
|
||||||
return len([i for i in self.flat() if i.type == type])
|
return len([i for i in self.flat() if i.type == type])
|
||||||
|
|
||||||
def purge(self, types, max=0):
|
def purge(self, types, max=0):
|
||||||
remove = []
|
remove = []
|
||||||
for entry in self.flat():
|
for entry in self.flat():
|
||||||
@ -54,23 +54,23 @@ class TOC(list):
|
|||||||
continue
|
continue
|
||||||
entry.parent.remove(entry)
|
entry.parent.remove(entry)
|
||||||
return remove
|
return remove
|
||||||
|
|
||||||
def remove(self, entry):
|
def remove(self, entry):
|
||||||
list.remove(self, entry)
|
list.remove(self, entry)
|
||||||
entry.parent = None
|
entry.parent = None
|
||||||
|
|
||||||
def add_item(self, href, fragment, text, play_order=None, type='unknown'):
|
def add_item(self, href, fragment, text, play_order=None, type='unknown'):
|
||||||
if play_order is None:
|
if play_order is None:
|
||||||
play_order = (self[-1].play_order if len(self) else self.play_order) + 1
|
play_order = (self[-1].play_order if len(self) else self.play_order) + 1
|
||||||
self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
|
self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
|
||||||
base_path=self.base_path, play_order=play_order, type=type))
|
base_path=self.base_path, play_order=play_order, type=type))
|
||||||
return self[-1]
|
return self[-1]
|
||||||
|
|
||||||
def top_level_items(self):
|
def top_level_items(self):
|
||||||
for item in self:
|
for item in self:
|
||||||
if item.text is not None:
|
if item.text is not None:
|
||||||
yield item
|
yield item
|
||||||
|
|
||||||
def depth(self):
|
def depth(self):
|
||||||
depth = 1
|
depth = 1
|
||||||
for obj in self:
|
for obj in self:
|
||||||
@ -78,14 +78,14 @@ class TOC(list):
|
|||||||
if c > depth - 1:
|
if c > depth - 1:
|
||||||
depth = c + 1
|
depth = c + 1
|
||||||
return depth
|
return depth
|
||||||
|
|
||||||
def flat(self):
|
def flat(self):
|
||||||
'Depth first iteration over the tree rooted at self'
|
'Depth first iteration over the tree rooted at self'
|
||||||
yield self
|
yield self
|
||||||
for obj in self:
|
for obj in self:
|
||||||
for i in obj.flat():
|
for i in obj.flat():
|
||||||
yield i
|
yield i
|
||||||
|
|
||||||
@apply
|
@apply
|
||||||
def abspath():
|
def abspath():
|
||||||
doc='Return the file this toc entry points to as a absolute path to a file on the system.'
|
doc='Return the file this toc entry points to as a absolute path to a file on the system.'
|
||||||
@ -96,9 +96,9 @@ class TOC(list):
|
|||||||
if not os.path.isabs(path):
|
if not os.path.isabs(path):
|
||||||
path = os.path.join(self.base_path, path)
|
path = os.path.join(self.base_path, path)
|
||||||
return path
|
return path
|
||||||
|
|
||||||
return property(fget=fget, doc=doc)
|
return property(fget=fget, doc=doc)
|
||||||
|
|
||||||
def read_from_opf(self, opfreader):
|
def read_from_opf(self, opfreader):
|
||||||
toc = opfreader.soup.find('spine', toc=True)
|
toc = opfreader.soup.find('spine', toc=True)
|
||||||
if toc is not None:
|
if toc is not None:
|
||||||
@ -111,7 +111,7 @@ class TOC(list):
|
|||||||
if 'toc' in item.href().lower():
|
if 'toc' in item.href().lower():
|
||||||
toc = item.href()
|
toc = item.href()
|
||||||
break
|
break
|
||||||
|
|
||||||
if toc is not None:
|
if toc is not None:
|
||||||
if toc.lower() not in ('ncx', 'ncxtoc'):
|
if toc.lower() not in ('ncx', 'ncxtoc'):
|
||||||
toc = urlparse(unquote(toc))[2]
|
toc = urlparse(unquote(toc))[2]
|
||||||
@ -123,7 +123,7 @@ class TOC(list):
|
|||||||
bn = os.path.basename(toc)
|
bn = os.path.basename(toc)
|
||||||
bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
|
bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files
|
||||||
toc = os.path.join(os.path.dirname(toc), bn)
|
toc = os.path.join(os.path.dirname(toc), bn)
|
||||||
|
|
||||||
self.read_html_toc(toc)
|
self.read_html_toc(toc)
|
||||||
except:
|
except:
|
||||||
print 'WARNING: Could not read Table of Contents. Continuing anyway.'
|
print 'WARNING: Could not read Table of Contents. Continuing anyway.'
|
||||||
@ -141,43 +141,43 @@ class TOC(list):
|
|||||||
if m:
|
if m:
|
||||||
toc = m[0]
|
toc = m[0]
|
||||||
self.read_ncx_toc(toc)
|
self.read_ncx_toc(toc)
|
||||||
|
|
||||||
def read_ncx_toc(self, toc):
|
def read_ncx_toc(self, toc):
|
||||||
self.base_path = os.path.dirname(toc)
|
self.base_path = os.path.dirname(toc)
|
||||||
soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0])
|
soup = NCXSoup(xml_to_unicode(open(toc, 'rb').read())[0])
|
||||||
|
|
||||||
def process_navpoint(np, dest):
|
def process_navpoint(np, dest):
|
||||||
play_order = np.get('playOrder', None)
|
play_order = np.get('playOrder', None)
|
||||||
if play_order is None:
|
if play_order is None:
|
||||||
play_order = int(np.get('playorder', 1))
|
play_order = int(np.get('playorder', 1))
|
||||||
href = fragment = text = None
|
href = fragment = text = None
|
||||||
nl = np.find('navlabel')
|
nl = np.find(re.compile('navlabel'))
|
||||||
if nl is not None:
|
if nl is not None:
|
||||||
text = u''
|
text = u''
|
||||||
for txt in nl.findAll('text'):
|
for txt in nl.findAll(re.compile('text')):
|
||||||
text += ''.join([unicode(s) for s in txt.findAll(text=True)])
|
text += ''.join([unicode(s) for s in txt.findAll(text=True)])
|
||||||
content = np.find('content')
|
content = np.find(re.compile('content'))
|
||||||
if content is None or not content.has_key('src') or not txt:
|
if content is None or not content.has_key('src') or not txt:
|
||||||
return
|
return
|
||||||
|
|
||||||
purl = urlparse(unquote(content['src']))
|
purl = urlparse(unquote(content['src']))
|
||||||
href, fragment = purl[2], purl[5]
|
href, fragment = purl[2], purl[5]
|
||||||
nd = dest.add_item(href, fragment, text)
|
nd = dest.add_item(href, fragment, text)
|
||||||
nd.play_order = play_order
|
nd.play_order = play_order
|
||||||
|
|
||||||
for c in np:
|
for c in np:
|
||||||
if getattr(c, 'name', None) == 'navpoint':
|
if 'navpoint' in getattr(c, 'name', ''):
|
||||||
process_navpoint(c, nd)
|
process_navpoint(c, nd)
|
||||||
|
|
||||||
nm = soup.find('navmap')
|
nm = soup.find(re.compile('navmap'))
|
||||||
if nm is None:
|
if nm is None:
|
||||||
raise ValueError('NCX files must have a <navmap> element.')
|
raise ValueError('NCX files must have a <navmap> element.')
|
||||||
|
|
||||||
for elem in nm:
|
for elem in nm:
|
||||||
if getattr(elem, 'name', None) == 'navpoint':
|
if 'navpoint' in getattr(elem, 'name', ''):
|
||||||
process_navpoint(elem, self)
|
process_navpoint(elem, self)
|
||||||
|
|
||||||
|
|
||||||
def read_html_toc(self, toc):
|
def read_html_toc(self, toc):
|
||||||
self.base_path = os.path.dirname(toc)
|
self.base_path = os.path.dirname(toc)
|
||||||
soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
|
soup = BeautifulSoup(open(toc, 'rb').read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
|
||||||
@ -191,13 +191,13 @@ class TOC(list):
|
|||||||
else:
|
else:
|
||||||
fragment = fragment.strip()
|
fragment = fragment.strip()
|
||||||
href = href.strip()
|
href = href.strip()
|
||||||
|
|
||||||
txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
|
txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
|
||||||
add = True
|
add = True
|
||||||
for i in self.flat():
|
for i in self.flat():
|
||||||
if i.href == href and i.fragment == fragment:
|
if i.href == href and i.fragment == fragment:
|
||||||
add = False
|
add = False
|
||||||
break
|
break
|
||||||
if add:
|
if add:
|
||||||
self.add_item(href, fragment, txt)
|
self.add_item(href, fragment, txt)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user