2009-05-31 13:11:16 -07:00

175 lines
7.1 KiB
Python

#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from lxml import etree
from urlparse import urlparse
from calibre.ebooks.oeb.base import XPNSMAP, TOC, XHTML
from calibre.ebooks import ConversionError
def XPath(x):
try:
return etree.XPath(x, namespaces=XPNSMAP)
except etree.XPathSyntaxError:
raise ConversionError(
'The syntax of the XPath expression %s is invalid.' % repr(x))
class DetectStructure(object):
def __call__(self, oeb, opts):
self.log = oeb.log
self.oeb = oeb
self.opts = opts
self.log('Detecting structure...')
self.detect_chapters()
if self.oeb.auto_generated_toc or opts.use_auto_toc:
orig_toc = self.oeb.toc
self.oeb.toc = TOC()
self.create_level_based_toc()
if self.oeb.toc.count() < 1:
if not opts.no_chapters_in_toc and self.detected_chapters:
self.create_toc_from_chapters()
if self.oeb.toc.count() < opts.toc_threshold:
self.create_toc_from_links()
if self.oeb.toc.count() < 2 and orig_toc.count() > 2:
self.oeb.toc = orig_toc
else:
self.oeb.auto_generated_toc = True
self.log('Auto generated TOC with %d entries.' %
self.oeb.toc.count())
if opts.toc_filter is not None:
regexp = re.compile(opts.toc_filter)
for node in self.oeb.toc.iter():
if not node.title or regexp.search(node.title) is not None:
self.oeb.toc.remove(node)
if opts.page_breaks_before is not None:
pb_xpath = XPath(opts.page_breaks_before)
for item in oeb.spine:
for elem in pb_xpath(item.data):
style = elem.get('style', '')
if style:
style += '; '
elem.set('style', style+'page-break-before:always')
def detect_chapters(self):
self.detected_chapters = []
if self.opts.chapter:
chapter_xpath = XPath(self.opts.chapter)
for item in self.oeb.spine:
for x in chapter_xpath(item.data):
self.detected_chapters.append((item, x))
chapter_mark = self.opts.chapter_mark
page_break_before = 'display: block; page-break-before: always'
page_break_after = 'display: block; page-break-after: always'
for item, elem in self.detected_chapters:
text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')])
self.log('\tDetected chapter:', text[:50])
if chapter_mark == 'none':
continue
elif chapter_mark == 'rule':
mark = etree.Element(XHTML('hr'))
elif chapter_mark == 'pagebreak':
mark = etree.Element(XHTML('div'), style=page_break_after)
else: # chapter_mark == 'both':
mark = etree.Element(XHTML('hr'), style=page_break_before)
elem.addprevious(mark)
def create_level_based_toc(self):
if self.opts.level1_toc is None:
return
for item in self.oeb.spine:
self.add_leveled_toc_items(item)
def create_toc_from_chapters(self):
counter = self.oeb.toc.next_play_order()
for item, elem in self.detected_chapters:
text, href = self.elem_to_link(item, elem, counter)
self.oeb.toc.add(text, href, play_order=counter)
counter += 1
def create_toc_from_links(self):
for item in self.oeb.spine:
for a in XPath('//h:a[@href]')(item.data):
href = a.get('href')
purl = urlparse(href)
if not purl[0] or purl[0] == 'file':
href, frag = purl.path, purl.fragment
href = item.abshref(href)
if frag:
href = '#'.join((href, frag))
if not self.oeb.toc.has_href(href):
text = u' '.join([t.strip() for t in \
a.xpath('descendant::text()')])
text = text[:100].strip()
if not self.oeb.toc.has_text(text):
self.oeb.toc.add(text, href,
play_order=self.oeb.toc.next_play_order())
def elem_to_link(self, item, elem, counter):
text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')])
text = text[:100].strip()
id = elem.get('id', 'calibre_toc_%d'%counter)
elem.set('id', id)
href = '#'.join((item.href, id))
return text, href
def add_leveled_toc_items(self, item):
level1 = XPath(self.opts.level1_toc)(item.data)
level1_order = []
document = item
counter = 1
if level1:
added = {}
for elem in level1:
text, _href = self.elem_to_link(document, elem, counter)
counter += 1
if text:
node = self.oeb.toc.add(text, _href,
play_order=self.oeb.toc.next_play_order())
level1_order.append(node)
added[elem] = node
#node.add(_('Top'), _href)
if self.opts.level2_toc is not None:
added2 = {}
level2 = list(XPath(self.opts.level2_toc)(item.data))
for elem in level2:
level1 = None
for item in document.data.iterdescendants():
if item in added.keys():
level1 = added[item]
elif item == elem and level1 is not None:
text, _href = self.elem_to_link(document, elem, counter)
counter += 1
if text:
added2[elem] = level1.add(text, _href,
play_order=self.oeb.toc.next_play_order())
if self.opts.level3_toc is not None:
level3 = list(XPath(self.opts.level3_toc)(item.data))
for elem in level3:
level2 = None
for item in document.data.iterdescendants():
if item in added2.keys():
level2 = added2[item]
elif item == elem and level2 is not None:
text, _href = \
self.elem_to_link(document, elem, counter)
counter += 1
if text:
level2.add(text, _href,
play_order=self.oeb.toc.next_play_order())