Implement adding page-map information to EPUB output.

2025-08-30 23:00:21 -04:00 · 2009-02-06 14:33:48 -05:00 · 2009-02-06 14:33:48 -05:00 · 28a6f11a94
commit 28a6f11a94
parent 9929ba8eeb
4 changed files with 84 additions and 5 deletions
--- a/src/calibre/ebooks/epub/init.py
+++ b/src/calibre/ebooks/epub/init.py
@ -153,6 +153,14 @@ help on using this feature.
                     'slow and if your source file contains a very large '
                     'number of page breaks, you should turn off splitting '
                     'on page breaks.'))
    structure('page', ['--page'], default=None,
              help=_('XPath expression to detect page boundaries for building '
                     'a custom pagination map, as used by AdobeDE. Default is '
                     'not to build an explicit pagination map.'))
    structure('page_names', ['--page-names'], default=None,
              help=_('XPath expression to find the name of each page in the '
                     'pagination map relative to its boundary element. '
                     'Default is to number all pages staring with 1.'))
    toc = c.add_group('toc', 
        _('''\
 Control the automatic generation of a Table of Contents. If an OPF file is detected
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@ -46,6 +46,7 @@ from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata.opf2 import OPF
 from calibre.ebooks.epub import initialize_container, PROFILES
 from calibre.ebooks.epub.split import split
 from calibre.ebooks.epub.pages import add_page_map
 from calibre.ebooks.epub.fonts import Rationalizer
 from calibre.constants import preferred_encoding
 from calibre.customize.ui import run_plugins_on_postprocess
@ -438,6 +439,8 @@ def convert(htmlfile, opts, notification=None, create_epub=True,
            if opts.show_ncx:
                print toc
        split(opf_path, opts, stylesheet_map)
        if opts.page:
            add_page_map(opf_path, opts)
        check_links(opf_path, opts.pretty_print)
        opf = OPF(opf_path, tdir)
--- a/src/calibre/ebooks/epub/pages.py
+++ b/src/calibre/ebooks/epub/pages.py
@ -0,0 +1,59 @@
 '''
 Add page mapping information to an EPUB book.
 '''
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
 __docformat__ = 'restructuredtext en'
 import os, re
 from itertools import count, chain
 from calibre.ebooks.oeb.base import XHTML, XHTML_NS
 from calibre.ebooks.oeb.base import OEBBook, DirWriter
 from lxml import etree, html
 from lxml.etree import XPath
 NSMAP = {'h': XHTML_NS, 'html': XHTML_NS, 'xhtml': XHTML_NS}
 # Matches the literal word "page" anywhere in a name, in any letter case.
 PAGE_RE = re.compile(r'page', re.IGNORECASE)
 # Matches a whole word built only from Roman-numeral letters.  Note that
 # this also accepts ordinary words made of those letters (e.g. "mix").
 ROMAN_RE = re.compile(r'^[ivxlcdm]+$', re.IGNORECASE)
 def filter_name(name):
    '''
    Reduce raw page-name text to a bare page label.

    Every occurrence of "page" (case-insensitive) is removed, then the
    first whitespace-separated word that is all digits or looks like a
    Roman numeral is returned.  If no such word exists, the remaining
    text is returned with surrounding whitespace removed.

    :param name: Text extracted for a page boundary.
    :return: The cleaned-up page name (possibly empty).
    '''
    # Strip *after* removing "page": removing it first would leave the
    # whitespace that separated it from the rest of the name behind.
    name = PAGE_RE.sub('', name).strip()
    for word in name.split():
        if word.isdigit() or ROMAN_RE.match(word):
            return word
    return name
 def build_name_for(expr):
    '''
    Build the function used to name each page of the page map.

    :param expr: XPath expression evaluated against each page-boundary
        element to find that page's name, or ``None`` to number the pages
        sequentially starting with 1.
    :return: A callable taking a boundary element and returning the page
        name as a string.  With an expression, the name is the matched
        text joined with spaces and passed through ``filter_name``; it is
        the empty string when the expression matches nothing.
    '''
    if expr is None:
        numbers = count(1)
        def numbered(elem):
            # Pull exactly one value from the endless counter.  Going
            # through the iterator protocol this way avoids the
            # Python-2-only ``.next()`` method.
            for number in numbers:
                return str(number)
        return numbered
    selector = XPath(expr, namespaces=NSMAP)
    def name_for(elem):
        results = selector(elem)
        if not results:
            return ''
        if not isinstance(results, list):
            # An expression such as string(.) evaluates to one string
            # rather than a list; joining it directly would put a space
            # between every character.
            results = [results]
        return filter_name(' '.join(results))
    return name_for
 def add_page_map(opfpath, opts):
    '''
    Record a page map in the book at ``opfpath`` and write the book back.

    Every element matched by the ``opts.page`` XPath expression in each
    spine item becomes one page.  The page is named via
    ``opts.page_names`` and points at the element's ``id``; elements
    without an ``id`` are given a generated ``calibre-page-N`` one.

    :param opfpath: Path of the OPF file to load and to write back to.
    :param opts: Options object providing ``page`` and ``page_names``.
    '''
    book = OEBBook(opfpath)
    find_pages = XPath(opts.page, namespaces=NSMAP)
    page_name = build_name_for(opts.page_names)
    # One running counter for the whole book, so generated ids never
    # repeat across spine items.
    serial = count(1)
    for item in book.spine:
        for boundary in find_pages(item.data):
            label = page_name(boundary)
            anchor = boundary.get('id', None)
            if anchor is None:
                anchor = "calibre-page-%d" % serial.next()
                boundary.attrib['id'] = anchor
            book.pages.add(label, '#'.join([item.href, anchor]))
    DirWriter(version='2.0', page_map=True).dump(book, opfpath)
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -246,6 +246,10 @@ class DirWriter(object):
    def dump(self, oeb, path):
        version = int(self.version[0])
        opfname = None
        if os.path.splitext(path)[1].lower() == '.opf':
            opfname = os.path.basename(path)
            path = os.path.dirname(path)
        if not os.path.isdir(path):
            os.mkdir(path)
        output = DirContainer(path)
@ -257,7 +261,9 @@ class DirWriter(object):
            metadata = oeb.to_opf2(page_map=self.page_map)
        else:
            raise OEBError("Unrecognized OPF version %r" % self.version)
-        for href, data in metadata.values():
+        for mime, (href, data) in metadata.items():
            if opfname and mime == OPF_MIME:
                href = opfname
            output.write(href, xml2str(data))
        return
@ -551,9 +557,6 @@ class Manifest(object):
                for elem in data:
                    nroot.append(elem)
                data = nroot
            # Remove any encoding-specifying <meta/> elements
            for meta in self.META_XP(data):
                meta.getparent().remove(meta)
            # Ensure has a <head/>
            head = xpath(data, '/h:html/h:head')
            head = head[0] if head else None
@ -569,6 +572,12 @@ class Manifest(object):
                    'File %r missing <title/> element' % self.href)
                title = etree.SubElement(head, XHTML('title'))
                title.text = self.oeb.translate(__('Unknown'))
            # Remove any encoding-specifying <meta/> elements
            for meta in self.META_XP(data):
                meta.getparent().remove(meta)
            etree.SubElement(head, XHTML('meta'),
                attrib={'http-equiv': 'Content-Type',
                        'content': '%s; charset=utf-8' % XHTML_NS})
            # Ensure has a <body/>
            if not xpath(data, '/h:html/h:body'):
                self.oeb.logger.warn(