From 28a6f11a94848617311e36d62f6106fd951137a5 Mon Sep 17 00:00:00 2001
From: "Marshall T. Vandegrift" <llasram@gmail.com>
Date: Fri, 6 Feb 2009 14:33:48 -0500
Subject: [PATCH] Implement adding page-map information to EPUB output.

---
 src/calibre/ebooks/epub/__init__.py  | 10 ++++-
 src/calibre/ebooks/epub/from_html.py |  3 ++
 src/calibre/ebooks/epub/pages.py     | 59 ++++++++++++++++++++++++++++
 src/calibre/ebooks/oeb/base.py       | 17 ++++++--
 4 files changed, 84 insertions(+), 5 deletions(-)
 create mode 100644 src/calibre/ebooks/epub/pages.py

diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py
index 863f2f8db0..aa17024d50 100644
--- a/src/calibre/ebooks/epub/__init__.py
+++ b/src/calibre/ebooks/epub/__init__.py
@@ -153,6 +153,14 @@ help on using this feature.
                      'slow and if your source file contains a very large '
                      'number of page breaks, you should turn off splitting '
                      'on page breaks.'))
+    structure('page', ['--page'], default=None,
+              help=_('XPath expression to detect page boundaries for building '
+                     'a custom pagination map, as used by AdobeDE. Default is '
+                     'not to build an explicit pagination map.'))
+    structure('page_names', ['--page-names'], default=None,
+              help=_('XPath expression to find the name of each page in the '
+                     'pagination map relative to its boundary element. '
+                     'Default is to number all pages starting with 1.'))
     toc = c.add_group('toc', 
         _('''\
 Control the automatic generation of a Table of Contents. If an OPF file is detected
@@ -230,4 +238,4 @@ to auto-generate a Table of Contents.
     c.add_opt('extract_to', ['--extract-to'], group='debug', default=None,
               help=_('Extract the contents of the produced EPUB file to the '
                      'specified directory.'))
-    return c
\ No newline at end of file
+    return c
diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py
index ca50fe7a5d..b8fa3e8fd0 100644
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@@ -46,6 +46,7 @@ from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata.opf2 import OPF
 from calibre.ebooks.epub import initialize_container, PROFILES
 from calibre.ebooks.epub.split import split
+from calibre.ebooks.epub.pages import add_page_map
 from calibre.ebooks.epub.fonts import Rationalizer
 from calibre.constants import preferred_encoding
 from calibre.customize.ui import run_plugins_on_postprocess
@@ -438,6 +439,8 @@ def convert(htmlfile, opts, notification=None, create_epub=True,
             if opts.show_ncx:
                 print toc
         split(opf_path, opts, stylesheet_map)
+        if opts.page:
+            add_page_map(opf_path, opts)
         check_links(opf_path, opts.pretty_print)
         
         opf = OPF(opf_path, tdir)
diff --git a/src/calibre/ebooks/epub/pages.py b/src/calibre/ebooks/epub/pages.py
new file mode 100644
index 0000000000..c1b38b9be1
--- /dev/null
+++ b/src/calibre/ebooks/epub/pages.py
@@ -0,0 +1,59 @@
+'''
+Add page mapping information to an EPUB book.
+'''
+
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
+__docformat__ = 'restructuredtext en'
+
+import os, re
+from itertools import count, chain
+from calibre.ebooks.oeb.base import XHTML, XHTML_NS
+from calibre.ebooks.oeb.base import OEBBook, DirWriter
+from lxml import etree, html
+from lxml.etree import XPath
+
+NSMAP = {'h': XHTML_NS, 'html': XHTML_NS, 'xhtml': XHTML_NS}
+PAGE_RE = re.compile(r'page', re.IGNORECASE)
+ROMAN_RE = re.compile(r'^[ivxlcdm]+$', re.IGNORECASE)
+
+def filter_name(name):
+    '''Reduce a raw page label to its first numeric or roman-numeral word.'''
+    # Strip after dropping "page" so labels like "Page A-1" keep no stray space
+    name = PAGE_RE.sub('', name).strip()
+    for word in name.split():
+        if word.isdigit() or ROMAN_RE.match(word):
+            return word
+    return name
+
+def build_name_for(expr):
+    '''Return a callable mapping a page-boundary element to its page name.'''
+    if expr is None:
+        counter = count(1)
+        return lambda elem: str(counter.next())
+    selector = XPath(expr, namespaces=NSMAP)
+    def name_for(elem):
+        found = selector(elem)
+        # A string-valued XPath yields one string; joining it splits chars
+        words = [found] if isinstance(found, basestring) else found
+        return filter_name(' '.join(words)) if words else ''
+    return name_for
+
+def add_page_map(opfpath, opts):
+    '''Add each element matched by opts.page to the book's pages, then dump.'''
+    book = OEBBook(opfpath)
+    find_pages = XPath(opts.page, namespaces=NSMAP)
+    name_for = build_name_for(opts.page_names)
+    fresh_ids = ("calibre-page-%d" % n for n in count(1))
+    for item in book.spine:
+        for boundary in find_pages(item.data):
+            label = name_for(boundary)
+            anchor = boundary.get('id', None)
+            if anchor is None:
+                # Unanchored boundary: mint an id so the map can link to it
+                anchor = fresh_ids.next()
+                boundary.attrib['id'] = anchor
+            book.pages.add(label, '#'.join((item.href, anchor)))
+    DirWriter(version='2.0', page_map=True).dump(book, opfpath)
diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py
index 778cec54cf..80d4797905 100644
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -246,6 +246,10 @@ class DirWriter(object):
 
     def dump(self, oeb, path):
         version = int(self.version[0])
+        opfname = None
+        if os.path.splitext(path)[1].lower() == '.opf':
+            opfname = os.path.basename(path)
+            path = os.path.dirname(path)
         if not os.path.isdir(path):
             os.mkdir(path)
         output = DirContainer(path)
@@ -257,7 +261,9 @@ class DirWriter(object):
             metadata = oeb.to_opf2(page_map=self.page_map)
         else:
             raise OEBError("Unrecognized OPF version %r" % self.version)
-        for href, data in metadata.values():
+        for mime, (href, data) in metadata.items():
+            if opfname and mime == OPF_MIME:
+                href = opfname
             output.write(href, xml2str(data))
         return
 
@@ -551,9 +557,6 @@ class Manifest(object):
                 for elem in data:
                     nroot.append(elem)
                 data = nroot
-            # Remove any encoding-specifying <meta/> elements
-            for meta in self.META_XP(data):
-                meta.getparent().remove(meta)
             # Ensure has a <head/>
             head = xpath(data, '/h:html/h:head')
             head = head[0] if head else None
@@ -569,6 +572,12 @@ class Manifest(object):
                     'File %r missing <title/> element' % self.href)
                 title = etree.SubElement(head, XHTML('title'))
                 title.text = self.oeb.translate(__('Unknown'))
+            # Remove any encoding-specifying <meta/> elements
+            for meta in self.META_XP(data):
+                meta.getparent().remove(meta)
+            etree.SubElement(head, XHTML('meta'),
+                attrib={'http-equiv': 'Content-Type',
+                        'content': '%s; charset=utf-8' % XHTML_NS})
             # Ensure has a <body/>
             if not xpath(data, '/h:html/h:body'):
                 self.oeb.logger.warn(