PML Output: Generate \CX Tags as chapter anchors. PDB eReader Output: Use \CX tags to generate chapter index.

2025-07-08 02:34:06 -04:00 · 2009-12-15 18:11:36 -05:00 · 2009-12-15 18:11:36 -05:00 · 35fc570d24
commit 35fc570d24
parent 288b64529c
2 changed files with 43 additions and 38 deletions
--- a/src/calibre/ebooks/pdb/ereader/writer.py
+++ b/src/calibre/ebooks/pdb/ereader/writer.py
@@ -42,8 +42,8 @@ class Writer(FormatWriter):
        pml = unicode(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')
        text, text_sizes = self._text(pml)
-        chapter_index = self._chapter_index(pml)
+        chapter_index = self._index_item(r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"', pml)
-        link_index = self._link_index(pml)
+        link_index = self._index_item(r'(?s)\\Q="(?P<text>.+?)"', pml)
        images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs)
        metadata = [self._metadata(metadata)]
        hr = [self._header_record(len(text), len(chapter_index), len(link_index), len(images))]
@@ -101,38 +101,24 @@ class Writer(FormatWriter):
        return pml_pages, text_sizes
-    def _index_item(self, mo):
+    def _index_item(self, regex, pml):
        index = ''
        if 'text' in mo.groupdict().keys():
            index += struct.pack('>L', mo.start())
            text = mo.group('text')
            # Strip all PML tags from text
            text = re.sub(r'\\U[0-9a-z]{4}', '', text)
            text = re.sub(r'\\a\d{3}', '', text)
            text = re.sub(r'\\.', '', text)
            # Add appropriate spacing to denote the various levels of headings
            if 'val' in mo.groupdict().keys():
                text = '%s%s' % (' ' * 4 * int(mo.group('val')), text)
            index += text
            index += '\x00'
        return index
    def _chapter_index(self, pml):
        chapter_marks = [
            r'(?s)\\x(?P<text>.+?)\\x',
            r'(?s)\\X(?P<val>[0-4])(?P<text>.*?)\\X[0-4]',
            r'(?s)\\C(?P<val>\d)="(?P<text>.+?)"',
        ]
        index = []
-        for chapter_mark in chapter_marks:
+        for mo in re.finditer(regex, pml):
-            for mo in re.finditer(chapter_mark, pml):
+            item = ''
-                index.append(self._index_item(mo))
+            if 'text' in mo.groupdict().keys():
-        return index
+                item += struct.pack('>L', mo.start())
-
+                text = mo.group('text')
-    def _link_index(self, pml):
+                # Strip all PML tags from text
-        index = []
+                text = re.sub(r'\\U[0-9a-z]{4}', '', text)
-        for mo in re.finditer(r'(?s)\\Q="(?P<text>.+?)"', pml):
+                text = re.sub(r'\\a\d{3}', '', text)
-            index.append(self._index_item(mo))
+                text = re.sub(r'\\.', '', text)
                # Add appropriate spacing to denote the various levels of headings
                if 'val' in mo.groupdict().keys():
                    text = '%s%s' % (' ' * 4 * int(mo.group('val')), text)
                item += text
                item += '\x00'
            if item:
                index.append(item)
        return index
    def _images(self, manifest, image_hrefs):
--- a/src/calibre/ebooks/pml/pmlml.py
+++ b/src/calibre/ebooks/pml/pmlml.py
@@ -79,6 +79,16 @@ class PMLMLizer(object):
        self.log.info('Converting XHTML to PML markup...')
        self.oeb_book = oeb_book
        self.opts = opts
        # This is used for adding \CX tags chapter markers. This is separate
        # from the optional inline toc.
        self.toc = {}
        for item in oeb_book.toc:
            page, mid, id = item.href.partition('#')
            if not self.toc.get(page, None):
                self.toc[page] = {}
            self.toc[page][id] = item.title
        return self.pmlmlize_spine()
    def pmlmlize_spine(self):
@@ -107,7 +117,11 @@ class PMLMLizer(object):
        return output
    def get_toc(self):
-        toc = [u'']
+        '''
        Generation of inline TOC
        '''
        toc = []
        if self.opts.inline_toc:
            self.log.debug('Generating table of contents...')
            toc.append(u'\\X0%s\\X0\n\n' % _('Table of Contents:'))
@@ -177,14 +191,14 @@ class PMLMLizer(object):
    def dump_text(self, elem, stylizer, page, tag_stack=[]):
        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
-            return [u'']
+            return []
-        text = [u'']
+        text = []
        style = stylizer.style(elem)
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
-            return [u'']
+            return []
        tag = barename(elem.tag)
        tag_count = 0
@@ -213,6 +227,12 @@ class PMLMLizer(object):
            else:
                w += '="50%"'
            text.append(w)
        toc_id = elem.attrib.get('id', None)
        if toc_id:
            if self.toc.get(page.href, None):
                toc_title = self.toc[page.href].get(toc_id, None)
                if toc_title:
                    text.append('\\C1="%s"' % toc_title)
        # Process style information that needs holds a single tag
        # Commented out because every page in an OEB book starts with this style
@@ -287,4 +307,3 @@ class PMLMLizer(object):
            if tag != 'block':
                text.append('\\%s' % tag)
        return text