KF8 Output: Implement the chunking algorithm. Needs review and testing
This commit is contained in:
parent 5c0cd6e070
commit de700eb326
@@ -172,6 +172,7 @@ class MOBIOutput(OutputFormatPlugin):
        kf8 = self.create_kf8(resources) if create_kf8 else None

        self.log('Creating MOBI 6 output')
        self.write_mobi(input_plugin, output_path, kf8, resources)

    def create_kf8(self, resources):
@@ -19,6 +19,7 @@ from calibre.ebooks.mobi.utils import to_base
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
        extract, XHTML, urlnormalize)
from calibre.ebooks.oeb.parse_utils import barename
from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags

XML_DOCS = OEB_DOCS | {SVG_MIME}
@@ -28,20 +29,11 @@ to_ref = partial(to_base, base=32, min_num_digits=4)
# References in links are stored with 10 digits
to_href = partial(to_base, base=32, min_num_digits=10)

# Tags to which kindlegen adds the aid attribute
aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
    'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details',
    'dfn', 'div', 'dl', 'dt', 'em', 'fieldset', 'figcaption', 'figure', 'footer',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'i', 'ins', 'kbd',
    'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p',
    'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small',
    'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var',
    'video'}

class KF8Writer(object):

    def __init__(self, oeb, opts, resources):
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.log.info('Creating KF8 output')
        self.used_images = set()
        self.resources = resources
        self.dup_data()
@@ -52,6 +44,7 @@ class KF8Writer(object):
        self.extract_svg_into_flows()
        self.replace_internal_links_with_placeholders()
        self.insert_aid_attributes()
        self.chunk_it_up()

    def dup_data(self):
        ''' Duplicate data so that any changes we make to markup/CSS only
@@ -144,6 +137,7 @@ class KF8Writer(object):
                continue
            repl = etree.Element(XHTML('link'), type='text/css',
                    rel='stylesheet')
            repl.tail = '\n'
            p.insert(idx, repl)
            extract(tag)
            inlines[raw].append(repl)
@@ -204,3 +198,8 @@ class KF8Writer(object):

            j += 1

    def chunk_it_up(self):
        chunker = Chunker(self.oeb, self.data)
        chunker
src/calibre/ebooks/mobi/writer8/skeleton.py (new file, 232 lines)
@@ -0,0 +1,232 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re
from collections import namedtuple

from lxml import etree

from calibre.ebooks.oeb.base import XHTML_NS

CHUNK_SIZE = 8192

# Tags to which kindlegen adds the aid attribute
aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
    'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details',
    'dfn', 'div', 'dl', 'dt', 'em', 'fieldset', 'figcaption', 'figure', 'footer',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'i', 'ins', 'kbd',
    'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p',
    'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small',
    'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var',
    'video'}

_self_closing_pat = re.compile(bytes(
    r'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>'%('|'.join(aid_able_tags))),
    re.IGNORECASE)

def close_self_closing_tags(raw):
    return _self_closing_pat.sub(br'<\g<tag>\g<arg>></\g<tag>>', raw)
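# Illustrative example (not part of the original commit): the substitution
# above expands self-closing forms of aid-able tags into explicit
# open/close pairs, so that a chunk boundary can always fall between a
# tag's opening and closing markup:
#
#   >>> close_self_closing_tags(b'<div aid="1A"/>')
#   '<div aid="1A"></div>'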
def path_to_node(node):
    ans = []
    parent = node.getparent()
    while parent is not None:
        ans.append(parent.index(node))
        node = parent
        parent = parent.getparent()
    return tuple(reversed(ans))

def node_from_path(root, path):
    parent = root
    for idx in path:
        parent = parent[idx]
    return parent
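# Illustrative round trip (assumed, not in the commit): these two helpers
# map a node to its index path and back, which lets the namespace-free
# clone built in Chunker.remove_namespaces() locate the parent that
# corresponds to a node of the original tree:
#
#   >>> root = etree.fromstring('<html><body><p>x</p></body></html>')
#   >>> p = root[0][0]
#   >>> path_to_node(p)
#   (0, 0)
#   >>> node_from_path(root, (0, 0)) is p
#   True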
class Chunk(object):

    def __init__(self, raw):
        self.raw = raw
        self.starts_tags = []
        self.ends_tags = []
        self.insert_pos = None

    def __len__(self):
        return len(self.raw)

    def merge(self, chunk):
        self.raw += chunk.raw
        self.ends_tags = chunk.ends_tags
class Skeleton(object):

    def __init__(self, file_number, item, root, chunks):
        self.file_number, self.item = file_number, item
        self.chunks = chunks

        self.skeleton = self.render(root)
        self.body_offset = self.skeleton.find('<body')
        self.calculate_metrics(root)

        self.calculate_insert_positions()

    def render(self, root):
        raw = etree.tostring(root, encoding='UTF-8', xml_declaration=True)
        raw = raw.replace('<html', '<html xmlns="%s"'%XHTML_NS, 1)
        return raw
    def calculate_metrics(self, root):
        Metric = namedtuple('Metric', 'start end')
        self.metrics = {}
        for tag in root.xpath('//*[@aid]'):
            text = (tag.text or '').encode('utf-8')
            raw = etree.tostring(tag, encoding='UTF-8', with_tail=True,
                    xml_declaration=False)
            start_length = len(raw.partition(b'>')[0]) + len(text) + 1
            end_length = len(raw.rpartition(b'<')[-1]) + 1
            self.metrics[tag.get('aid')] = Metric(start_length, end_length)
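    # Worked example (illustrative assumption): for a tag serialized as
    # <p aid="12">Hello</p>, raw.partition(b'>')[0] is '<p aid="12"' (11
    # bytes), so start = 11 + len('Hello') + 1 = 17, the offset just past
    # the opening tag and its text; raw.rpartition(b'<')[-1] is '/p>', so
    # end = 3 + 1 = 4, the length of the closing tag.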
    def calculate_insert_positions(self):
        pos = self.body_offset
        for chunk in self.chunks:
            for tag in chunk.starts_tags:
                pos += self.metrics[tag].start
            chunk.insert_pos = pos
            pos += len(chunk)
            for tag in chunk.ends_tags:
                pos += self.metrics[tag].end
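    # Sketch of the arithmetic (assumed reading of the code above): pos
    # walks the rendered output from the <body> offset; each aid tag a
    # chunk starts contributes its opening-tag-plus-text length, the chunk
    # itself then occupies len(chunk) bytes at insert_pos, and each aid
    # tag it ends contributes its closing-tag length. insert_pos is thus
    # the byte offset at which the chunk's raw markup is spliced back into
    # the skeleton.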
class Chunker(object):

    def __init__(self, oeb, data_func):
        self.oeb, self.log = oeb, oeb.log
        self.data = data_func

        self.skeletons = []

        for i, item in enumerate(self.oeb.spine):
            root = self.remove_namespaces(self.data(item))
            body = root.xpath('//body')[0]
            body.tail = '\n'

            # First pass: break up document into rendered strings of length no
            # more than CHUNK_SIZE
            chunks = []
            self.step_into_tag(body, chunks)

            # Second pass: Merge neighboring small chunks within the same
            # skeleton tag so as to have chunks as close to the CHUNK_SIZE as
            # possible.
            chunks = self.merge_small_chunks(chunks)

            # Third pass: Create the skeleton and calculate the insert position
            # for all chunks
            self.skeletons.append(Skeleton(i, item, root, chunks))
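    # Intended usage, per the KF8Writer.chunk_it_up() hunk above
    # (data_func returns the parsed lxml tree for a spine item):
    #
    #   chunker = Chunker(oeb, data_func)
    #   for skel in chunker.skeletons:
    #       # skel.skeleton is the rendered markup with chunk contents
    #       # removed; each chunk in skel.chunks knows its insert_pos
    #       ...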
    def remove_namespaces(self, root):
        lang = None
        for attr, val in root.attrib.iteritems():
            if attr.rpartition('}')[-1] == 'lang':
                lang = val

        # Remove all namespace information from the tree. This means namespaced
        # tags have their namespaces removed and all namespace declarations are
        # removed. We have to do this manual cloning of the tree as there is no
        # other way to remove namespace declarations in lxml. This is done so
        # that serialization creates clean HTML 5 markup with no namespaces. We
        # insert the XHTML namespace manually after serialization. The
        # preceding layers should have removed svg and any other non html
        # namespaced tags.
        attrib = {'lang':lang} if lang else {}
        nroot = etree.Element('html', attrib=attrib)
        nroot.text = root.text
        nroot.tail = '\n'

        for tag in root.iterdescendants(etree.Element):
            # We are ignoring all non tag entities in the tree
            # like comments and processing instructions, as they make the
            # chunking code even harder, for minimal gain.
            elem = nroot.makeelement(tag.tag.rpartition('}')[-1],
                    attrib={k.rpartition('}')[-1]:v for k, v in
                        tag.attrib.iteritems()})
            elem.text, elem.tail = tag.text, tag.tail
            parent = node_from_path(nroot, path_to_node(tag.getparent()))
            parent.append(elem)

        return nroot
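    # Illustrative before/after (assumption, not from the commit): an
    # input of
    #   <html xmlns="http://www.w3.org/1999/xhtml"><body><p>x</p></body></html>
    # is rebuilt as
    #   <html><body><p>x</p></body></html>
    # and Skeleton.render() re-inserts the xmlns declaration after
    # serialization.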
    def step_into_tag(self, tag, chunks):
        aid = tag.get('aid')

        first_chunk_idx = len(chunks)

        # First handle any text
        if tag.text and tag.text.strip(): # Leave pure whitespace in the skel
            chunks.extend(self.chunk_up_text(tag.text))
            tag.text = None

        # Now loop over children
        for child in list(tag):
            raw = etree.tostring(child, encoding='UTF-8',
                    xml_declaration=False, with_tail=False)
            raw = close_self_closing_tags(raw)
            if len(raw) > CHUNK_SIZE and child.get('aid', None):
                self.step_into_tag(child, chunks)
                if child.tail and child.tail.strip(): # Leave pure whitespace
                    chunks.extend(self.chunk_up_text(child.tail))
                child.tail = None
            else:
                if len(raw) > CHUNK_SIZE:
                    self.log.warn('Tag %s has no aid and a too large chunk'
                            ' size. Adding anyway.'%child.tag)
                chunks.append(Chunk(raw))
                if child.tail:
                    chunks.extend(self.chunk_up_text(child.tail))
            tag.remove(child)

        if len(chunks) <= first_chunk_idx and chunks:
            raise ValueError('Stepped into a tag that generated no chunks.')

        # Mark the first and last chunks of this tag
        if chunks:
            chunks[first_chunk_idx].starts_tags.append(aid)
            chunks[-1].ends_tags.append(aid)
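    # Recursion sketch (assumed reading): a child that renders to more
    # than CHUNK_SIZE bytes and carries an aid is descended into, so its
    # own open and close tags stay in the skeleton while its contents
    # become chunks; any other child is lifted out wholesale as a single
    # chunk, oversized or not. Every child is removed from the tree,
    # leaving only the skeleton behind.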
    def chunk_up_text(self, text):
        text = text.encode('utf-8')
        ans = []

        def split_multibyte_text(raw):
            if len(raw) <= CHUNK_SIZE:
                return raw, b''
            l = raw[:CHUNK_SIZE]
            l = l.decode('utf-8', 'ignore').encode('utf-8')
            return l, raw[len(l):]

        start, rest = split_multibyte_text(text)
        ans.append(start)
        while rest:
            start, rest = split_multibyte_text(rest)
            ans.append(b'<span class="AmznBigTextBlock">' + start + b'</span>')
        return [Chunk(x) for x in ans]
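    # Example of the multibyte-safe split (illustrative): when the
    # CHUNK_SIZE cut falls inside a UTF-8 sequence, decode('utf-8',
    # 'ignore') drops the trailing partial character, moving the cut back
    # to a character boundary:
    #
    #   >>> raw = (u'\u20ac' * 3000).encode('utf-8')  # 9000 bytes, 3 per char
    #   >>> len(raw[:8192].decode('utf-8', 'ignore').encode('utf-8'))
    #   8190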
    def merge_small_chunks(self, chunks):
        ans = chunks[:1]
        for chunk in chunks[1:]:
            prev = ans[-1]
            if (
                chunk.starts_tags or # Starts a tag in the skel
                len(chunk) + len(prev) > CHUNK_SIZE or # Too large
                prev.ends_tags # Prev chunk ended a tag
            ):
                ans.append(chunk)
            else:
                prev.merge(chunk)
        return ans
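    # Merge policy example (illustrative): two adjacent 3072-byte chunks
    # inside the same skeleton tag merge into one 6144-byte chunk; a third
    # 3072-byte chunk then stays separate, since 6144 + 3072 > CHUNK_SIZE
    # (8192).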