KF8 Output: Implement the chunking algorithm. Needs review and testing

This commit is contained in:
Kovid Goyal 2012-04-19 17:51:08 +05:30
parent 5c0cd6e070
commit de700eb326
3 changed files with 242 additions and 10 deletions

View File

@ -172,6 +172,7 @@ class MOBIOutput(OutputFormatPlugin):
kf8 = self.create_kf8(resources) if create_kf8 else None kf8 = self.create_kf8(resources) if create_kf8 else None
self.log('Creating MOBI 6 output')
self.write_mobi(input_plugin, output_path, kf8, resources) self.write_mobi(input_plugin, output_path, kf8, resources)
def create_kf8(self, resources): def create_kf8(self, resources):

View File

@ -19,6 +19,7 @@ from calibre.ebooks.mobi.utils import to_base
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath, from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
extract, XHTML, urlnormalize) extract, XHTML, urlnormalize)
from calibre.ebooks.oeb.parse_utils import barename from calibre.ebooks.oeb.parse_utils import barename
from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags
XML_DOCS = OEB_DOCS | {SVG_MIME} XML_DOCS = OEB_DOCS | {SVG_MIME}
@ -28,20 +29,11 @@ to_ref = partial(to_base, base=32, min_num_digits=4)
# References in links are stored with 10 digits # References in links are stored with 10 digits
to_href = partial(to_base, base=32, min_num_digits=10) to_href = partial(to_base, base=32, min_num_digits=10)
# Tags to which kindlegen adds the aid attribute
aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details',
'dfn', 'div', 'dl', 'dt', 'em', 'fieldset', 'figcaption', 'figure', 'footer',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'i', 'ins', 'kbd',
'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p',
'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small',
'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var',
'video'}
class KF8Writer(object): class KF8Writer(object):
def __init__(self, oeb, opts, resources): def __init__(self, oeb, opts, resources):
self.oeb, self.opts, self.log = oeb, opts, oeb.log self.oeb, self.opts, self.log = oeb, opts, oeb.log
self.log.info('Creating KF8 output')
self.used_images = set() self.used_images = set()
self.resources = resources self.resources = resources
self.dup_data() self.dup_data()
@ -52,6 +44,7 @@ class KF8Writer(object):
self.extract_svg_into_flows() self.extract_svg_into_flows()
self.replace_internal_links_with_placeholders() self.replace_internal_links_with_placeholders()
self.insert_aid_attributes() self.insert_aid_attributes()
self.chunk_it_up()
def dup_data(self): def dup_data(self):
''' Duplicate data so that any changes we make to markup/CSS only ''' Duplicate data so that any changes we make to markup/CSS only
@ -144,6 +137,7 @@ class KF8Writer(object):
continue continue
repl = etree.Element(XHTML('link'), type='text/css', repl = etree.Element(XHTML('link'), type='text/css',
rel='stylesheet') rel='stylesheet')
repl.tail='\n'
p.insert(idx, repl) p.insert(idx, repl)
extract(tag) extract(tag)
inlines[raw].append(repl) inlines[raw].append(repl)
@ -204,3 +198,8 @@ class KF8Writer(object):
j += 1 j += 1
def chunk_it_up(self):
chunker = Chunker(self.oeb, self.data)
chunker

View File

@ -0,0 +1,232 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from collections import namedtuple
from lxml import etree
from calibre.ebooks.oeb.base import XHTML_NS
CHUNK_SIZE = 8192
# Tags to which kindlegen adds the aid attribute
aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
        'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del',
        'details', 'dfn', 'div', 'dl', 'dt', 'em', 'fieldset', 'figcaption',
        'figure', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header',
        'hgroup', 'i', 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'mark',
        'meter', 'nav', 'ol', 'output', 'p', 'pre', 'progress', 'q', 'rp',
        'rt', 'samp', 'section', 'select', 'small', 'span', 'strong', 'sub',
        'summary', 'sup', 'textarea', 'time', 'ul', 'var', 'video'}

# Matches a self-closed aid-able tag, e.g. <div .../>. The lookahead requires
# whitespace or '/' immediately after the tag name so that short names do not
# match inside longer ones (e.g. 'p' inside '<pre'). The pattern is built as
# text and explicitly encoded so it is a byte pattern on both Python 2 and
# Python 3 — the previous bytes(str) call raises TypeError on Python 3.
_self_closing_pat = re.compile(
        (r'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>'%('|'.join(aid_able_tags))
            ).encode('ascii'), re.IGNORECASE)

def close_self_closing_tags(raw):
    ''' Expand self-closed aid-able tags in the byte string *raw* into
    explicit open/close pairs, e.g. <p/> -> <p></p>. Used to normalize
    serialized markup before chunk sizes and offsets are measured. '''
    return _self_closing_pat.sub(br'<\g<tag>\g<arg>></\g<tag>>', raw)
def path_to_node(node):
    ''' Return the path of *node* from the root of its tree, as a tuple of
    child indices (empty tuple for the root itself). '''
    steps = []
    up = node.getparent()
    while up is not None:
        # Record where node sits among its parent's children, then climb
        steps.insert(0, up.index(node))
        node, up = up, up.getparent()
    return tuple(steps)
def node_from_path(root, path):
    ''' Inverse of path_to_node(): follow the sequence of child indices in
    *path* down from *root* and return the node reached. '''
    node = root
    for child_idx in path:
        node = node[child_idx]
    return node
class Chunk(object):

    ''' A contiguous run of rendered markup bytes that has been pulled out of
    the skeleton, together with bookkeeping about which skeleton tags open
    and close at its boundaries. '''

    def __init__(self, raw):
        # The serialized bytes this chunk holds
        self.raw = raw
        # aids of skeleton tags whose content starts/ends with this chunk
        self.starts_tags, self.ends_tags = [], []
        # Byte offset in the skeleton at which this chunk is re-inserted;
        # computed later, once the skeleton has been serialized
        self.insert_pos = None

    def __len__(self):
        return len(self.raw)

    def merge(self, chunk):
        # Absorb the following chunk: concatenate its bytes and take over the
        # set of tags it closed (the tags this chunk started are unchanged).
        self.raw += chunk.raw
        self.ends_tags = chunk.ends_tags
class Skeleton(object):

    ''' The serialized skeleton of one spine item: the structural markup left
    behind after the Chunker pulled out all chunkable content, plus the byte
    offsets at which every Chunk must be re-inserted to reconstruct the
    document. '''

    def __init__(self, file_number, item, root, chunks):
        # file_number: index of this item in the spine
        # item: the spine item itself; root: the already-chunked tree
        # chunks: list of Chunk objects extracted from root, document order
        self.file_number, self.item = file_number, item
        self.chunks = chunks

        self.skeleton = self.render(root)
        # Offset of the opening <body> tag in the serialized skeleton; all
        # chunk insert positions are measured from here onwards.
        self.body_offset = self.skeleton.find('<body')
        self.calculate_metrics(root)
        self.calculate_insert_positions()

    def render(self, root):
        # Serialize the skeleton tree. remove_namespaces() stripped all
        # namespace declarations, so re-insert the XHTML namespace on the
        # (first) <html> tag here.
        # NOTE(review): str/bytes mixing — assumes Python 2, where
        # etree.tostring returns a plain str.
        raw = etree.tostring(root, encoding='UTF-8', xml_declaration=True)
        raw = raw.replace('<html', '<html xmlns="%s"'%XHTML_NS, 1)
        return raw

    def calculate_metrics(self, root):
        # For every skeleton tag carrying an aid, measure the serialized byte
        # length of its opening part (open tag plus any text left on the tag)
        # and of its closing part (close tag plus tail), keyed by aid.
        Metric = namedtuple('Metric', 'start end')
        self.metrics = {}
        for tag in root.xpath('//*[@aid]'):
            text = (tag.text or '').encode('utf-8')
            raw = etree.tostring(tag, encoding='UTF-8', with_tail=True,
                    xml_declaration=False)
            # Everything before the first '>' is the open tag; +1 adds the
            # '>' itself
            start_length = len(raw.partition(b'>')[0]) + len(text) + 1
            # Everything after the last '<' is the close tag plus the tail;
            # +1 adds the '<' itself
            end_length = len(raw.rpartition(b'<')[-1]) + 1
            self.metrics[tag.get('aid')] = Metric(start_length, end_length)

    def calculate_insert_positions(self):
        # Walk the chunks in document order accumulating byte offsets: a
        # chunk is inserted after the opening parts of the skeleton tags it
        # starts; later offsets account for the chunk's own length and the
        # closing parts of the tags it ends.
        pos = self.body_offset
        for chunk in self.chunks:
            for tag in chunk.starts_tags:
                pos += self.metrics[tag].start
            chunk.insert_pos = pos
            pos += len(chunk)
            for tag in chunk.ends_tags:
                pos += self.metrics[tag].end
class Chunker(object):

    ''' Split each spine item into a skeleton (structural markup that stays
    in place) plus a list of rendered-markup Chunks of (ideally) at most
    CHUNK_SIZE bytes each. One Skeleton per spine item is collected in
    self.skeletons. '''

    def __init__(self, oeb, data_func):
        # data_func(item) must return the parsed tree for a spine item
        self.oeb, self.log = oeb, oeb.log
        self.data = data_func

        self.skeletons = []

        for i, item in enumerate(self.oeb.spine):
            root = self.remove_namespaces(self.data(item))
            body = root.xpath('//body')[0]
            body.tail = '\n'

            # First pass: break up document into rendered strings of length no
            # more than CHUNK_SIZE
            chunks = []
            self.step_into_tag(body, chunks)

            # Second pass: Merge neighboring small chunks within the same
            # skeleton tag so as to have chunks as close to the CHUNK_SIZE as
            # possible.
            chunks = self.merge_small_chunks(chunks)

            # Third pass: Create the skeleton and calculate the insert position
            # for all chunks
            self.skeletons.append(Skeleton(i, item, root, chunks))

    def remove_namespaces(self, root):
        ''' Clone the tree into a namespace-free copy, preserving only a lang
        attribute on the new <html> root if the original carried one. '''
        lang = None
        # NOTE(review): iteritems() is Python 2 only
        for attr, val in root.attrib.iteritems():
            if attr.rpartition('}')[-1] == 'lang':
                lang = val

        # Remove all namespace information from the tree. This means namespaced
        # tags have their namespaces removed and all namespace declarations are
        # removed. We have to do this manual cloning of the tree as there is no
        # other way to remove namespace declarations in lxml. This is done so
        # that serialization creates clean HTML 5 markup with no namespaces. We
        # insert the XHTML namespace manually after serialization. The
        # preceding layers should have removed svg and any other non html
        # namespaced tags.
        attrib = {'lang':lang} if lang else {}
        nroot = etree.Element('html', attrib=attrib)
        nroot.text = root.text
        nroot.tail = '\n'

        for tag in root.iterdescendants(etree.Element):
            # We are ignoring all non tag entities in the tree
            # like comments and processing instructions, as they make the
            # chunking code even harder, for minimal gain.
            elem = nroot.makeelement(tag.tag.rpartition('}')[-1],
                    attrib={k.rpartition('}')[-1]:v for k, v in
                        tag.attrib.iteritems()})
            elem.text, elem.tail = tag.text, tag.tail
            # Graft the clone at the position its original occupied;
            # iterdescendants yields document order, so the cloned parent
            # already exists in nroot.
            parent = node_from_path(nroot, path_to_node(tag.getparent()))
            parent.append(elem)

        return nroot

    def step_into_tag(self, tag, chunks):
        ''' Recursively convert the serialized contents of *tag* into Chunk
        objects appended to *chunks*, removing chunked children and text from
        the tree as it goes (so only the skeleton remains in the tree). '''
        aid = tag.get('aid')

        first_chunk_idx = len(chunks)

        # First handle any text
        if tag.text and tag.text.strip(): # Leave pure whitespace in the skel
            chunks.extend(self.chunk_up_text(tag.text))
            tag.text = None

        # Now loop over children
        for child in list(tag):
            raw = etree.tostring(child, encoding='UTF-8',
                    xml_declaration=False, with_tail=False)
            raw = close_self_closing_tags(raw)
            if len(raw) > CHUNK_SIZE and child.get('aid', None):
                # Too big for one chunk, but it has an aid, so it can stay in
                # the skeleton and be descended into.
                self.step_into_tag(child, chunks)
                if child.tail and child.tail.strip(): # Leave pure whitespace
                    chunks.extend(self.chunk_up_text(child.tail))
                    child.tail = None
            else:
                if len(raw) > CHUNK_SIZE:
                    self.log.warn('Tag %s has no aid and a too large chunk'
                            ' size. Adding anyway.'%child.tag)
                chunks.append(Chunk(raw))
                if child.tail:
                    chunks.extend(self.chunk_up_text(child.tail))
                tag.remove(child)

        if len(chunks) <= first_chunk_idx and chunks:
            raise ValueError('Stepped into a tag that generated no chunks.')

        # Mark the first and last chunks of this tag
        if chunks:
            chunks[first_chunk_idx].starts_tags.append(aid)
            chunks[-1].ends_tags.append(aid)

    def chunk_up_text(self, text):
        ''' UTF-8 encode *text* and split it into Chunks of at most
        CHUNK_SIZE bytes, never splitting inside a multi-byte character.
        Continuation pieces are wrapped in an AmznBigTextBlock span. '''
        text = text.encode('utf-8')
        ans = []

        def split_multibyte_text(raw):
            # Return (prefix, rest): the longest clean UTF-8 prefix of at
            # most CHUNK_SIZE bytes, and whatever is left over.
            if len(raw) <= CHUNK_SIZE:
                return raw, b''
            l = raw[:CHUNK_SIZE]
            # decode(..., 'ignore') drops a trailing truncated character, so
            # re-encoding yields a clean prefix; the truncated bytes land in
            # the remainder via raw[len(l):].
            l = l.decode('utf-8', 'ignore').encode('utf-8')
            return l, raw[len(l):]

        start, rest = split_multibyte_text(text)
        ans.append(start)
        while rest:
            start, rest = split_multibyte_text(rest)
            # NOTE(review): bytes + str concatenation — Python 2 only
            ans.append(b'<span class="AmznBigTextBlock">' + start + '</span>')

        return [Chunk(x) for x in ans]

    def merge_small_chunks(self, chunks):
        ''' Merge each chunk into its predecessor unless that would cross a
        skeleton tag boundary or push the result past CHUNK_SIZE. '''
        ans = chunks[:1]
        for chunk in chunks[1:]:
            prev = ans[-1]
            if (
                    chunk.starts_tags or # Starts a tag in the skel
                    len(chunk) + len(prev) > CHUNK_SIZE or # Too large
                    prev.ends_tags # Prev chunk ended a tag
                ):
                ans.append(chunk)
            else:
                prev.merge(chunk)
        return ans