diff --git a/src/calibre/ebooks/conversion/plugins/mobi_output.py b/src/calibre/ebooks/conversion/plugins/mobi_output.py index 10d5d87d76..89ab91f8eb 100644 --- a/src/calibre/ebooks/conversion/plugins/mobi_output.py +++ b/src/calibre/ebooks/conversion/plugins/mobi_output.py @@ -172,6 +172,7 @@ class MOBIOutput(OutputFormatPlugin): kf8 = self.create_kf8(resources) if create_kf8 else None + self.log('Creating MOBI 6 output') self.write_mobi(input_plugin, output_path, kf8, resources) def create_kf8(self, resources): diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py index e23d292370..79ff7c3d96 100644 --- a/src/calibre/ebooks/mobi/writer8/main.py +++ b/src/calibre/ebooks/mobi/writer8/main.py @@ -19,6 +19,7 @@ from calibre.ebooks.mobi.utils import to_base from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath, extract, XHTML, urlnormalize) from calibre.ebooks.oeb.parse_utils import barename +from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags XML_DOCS = OEB_DOCS | {SVG_MIME} @@ -28,20 +29,11 @@ to_ref = partial(to_base, base=32, min_num_digits=4) # References in links are stored with 10 digits to_href = partial(to_base, base=32, min_num_digits=10) -# Tags to which kindlegen adds the aid attribute -aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b', -'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details', -'dfn', 'div', 'dl', 'dt', 'em', 'fieldset', 'figcaption', 'figure', 'footer', -'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'i', 'ins', 'kbd', -'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p', -'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small', -'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var', -'video'} - class KF8Writer(object): def __init__(self, oeb, opts, resources): self.oeb, self.opts, self.log = oeb, opts, oeb.log + self.log.info('Creating KF8 
# Upper bound, in bytes, that the chunking passes below aim for per chunk.
CHUNK_SIZE = 8192

# Tags to which kindlegen adds the aid attribute
aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details',
'dfn', 'div', 'dl', 'dt', 'em', 'fieldset', 'figcaption', 'figure', 'footer',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'i', 'ins', 'kbd',
'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p',
'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small',
'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var',
'video'}

# Matches a self-closing aid-able tag (``<p .../>``) in serialized bytes.
# ``tag`` captures the tag name and ``arg`` everything up to the closing
# ``/>``. The lookahead stops ``b`` from matching the start of ``br``.
# The pattern is built as text and encoded so that it is a bytes pattern on
# both Python 2 and Python 3.
_self_closing_pat = re.compile(
    (r'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>' % (
        '|'.join(sorted(aid_able_tags)))).encode('ascii'),
    re.IGNORECASE)


def close_self_closing_tags(raw):
    '''
    Expand self-closing aid-able tags into explicit open/close pairs.

    :param raw: Serialized markup as bytes.
    :return: ``raw`` with every ``<tag attrs/>`` whose name is in
        ``aid_able_tags`` rewritten as ``<tag attrs></tag>``. Other tags are
        left untouched.
    '''
    return _self_closing_pat.sub(br'<\g<tag>\g<arg>></\g<tag>>', raw)


def path_to_node(node):
    '''
    Return the tuple of child indices leading from the tree root to ``node``.

    ``node`` and its ancestors must provide ``getparent()`` and
    ``index(child)``. The root itself maps to the empty tuple.
    '''
    ans = []
    parent = node.getparent()
    while parent is not None:
        ans.append(parent.index(node))
        node = parent
        parent = parent.getparent()
    return tuple(reversed(ans))


def node_from_path(root, path):
    '''
    Inverse of :func:`path_to_node`: follow the child indices in ``path``
    starting at ``root`` and return the node reached.
    '''
    parent = root
    for idx in path:
        parent = parent[idx]
    return parent


class Chunk(object):

    ''' A run of serialized bytes that was cut out of the skeleton. '''

    def __init__(self, raw):
        self.raw = raw
        # aid values of the skeleton tags whose content begins/ends with
        # this chunk (filled in by Chunker.step_into_tag)
        self.starts_tags = []
        self.ends_tags = []
        # Byte offset of this chunk; set by
        # Skeleton.calculate_insert_positions
        self.insert_pos = None

    def __len__(self):
        return len(self.raw)

    def merge(self, chunk):
        ''' Append ``chunk`` to this one, taking over its ``ends_tags``. '''
        self.raw += chunk.raw
        self.ends_tags = chunk.ends_tags


class Skeleton(object):

    '''
    The markup left over for one spine item after its content has been moved
    into chunks, together with the position of every chunk.

    NOTE(review): the source this class was recovered from had lost the text
    between ``self.skeleton.find('`` and ``)[0]) + len(text) + 1``. The rest
    of ``__init__``, ``render()`` and the first half of
    ``calculate_metrics()`` are reconstructed from the surviving fragments
    and the module's imports (``namedtuple``, ``etree``, ``XHTML_NS``).
    Confirm against the original file.
    '''

    def __init__(self, file_number, item, root, chunks):
        self.file_number, self.item = file_number, item
        self.chunks = chunks

        self.skeleton = self.render(root)
        self.body_offset = self.skeleton.find(b'<body')
        self.calculate_metrics(root)

        self.calculate_insert_positions()

    def render(self, root):
        ''' Serialize ``root`` and add the XHTML namespace declaration. '''
        raw = etree.tostring(root, xml_declaration=True)
        # The tree was built without namespaces (see
        # Chunker.remove_namespaces), so the declaration is added textually.
        raw = raw.replace(b'<html',
                ('<html xmlns="%s"' % XHTML_NS).encode('utf-8'), 1)
        return raw

    def calculate_metrics(self, root):
        '''
        Record, for every tag carrying an ``aid`` attribute, the byte length
        of its opening part (opening tag plus any text left in the tag) and
        of its closing part, keyed by the aid value.
        '''
        Metric = namedtuple('Metric', 'start end')
        self.metrics = {}
        for tag in root.xpath('//*[@aid]'):
            text = (tag.text or '').encode('utf-8')
            raw = etree.tostring(tag, with_tail=True)
            # partition() drops the '>' that ends the opening tag and
            # rpartition() drops the '<' that begins the last tag in raw;
            # the + 1 adds that character back in each case.
            start_length = len(raw.partition(b'>')[0]) + len(text) + 1
            end_length = len(raw.rpartition(b'<')[-1]) + 1
            self.metrics[tag.get('aid')] = Metric(start_length, end_length)

    def calculate_insert_positions(self):
        ''' Walk the chunks in order and set ``insert_pos`` on each. '''
        pos = self.body_offset
        for chunk in self.chunks:
            for tag in chunk.starts_tags:
                pos += self.metrics[tag].start
            chunk.insert_pos = pos
            pos += len(chunk)
            for tag in chunk.ends_tags:
                pos += self.metrics[tag].end


class Chunker(object):

    '''
    Split every spine item of ``oeb`` into a :class:`Skeleton` plus a list of
    :class:`Chunk` objects. The results are collected in ``self.skeletons``.

    :param oeb: The book; ``oeb.spine`` is iterated and ``oeb.log`` is used
        for warnings.
    :param data_func: Callable returning the parsed tree for a spine item.
    '''

    def __init__(self, oeb, data_func):
        self.oeb, self.log = oeb, oeb.log
        self.data = data_func

        self.skeletons = []

        for i, item in enumerate(self.oeb.spine):
            root = self.remove_namespaces(self.data(item))
            body = root.xpath('//body')[0]
            body.tail = '\n'

            # First pass: break up document into rendered strings of length
            # no more than CHUNK_SIZE
            chunks = []
            self.step_into_tag(body, chunks)

            # Second pass: Merge neighboring small chunks within the same
            # skeleton tag so as to have chunks as close to the CHUNK_SIZE as
            # possible.
            chunks = self.merge_small_chunks(chunks)

            # Third pass: Create the skeleton and calculate the insert
            # position for all chunks
            self.skeletons.append(Skeleton(i, item, root, chunks))

    def remove_namespaces(self, root):
        '''
        Return a namespace-free copy of the tree rooted at ``root``.

        Only elements are copied. Comments and processing instructions are
        skipped, and so is any text stored as their tail. The new root is
        always an ``html`` element that keeps only the ``lang`` attribute of
        the original root.
        '''
        lang = None
        for attr, val in root.attrib.iteritems():
            if attr.rpartition('}')[-1] == 'lang':
                lang = val

        # Remove all namespace information from the tree. This means
        # namespaced tags have their namespaces removed and all namespace
        # declarations are removed. We have to do this manual cloning of the
        # tree as there is no other way to remove namespace declarations in
        # lxml. This is done so that serialization creates clean HTML 5
        # markup with no namespaces. We insert the XHTML namespace manually
        # after serialization. The preceding layers should have removed svg
        # and any other non html namespaced tags.
        attrib = {'lang': lang} if lang else {}
        nroot = etree.Element('html', attrib=attrib)
        nroot.text = root.text
        nroot.tail = '\n'

        # Map every original element to its clone. Looking the parent up by
        # child index in the original tree breaks as soon as a skipped
        # comment or processing instruction precedes an element, because the
        # clone then has fewer children than the original.
        clones = {root: nroot}
        for tag in root.iterdescendants(etree.Element):
            # We are ignoring all non tag entities in the tree
            # like comments and processing instructions, as they make the
            # chunking code even harder, for minimal gain.
            elem = nroot.makeelement(tag.tag.rpartition('}')[-1],
                    attrib={k.rpartition('}')[-1]: v for k, v in
                        tag.attrib.iteritems()})
            elem.text, elem.tail = tag.text, tag.tail
            # Descendants are visited in document order, so the parent's
            # clone is already registered.
            clones[tag.getparent()].append(elem)
            clones[tag] = elem

        return nroot

    def step_into_tag(self, tag, chunks):
        '''
        Move the content of ``tag`` into ``chunks`` (appended in place).

        Text that is not pure whitespace becomes text chunks. A child whose
        serialized form is larger than CHUNK_SIZE and that has an ``aid`` is
        kept and stepped into recursively. Every other child is serialized
        into a single chunk and removed from ``tag``.

        :raises ValueError: If ``chunks`` is non-empty but this call added
            nothing to it.
        '''
        aid = tag.get('aid')

        first_chunk_idx = len(chunks)

        # First handle any text
        if tag.text and tag.text.strip():  # Leave pure whitespace in the skel
            chunks.extend(self.chunk_up_text(tag.text))
            tag.text = None

        # Now loop over children (a copy, as children are removed below)
        for child in list(tag):
            raw = etree.tostring(child, encoding='UTF-8',
                    xml_declaration=False, with_tail=False)
            raw = close_self_closing_tags(raw)
            if len(raw) > CHUNK_SIZE and child.get('aid', None):
                self.step_into_tag(child, chunks)
                if child.tail and child.tail.strip():  # Leave pure whitespace
                    chunks.extend(self.chunk_up_text(child.tail))
                    child.tail = None
            else:
                if len(raw) > CHUNK_SIZE:
                    self.log.warn('Tag %s has no aid and a too large chunk'
                            ' size. Adding anyway.'%child.tag)
                chunks.append(Chunk(raw))
                if child.tail:
                    chunks.extend(self.chunk_up_text(child.tail))
                # NOTE(review): nesting inferred from the whitespace-collapsed
                # source. Only children that were turned into a chunk are
                # removed, those stepped into stay in the skeleton.
                tag.remove(child)

        if len(chunks) <= first_chunk_idx and chunks:
            raise ValueError('Stepped into a tag that generated no chunks.')

        # Mark the first and last chunks of this tag
        if chunks:
            chunks[first_chunk_idx].starts_tags.append(aid)
            chunks[-1].ends_tags.append(aid)

    def chunk_up_text(self, text):
        '''
        Encode ``text`` as UTF-8 and split it into chunks of at most
        CHUNK_SIZE bytes without cutting a multi-byte character in half.

        The first piece is used as is; every following piece is wrapped in a
        span element. Returns a list of :class:`Chunk` objects.
        '''
        text = text.encode('utf-8')
        ans = []

        def split_multibyte_text(raw):
            if len(raw) <= CHUNK_SIZE:
                return raw, b''
            l = raw[:CHUNK_SIZE]
            # Decoding with 'ignore' drops a character that was cut in half
            # at the end of the slice; it then starts the remainder instead.
            l = l.decode('utf-8', 'ignore').encode('utf-8')
            return l, raw[len(l):]

        start, rest = split_multibyte_text(text)
        ans.append(start)
        while rest:
            start, rest = split_multibyte_text(rest)
            # All operands are bytes: mixing in a text literal would force an
            # implicit ASCII decode of ``start`` on Python 2 and fail for any
            # non-ASCII text.
            # NOTE(review): the wrapper markup was stripped from the source
            # this was recovered from; the class name is reconstructed.
            ans.append(b'<span class="AmznBigTextBlock">' + start + b'</span>')
        return [Chunk(x) for x in ans]

    def merge_small_chunks(self, chunks):
        '''
        Merge each chunk into its predecessor unless the chunk starts a
        skeleton tag, the predecessor ends one, or the combined size would
        exceed CHUNK_SIZE. Returns the new list; merged chunks are modified
        in place.
        '''
        ans = chunks[:1]
        for chunk in chunks[1:]:
            prev = ans[-1]
            if (
                    chunk.starts_tags or  # Starts a tag in the skel
                    len(chunk) + len(prev) > CHUNK_SIZE or  # Too large
                    prev.ends_tags  # Prev chunk ended a tag
                ):
                ans.append(chunk)
            else:
                prev.merge(chunk)
        return ans