Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00
KF8 Output: Implement the chunking algorithm. Needs review and testing
commit de700eb326 (parent 5c0cd6e070)
@@ -172,6 +172,7 @@ class MOBIOutput(OutputFormatPlugin):
         kf8 = self.create_kf8(resources) if create_kf8 else None
+        self.log('Creating MOBI 6 output')
         self.write_mobi(input_plugin, output_path, kf8, resources)

     def create_kf8(self, resources):
@@ -19,6 +19,7 @@ from calibre.ebooks.mobi.utils import to_base
 from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
     extract, XHTML, urlnormalize)
 from calibre.ebooks.oeb.parse_utils import barename
+from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags

 XML_DOCS = OEB_DOCS | {SVG_MIME}
@@ -28,20 +29,11 @@ to_ref = partial(to_base, base=32, min_num_digits=4)
 # References in links are stored with 10 digits
 to_href = partial(to_base, base=32, min_num_digits=10)

-# Tags to which kindlegen adds the aid attribute
-aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
-    'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details',
-    'dfn', 'div', 'dl', 'dt', 'em', 'fieldset', 'figcaption', 'figure', 'footer',
-    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'i', 'ins', 'kbd',
-    'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p',
-    'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small',
-    'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var',
-    'video'}
-

 class KF8Writer(object):

     def __init__(self, oeb, opts, resources):
         self.oeb, self.opts, self.log = oeb, opts, oeb.log
+        self.log.info('Creating KF8 output')
         self.used_images = set()
         self.resources = resources
         self.dup_data()
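The to_ref/to_href helpers above serialize integers in base 32. A sketch of roughly what the wrapped to_base (from calibre.ebooks.mobi.utils) computes, under the assumption that it is a plain positional encoder with digits 0-9A-V padded to min_num_digits; to_base_sketch is a hypothetical stand-in, not the real helper:

    import string

    def to_base_sketch(num, base=32, min_num_digits=None):
        digits = string.digits + string.ascii_uppercase
        ans = []
        while num:
            ans.append(digits[num % base])
            num //= base
        out = ''.join(reversed(ans)) or '0'
        if min_num_digits is not None:
            out = out.rjust(min_num_digits, '0')  # pad to a fixed width
        return out

    print(to_base_sketch(100, min_num_digits=10))  # '0000000034'

The fixed 10-digit width presumably lets the link placeholders mentioned below be overwritten in place later without shifting any byte offsets.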
@@ -52,6 +44,7 @@ class KF8Writer(object):
         self.extract_svg_into_flows()
         self.replace_internal_links_with_placeholders()
         self.insert_aid_attributes()
+        self.chunk_it_up()

     def dup_data(self):
         ''' Duplicate data so that any changes we make to markup/CSS only
@@ -144,6 +137,7 @@ class KF8Writer(object):
                 continue
             repl = etree.Element(XHTML('link'), type='text/css',
                     rel='stylesheet')
+            repl.tail='\n'
             p.insert(idx, repl)
             extract(tag)
             inlines[raw].append(repl)
@@ -204,3 +198,8 @@ class KF8Writer(object):
             j += 1

+    def chunk_it_up(self):
+        chunker = Chunker(self.oeb, self.data)
+        chunker
src/calibre/ebooks/mobi/writer8/skeleton.py (new file, 232 lines)
@@ -0,0 +1,232 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re
from collections import namedtuple

from lxml import etree

from calibre.ebooks.oeb.base import XHTML_NS

CHUNK_SIZE = 8192
# Tags to which kindlegen adds the aid attribute
aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
    'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details',
    'dfn', 'div', 'dl', 'dt', 'em', 'fieldset', 'figcaption', 'figure', 'footer',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'i', 'ins', 'kbd',
    'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p',
    'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small',
    'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var',
    'video'}

_self_closing_pat = re.compile(bytes(
    r'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>'%('|'.join(aid_able_tags))),
    re.IGNORECASE)
def close_self_closing_tags(raw):
    return _self_closing_pat.sub(br'<\g<tag>\g<arg>></\g<tag>>', raw)
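What the substitution does, in a quick sketch (not part of the commit): self-closed forms of aid-able tags are expanded into explicit open/close pairs, presumably so that chunk text can later be inserted between the opening and closing tags.

    raw = b'<div class="x"/><br/>'
    print(close_self_closing_tags(raw))
    # b'<div class="x"></div><br/>' -- <br/> is untouched, 'br' is not aid-able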
def path_to_node(node):
    ans = []
    parent = node.getparent()
    while parent is not None:
        ans.append(parent.index(node))
        node = parent
        parent = parent.getparent()
    return tuple(reversed(ans))

def node_from_path(root, path):
    parent = root
    for idx in path:
        parent = parent[idx]
    return parent
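These two helpers let a node found in one tree be located again in a structurally identical clone, which remove_namespaces below relies on. A minimal round trip (a sketch, not part of the commit):

    root = etree.fromstring('<html><body><p>a</p><p>b</p></body></html>')
    clone = etree.fromstring('<html><body><p>A</p><p>B</p></body></html>')
    path = path_to_node(root[0][1])           # second <p> -> (0, 1)
    print(node_from_path(clone, path).text)   # 'B'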
class Chunk(object):

    def __init__(self, raw):
        self.raw = raw
        self.starts_tags = []
        self.ends_tags = []
        self.insert_pos = None

    def __len__(self):
        return len(self.raw)

    def merge(self, chunk):
        self.raw += chunk.raw
        self.ends_tags = chunk.ends_tags
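Merging concatenates the raw bytes and adopts the absorbed chunk's ends_tags; the absorbed chunk must not itself start a tag, which merge_small_chunks below guarantees. For example (sketch):

    a, b = Chunk(b'x' * 10), Chunk(b'y' * 10)
    a.merge(b)
    print(len(a))  # 20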
class Skeleton(object):

    def __init__(self, file_number, item, root, chunks):
        self.file_number, self.item = file_number, item
        self.chunks = chunks

        self.skeleton = self.render(root)
        self.body_offset = self.skeleton.find('<body')
        self.calculate_metrics(root)

        self.calculate_insert_positions()

    def render(self, root):
        raw = etree.tostring(root, encoding='UTF-8', xml_declaration=True)
        raw = raw.replace('<html', '<html xmlns="%s"'%XHTML_NS, 1)
        return raw
    def calculate_metrics(self, root):
        Metric = namedtuple('Metric', 'start end')
        self.metrics = {}
        for tag in root.xpath('//*[@aid]'):
            text = (tag.text or '').encode('utf-8')
            raw = etree.tostring(tag, encoding='UTF-8', with_tail=True,
                    xml_declaration=False)
            start_length = len(raw.partition(b'>')[0]) + len(text) + 1
            end_length = len(raw.rpartition(b'<')[-1]) + 1
            self.metrics[tag.get('aid')] = Metric(start_length, end_length)
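A worked example of the two metrics (a sketch, not part of the commit): for a skeleton tag that renders as <p aid="7">Hello</p> followed by a newline tail,

    start = len('<p aid="7"') + len('Hello') + 1   # == 16, opening tag plus its direct text
    end   = len('/p>\n') + 1                       # == 5, closing tag plus tail

so start covers everything before the tag's first chunked child and end covers everything after the last one.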
    def calculate_insert_positions(self):
        pos = self.body_offset
        for chunk in self.chunks:
            for tag in chunk.starts_tags:
                pos += self.metrics[tag].start
            chunk.insert_pos = pos
            pos += len(chunk)
            for tag in chunk.ends_tags:
                pos += self.metrics[tag].end
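The offset walk for two chunks inside a single skeleton tag with aid 'a', where chunk1 starts the tag and chunk2 ends it, looks like this (sketch):

    pos = body_offset
    pos += metrics['a'].start   # skip '<div aid="a">' and its text
    chunk1.insert_pos = pos
    pos += len(chunk1)
    chunk2.insert_pos = pos
    pos += len(chunk2)
    pos += metrics['a'].end     # skip '</div>' and its tail

Each insert_pos is the byte offset in the rendered skeleton at which that chunk's raw bytes belong.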
class Chunker(object):

    def __init__(self, oeb, data_func):
        self.oeb, self.log = oeb, oeb.log
        self.data = data_func

        self.skeletons = []

        for i, item in enumerate(self.oeb.spine):
            root = self.remove_namespaces(self.data(item))
            body = root.xpath('//body')[0]
            body.tail = '\n'

            # First pass: break up document into rendered strings of length no
            # more than CHUNK_SIZE
            chunks = []
            self.step_into_tag(body, chunks)

            # Second pass: Merge neighboring small chunks within the same
            # skeleton tag so as to have chunks as close to the CHUNK_SIZE as
            # possible.
            chunks = self.merge_small_chunks(chunks)

            # Third pass: Create the skeleton and calculate the insert position
            # for all chunks
            self.skeletons.append(Skeleton(i, item, root, chunks))
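Hypothetical usage (a sketch; oeb is a parsed OEB book and data is the markup accessor that KF8Writer.chunk_it_up passes in, neither of which is defined here):

    chunker = Chunker(oeb, data)
    for skel in chunker.skeletons:
        print(skel.file_number, len(skel.chunks), len(skel.skeleton))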
    def remove_namespaces(self, root):
        lang = None
        for attr, val in root.attrib.iteritems():
            if attr.rpartition('}')[-1] == 'lang':
                lang = val

        # Remove all namespace information from the tree. This means namespaced
        # tags have their namespaces removed and all namespace declarations are
        # removed. We have to do this manual cloning of the tree as there is no
        # other way to remove namespace declarations in lxml. This is done so
        # that serialization creates clean HTML 5 markup with no namespaces. We
        # insert the XHTML namespace manually after serialization. The
        # preceding layers should have removed svg and any other non html
        # namespaced tags.
        attrib = {'lang':lang} if lang else {}
        nroot = etree.Element('html', attrib=attrib)
        nroot.text = root.text
        nroot.tail = '\n'

        for tag in root.iterdescendants(etree.Element):
            # We are ignoring all non tag entities in the tree
            # like comments and processing instructions, as they make the
            # chunking code even harder, for minimal gain.
            elem = nroot.makeelement(tag.tag.rpartition('}')[-1],
                    attrib={k.rpartition('}')[-1]:v for k, v in
                        tag.attrib.iteritems()})
            elem.text, elem.tail = tag.text, tag.tail
            parent = node_from_path(nroot, path_to_node(tag.getparent()))
            parent.append(elem)

        return nroot
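The Clark-notation trick used throughout, rpartition('}')[-1], strips the namespace from a qualified name (sketch):

    print('{http://www.w3.org/1999/xhtml}div'.rpartition('}')[-1])  # 'div'
    print('div'.rpartition('}')[-1])  # 'div' -- a no-op when there is no namespace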
    def step_into_tag(self, tag, chunks):
        aid = tag.get('aid')

        first_chunk_idx = len(chunks)

        # First handle any text
        if tag.text and tag.text.strip(): # Leave pure whitespace in the skel
            chunks.extend(self.chunk_up_text(tag.text))
            tag.text = None

        # Now loop over children
        for child in list(tag):
            raw = etree.tostring(child, encoding='UTF-8',
                    xml_declaration=False, with_tail=False)
            raw = close_self_closing_tags(raw)
            if len(raw) > CHUNK_SIZE and child.get('aid', None):
                self.step_into_tag(child, chunks)
                if child.tail and child.tail.strip(): # Leave pure whitespace
                    chunks.extend(self.chunk_up_text(child.tail))
                    child.tail = None
            else:
                if len(raw) > CHUNK_SIZE:
                    self.log.warn('Tag %s has no aid and a too large chunk'
                        ' size. Adding anyway.'%child.tag)
                chunks.append(Chunk(raw))
                if child.tail:
                    chunks.extend(self.chunk_up_text(child.tail))
                tag.remove(child)

        if len(chunks) <= first_chunk_idx and chunks:
            raise ValueError('Stepped into a tag that generated no chunks.')

        # Mark the first and last chunks of this tag
        if chunks:
            chunks[first_chunk_idx].starts_tags.append(aid)
            chunks[-1].ends_tags.append(aid)
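In effect, the recursion keeps a tag in the skeleton whenever its rendering exceeds CHUNK_SIZE and it carries an aid, descending to chunk its contents; every other child is serialized whole, recorded as a chunk, and removed from the tree. The starts_tags/ends_tags bookkeeping on the first and last chunk of each stepped-into tag is what later feeds Skeleton.calculate_insert_positions.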
    def chunk_up_text(self, text):
        text = text.encode('utf-8')
        ans = []

        def split_multibyte_text(raw):
            if len(raw) <= CHUNK_SIZE:
                return raw, b''
            l = raw[:CHUNK_SIZE]
            l = l.decode('utf-8', 'ignore').encode('utf-8')
            return l, raw[len(l):]

        start, rest = split_multibyte_text(text)
        ans.append(start)
        while rest:
            start, rest = split_multibyte_text(rest)
            ans.append(b'<span class="AmznBigTextBlock">' + start + b'</span>')
        return [Chunk(x) for x in ans]
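The decode/ignore/encode dance moves the split point back to a UTF-8 character boundary, so a multibyte character straddling CHUNK_SIZE is never cut in half (sketch, not part of the commit):

    raw = (u'\u20ac' * 3000).encode('utf-8')   # 9000 bytes of 3-byte euro signs
    l = raw[:CHUNK_SIZE]                       # 8192 bytes, ends mid-character
    l = l.decode('utf-8', 'ignore').encode('utf-8')
    print(len(l))                              # 8190 -- trimmed to whole characters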
    def merge_small_chunks(self, chunks):
        ans = chunks[:1]
        for chunk in chunks[1:]:
            prev = ans[-1]
            if (
                chunk.starts_tags or                    # Starts a tag in the skel
                len(chunk) + len(prev) > CHUNK_SIZE or  # Too large
                prev.ends_tags                          # Prev chunk ended a tag
               ):
                ans.append(chunk)
            else:
                prev.merge(chunk)
        return ans