KF8 Output: Text processing layer is complete

Kovid Goyal 2012-04-20 18:49:22 +05:30
parent 4b93ebc990
commit 13abe2bb6e
6 changed files with 201 additions and 76 deletions

View File

@@ -14,6 +14,7 @@ from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
 from calibre.ebooks import normalize
 
 IMAGE_MAX_SIZE = 10 * 1024 * 1024
+RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
 
 def decode_string(raw, codec='utf-8', ordt_map=''):
     length, = struct.unpack(b'>B', raw[0])
@@ -498,3 +499,53 @@ def write_font_record(data, obfuscate=True, compress=True):
 # }}}
 
+def create_text_record(text):
+    '''
+    Return a Palmdoc record of size RECORD_SIZE from the text file object.
+    In case the record ends in the middle of a multibyte character return
+    the overlap as well.
+
+    Returns data, overlap: where both are byte strings. overlap is the
+    extra bytes needed to complete the truncated multibyte character.
+    '''
+    opos = text.tell()
+    text.seek(0, 2)
+    # npos is the position of the next record
+    npos = min((opos + RECORD_SIZE, text.tell()))
+    # Number of bytes from the next record needed to complete the last
+    # character in this record
+    extra = 0
+
+    last = b''
+    while not last.decode('utf-8', 'ignore'):
+        # last contains no valid utf-8 characters
+        size = len(last) + 1
+        text.seek(npos - size)
+        last = text.read(size)
+
+    # last now has one valid utf-8 char and possibly some bytes that belong
+    # to a truncated char
+    try:
+        last.decode('utf-8', 'strict')
+    except UnicodeDecodeError:
+        # There are some truncated bytes in last
+        prev = len(last)
+        while True:
+            text.seek(npos - prev)
+            last = text.read(len(last) + 1)
+            try:
+                last.decode('utf-8')
+            except UnicodeDecodeError:
+                pass
+            else:
+                break
+        extra = len(last) - prev
+
+    text.seek(opos)
+    data = text.read(RECORD_SIZE)
+    overlap = text.read(extra)
+    text.seek(npos)
+
+    return data, overlap
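
To see the overlap logic concretely, here is a small sketch (not part of the commit) that places the 4096-byte record boundary inside a 3-byte UTF-8 character; the truncated bytes come back as overlap:

    from io import BytesIO
    from calibre.ebooks.mobi.utils import create_text_record

    # 4095 ASCII bytes, then the 3-byte euro sign (e2 82 ac) straddling
    # the RECORD_SIZE (4096) boundary, then some trailing text.
    text = BytesIO(b'a' * 4095 + u'\u20ac'.encode('utf-8') + b'trailing')
    data, overlap = create_text_record(text)
    print(len(data))  # 4096: the record ends after the first byte of the euro sign
    print(overlap)    # b'\x82\xac': the two bytes needed to complete it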

View File

@@ -12,5 +12,4 @@ UNCOMPRESSED = 1
 PALMDOC = 2
 HUFFDIC = 17480
 
 PALM_MAX_IMAGE_SIZE = 63 * 1024
-RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))

View File

@@ -12,9 +12,8 @@ from struct import pack
 from cStringIO import StringIO
 from collections import OrderedDict, defaultdict
 
-from calibre.ebooks.mobi.writer2 import RECORD_SIZE
 from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
-        encode_tbs, align_block, utf8_text)
+        encode_tbs, align_block, utf8_text, RECORD_SIZE)
 
 class CNCX(object): # {{{

View File

@@ -16,9 +16,9 @@ from calibre.ebooks.mobi.writer2.serializer import Serializer
 from calibre.ebooks.compression.palmdoc import compress_doc
 from calibre.ebooks.mobi.langcodes import iana2mobi
 from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED, RECORD_SIZE)
+from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED)
 from calibre.ebooks.mobi.utils import (encint, encode_trailing_data,
-        align_block, detect_periodical)
+        align_block, detect_periodical, RECORD_SIZE, create_text_record)
 from calibre.ebooks.mobi.writer2.indexer import Indexer
 
 EXTH_CODES = {
@@ -163,9 +163,7 @@ class MobiWriter(object):
     # }}}
 
-    # Text {{{
-
-    def generate_text(self):
+    def generate_text(self): # {{{
         self.oeb.logger.info('Serializing markup content...')
         self.serializer = Serializer(self.oeb, self.image_map,
                 self.is_periodical,
@@ -180,7 +178,7 @@ class MobiWriter(object):
         self.oeb.logger.info(' Compressing markup content...')
 
         while text.tell() < self.text_length:
-            data, overlap = self.read_text_record(text)
+            data, overlap = create_text_record(text)
             if self.compression == PALMDOC:
                 data = compress_doc(data)
@@ -197,57 +195,6 @@ class MobiWriter(object):
         if records_size % 4 != 0:
             self.records.append(b'\x00'*(records_size % 4))
             self.first_non_text_record_idx += 1
-
-    def read_text_record(self, text):
-        '''
-        Return a Palmdoc record of size RECORD_SIZE from the text file object.
-        In case the record ends in the middle of a multibyte character return
-        the overlap as well.
-
-        Returns data, overlap: where both are byte strings. overlap is the
-        extra bytes needed to complete the truncated multibyte character.
-        '''
-        opos = text.tell()
-        text.seek(0, 2)
-        # npos is the position of the next record
-        npos = min((opos + RECORD_SIZE, text.tell()))
-        # Number of bytes from the next record needed to complete the last
-        # character in this record
-        extra = 0
-
-        last = b''
-        while not last.decode('utf-8', 'ignore'):
-            # last contains no valid utf-8 characters
-            size = len(last) + 1
-            text.seek(npos - size)
-            last = text.read(size)
-
-        # last now has one valid utf-8 char and possibly some bytes that belong
-        # to a truncated char
-        try:
-            last.decode('utf-8', 'strict')
-        except UnicodeDecodeError:
-            # There are some truncated bytes in last
-            prev = len(last)
-            while True:
-                text.seek(npos - prev)
-                last = text.read(len(last) + 1)
-                try:
-                    last.decode('utf-8')
-                except UnicodeDecodeError:
-                    pass
-                else:
-                    break
-            extra = len(last) - prev
-
-        text.seek(opos)
-        data = text.read(RECORD_SIZE)
-        overlap = text.read(extra)
-        text.seek(npos)
-
-        return data, overlap
-
     # }}}
 
     def generate_record0(self): # MOBI header {{{

View File

@@ -19,15 +19,13 @@ from calibre.ebooks.mobi.utils import to_base
 from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
         extract, XHTML, urlnormalize)
 from calibre.ebooks.oeb.parse_utils import barename
-from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags
+from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
 
 XML_DOCS = OEB_DOCS | {SVG_MIME}
 
 # References to record numbers in KF8 are stored as base-32 encoded integers,
 # with 4 digits
 to_ref = partial(to_base, base=32, min_num_digits=4)
-
-# References in links are stored with 10 digits
-to_href = partial(to_base, base=32, min_num_digits=10)
 
 class KF8Writer(object):
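
As a sketch of the reference format (not from the commit, and assuming to_base draws its base-32 digits from 0-9A-V):

    print(to_ref(100))  # -> '0034', since 100 = 3*32 + 4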
@@ -167,7 +165,7 @@ class KF8Writer(object):
         self.link_map = {}
         count = 0
         hrefs = {item.href for item in self.oeb.spine}
-        for item in self.oeb.spine:
+        for i, item in enumerate(self.oeb.spine):
             root = self.data(item)
 
             for a in XPath('//h:a[@href]')(root):
@@ -176,7 +174,8 @@ class KF8Writer(object):
                 href, _, frag = ref.partition('#')
                 href = urlnormalize(href)
                 if href in hrefs:
-                    placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count)
+                    placeholder = 'kindle:pos:fid:%04d:off:%s'%(i,
+                            to_href(count))
                     self.link_map[placeholder] = (href, frag)
                     a.set('href', placeholder)
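
So where every placeholder previously hardcoded fid 0000, it now encodes the index of the target spine item. A hypothetical placeholder for the third spine item (i=2) and the fifth link seen (count=4):

    placeholder = 'kindle:pos:fid:%04d:off:%s' % (2, to_href(4))
    print(placeholder)  # -> 'kindle:pos:fid:0002:off:0000000004'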
@@ -199,7 +198,19 @@ class KF8Writer(object):
                 j += 1
 
     def chunk_it_up(self):
-        chunker = Chunker(self.oeb, self.data)
-        chunker
+        placeholder_map = {}
+        for placeholder, x in self.link_map.iteritems():
+            href, frag = x
+            aid = self.id_map.get(x, None)
+            if aid is None:
+                aid = self.id_map.get((href, ''))
+            placeholder_map[placeholder] = aid
+        chunker = Chunker(self.oeb, self.data, not self.opts.dont_compress,
+                placeholder_map)
+        for x in ('skel_table', 'chunk_table', 'aid_offset_map', 'records',
+                'last_text_record_idx', 'first_non_text_record_idx',
+                'text_length'):
+            setattr(self, x, getattr(chunker, x))
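
The fallback lookup resolves a link whose fragment has no aid of its own to the aid at the start of the target file. A minimal sketch with hypothetical id_map entries:

    # id_map maps (href, id) pairs to aid values
    id_map = {('ch1.html', 'sec2'): '1A', ('ch1.html', ''): '07'}
    x = ('ch1.html', 'missing')  # fragment that never got an aid
    aid = id_map.get(x) or id_map.get((x[0], ''))
    print(aid)  # -> '07', the aid at the start of ch1.html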

View File

@@ -9,14 +9,22 @@ __docformat__ = 'restructuredtext en'
 
 import re
 from collections import namedtuple
+from io import BytesIO
+from struct import pack
+from functools import partial
 
 from lxml import etree
 
 from calibre.ebooks.oeb.base import XHTML_NS
 from calibre.constants import ispy3
+from calibre.ebooks.mobi.utils import create_text_record, to_base
+from calibre.ebooks.compression.palmdoc import compress_doc
 
 CHUNK_SIZE = 8192
 
+# References in links are stored with 10 digits
+to_href = partial(to_base, base=32, min_num_digits=10)
+
 # Tags to which kindlegen adds the aid attribute
 aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
     'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details',
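
to_href mirrors to_ref in writer8/main.py but pads to 10 digits. A quick illustration (again assuming the 0-9A-V digit alphabet):

    print(to_href(1000))  # -> '00000000V8', since 1000 = 31*32 + 8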
@@ -70,11 +78,15 @@ def tostring(raw, **kwargs):
 
 class Chunk(object):
 
-    def __init__(self, raw):
+    def __init__(self, raw, parent_tag):
         self.raw = raw
         self.starts_tags = []
         self.ends_tags = []
         self.insert_pos = None
+        self.parent_tag = parent_tag
+        self.parent_is_body = False
+        self.is_last_chunk = False
+        self.is_first_chunk = False
 
     def __len__(self):
         return len(self.raw)
@@ -87,6 +99,11 @@ class Chunk(object):
         return 'Chunk(len=%r insert_pos=%r starts_tags=%r ends_tags=%r)'%(
             len(self.raw), self.insert_pos, self.starts_tags, self.ends_tags)
 
+    @property
+    def selector(self):
+        typ = 'S' if (self.is_last_chunk and not self.parent_is_body) else 'P'
+        return "%s-//*[@aid='%s']"%(typ, self.parent_tag)
+
     __str__ = __repr__
 
 class Skeleton(object):
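
Using the Chunk class above, the selector is prefixed 'S' only for the last chunk of a non-body parent, and 'P' otherwise:

    c = Chunk(b'<p>text</p>', '1A')
    print(c.selector)        # -> "P-//*[@aid='1A']"
    c.is_last_chunk = True   # last chunk of a (non-body) parent tag
    print(c.selector)        # -> "S-//*[@aid='1A']"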
@@ -133,11 +150,20 @@ class Skeleton(object):
             ans = ans[:i] + chunk.raw + ans[i:]
         return ans
 
+    def __len__(self):
+        return len(self.skeleton) + sum([len(x.raw) for x in self.chunks])
+
+    @property
+    def raw_text(self):
+        return b''.join([self.skeleton] + [x.raw for x in self.chunks])
+
 class Chunker(object):
 
-    def __init__(self, oeb, data_func):
+    def __init__(self, oeb, data_func, compress, placeholder_map):
         self.oeb, self.log = oeb, oeb.log
         self.data = data_func
+        self.compress = compress
+        self.placeholder_map = placeholder_map
 
         self.skeletons = []
@@ -174,6 +200,19 @@ class Chunker(object):
         if self.orig_dumps:
             self.dump()
 
+        # Create the SKEL and Chunk tables
+        self.skel_table = []
+        self.chunk_table = []
+        self.create_tables()
+
+        # Set internal links
+        text = b''.join(x.raw_text for x in self.skeletons)
+        text = self.set_internal_links(text)
+
+        # Create text records
+        self.records = []
+        self.create_text_records(text)
+
     def remove_namespaces(self, root):
         lang = None
         for attr, val in root.attrib.iteritems():
@@ -206,15 +245,15 @@ class Chunker(object):
         return nroot
 
     def step_into_tag(self, tag, chunks):
         aid = tag.get('aid')
+        is_body = tag.tag == 'body'
         first_chunk_idx = len(chunks)
 
         # First handle any text
         if tag.text and tag.text.strip(): # Leave pure whitespace in the skel
-            chunks.extend(self.chunk_up_text(tag.text))
+            chunks.extend(self.chunk_up_text(tag.text, aid))
             tag.text = None
 
         # Now loop over children
@@ -224,15 +263,15 @@ class Chunker(object):
             if len(raw) > CHUNK_SIZE and child.get('aid', None):
                 self.step_into_tag(child, chunks)
                 if child.tail and child.tail.strip(): # Leave pure whitespace
-                    chunks.extend(self.chunk_up_text(child.tail))
+                    chunks.extend(self.chunk_up_text(child.tail, aid))
                     child.tail = None
             else:
                 if len(raw) > CHUNK_SIZE:
                     self.log.warn('Tag %s has no aid and a too large chunk'
                             ' size. Adding anyway.'%child.tag)
-                chunks.append(Chunk(raw))
+                chunks.append(Chunk(raw, aid))
                 if child.tail:
-                    chunks.extend(self.chunk_up_text(child.tail))
+                    chunks.extend(self.chunk_up_text(child.tail, aid))
             tag.remove(child)
 
         if len(chunks) <= first_chunk_idx and chunks:
@@ -242,8 +281,15 @@ class Chunker(object):
         if chunks:
             chunks[first_chunk_idx].starts_tags.append(aid)
             chunks[-1].ends_tags.append(aid)
+            my_chunks = chunks[first_chunk_idx:]
+            if my_chunks:
+                my_chunks[0].is_first_chunk = True
+                my_chunks[-1].is_last_chunk = True
+                if is_body:
+                    for chunk in my_chunks:
+                        chunk.parent_is_body = True
 
-    def chunk_up_text(self, text):
+    def chunk_up_text(self, text, parent_tag):
         text = text.encode('utf-8')
         ans = []
@@ -259,7 +305,7 @@ class Chunker(object):
         while rest:
             start, rest = split_multibyte_text(rest)
             ans.append(b'<span class="AmznBigTextBlock">' + start + '</span>')
-        return [Chunk(x) for x in ans]
+        return [Chunk(x, parent_tag) for x in ans]
 
     def merge_small_chunks(self, chunks):
         ans = chunks[:1]
@@ -275,6 +321,77 @@ class Chunker(object):
             prev.merge(chunk)
         return ans
 
+    def create_tables(self):
+        Skel = namedtuple('Skel',
+            'file_number name chunk_count start_pos length')
+        sp = 0
+        for s in self.skeletons:
+            s.start_pos = sp
+            sp += len(s)
+        # Note: the comprehension must iterate as s, not x, so that each row
+        # describes its own skeleton
+        self.skel_table = [Skel(s.file_number, 'SKEL%010d'%s.file_number,
+            len(s.chunks), s.start_pos, len(s.skeleton)) for s in self.skeletons]
+
+        Chunk = namedtuple('Chunk',
+            'insert_pos selector file_number sequence_number start_pos length')
+        num = cp = 0
+        for skel in self.skeletons:
+            cp = skel.start_pos
+            for chunk in skel.chunks:
+                self.chunk_table.append(
+                    Chunk(chunk.insert_pos + skel.start_pos, chunk.selector,
+                        skel.file_number, num, cp, len(chunk.raw)))
+                cp += len(chunk.raw)
+                num += 1
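
As a rough sketch of the resulting rows (hypothetical sizes: skeleton 0 is 100 bytes of skeleton markup plus one 20-byte chunk, so skeleton 1 starts at offset 120):

    from collections import namedtuple
    Skel = namedtuple('Skel', 'file_number name chunk_count start_pos length')
    print(Skel(0, 'SKEL%010d' % 0, 1, 0, 100))
    # -> Skel(file_number=0, name='SKEL0000000000', chunk_count=1, start_pos=0, length=100)
    print(Skel(1, 'SKEL%010d' % 1, 0, 120, 80))
    # -> Skel(file_number=1, name='SKEL0000000001', chunk_count=0, start_pos=120, length=80)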
+    def set_internal_links(self, text):
+        # First find the start pos of all tags with aids
+        aid_map = {}
+        for match in re.finditer(br'<[^>]+? aid=[\'"]([A-Z0-9]+)[\'"]', text):
+            aid_map[match.group(1)] = match.start()
+        self.aid_offset_map = aid_map
+        placeholder_map = {bytes(k):bytes(to_href(aid_map[v])) for k, v in
+            self.placeholder_map.iteritems()}
+
+        # Now update the links
+        def sub(match):
+            raw = match.group()
+            pl = match.group(1)
+            try:
+                return raw[:-10] + placeholder_map[pl]
+            except KeyError:
+                pass
+            return raw
+
+        # The pattern must include the ':off:' separator and allow base-32
+        # digits (0-9A-V) in the offset, so that it matches the placeholders
+        # generated in writer8/main.py
+        return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:off:[0-9A-V]{10})',
+            sub, text)
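
A self-contained sketch of the rewrite step (not from the commit), with a hypothetical placeholder that resolves to byte offset 1234, i.e. to_href(1234) == '000000016I' under the assumed digit alphabet:

    import re

    placeholder_map = {b'kindle:pos:fid:0000:off:0000000000': b'000000016I'}
    text = b'<a href="kindle:pos:fid:0000:off:0000000000">x</a>'

    def sub(m):
        # Swap the trailing 10 offset digits for the real aid offset
        return m.group()[:-10] + placeholder_map[m.group(1)]

    print(re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:off:[0-9A-V]{10})', sub, text))
    # -> b'<a href="kindle:pos:fid:0000:off:000000016I">x</a>'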
+    def create_text_records(self, text):
+        self.text_length = len(text)
+        text = BytesIO(text)
+        nrecords = 0
+        records_size = 0
+
+        if self.compress:
+            self.oeb.logger.info(' Compressing markup content...')
+
+        while text.tell() < self.text_length:
+            data, overlap = create_text_record(text)
+            if self.compress:
+                data = compress_doc(data)
+
+            data += overlap
+            data += pack(b'>B', len(overlap))
+
+            self.records.append(data)
+            records_size += len(data)
+            nrecords += 1
+
+        self.last_text_record_idx = nrecords
+        self.first_non_text_record_idx = nrecords + 1
+        # Pad so that the next record starts at a 4 byte boundary
+        if records_size % 4 != 0:
+            self.records.append(b'\x00'*(records_size % 4))
+            self.first_non_text_record_idx += 1
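
Each text record therefore ends with the overlap bytes followed by a single byte giving the overlap length, which lets a reader stitch split multibyte characters back together. The layout, with hypothetical sizes:

    from struct import pack

    data, overlap = b'x' * 4096, b'\x82\xac'  # 2 overlap bytes from the next record
    record = data + overlap + pack(b'>B', len(overlap))
    print(len(record))   # -> 4099
    print(record[-3:])   # -> b'\x82\xac\x02'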
 
     def dump(self):
         import tempfile, shutil, os
         tdir = os.path.join(tempfile.gettempdir(), 'skeleton')
@@ -291,3 +408,4 @@ class Chunker(object):
         with open(os.path.join(rebuilt, '%04d.html'%i), 'wb') as f:
             f.write(skeleton.rebuild())