From 13abe2bb6efb537bd2b5d404a7eda1c81ce80b1f Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 20 Apr 2012 18:49:22 +0530 Subject: [PATCH] KF8 Output: Text processing layer is complete --- src/calibre/ebooks/mobi/utils.py | 51 ++++++++ src/calibre/ebooks/mobi/writer2/__init__.py | 1 - src/calibre/ebooks/mobi/writer2/indexer.py | 3 +- src/calibre/ebooks/mobi/writer2/main.py | 61 +-------- src/calibre/ebooks/mobi/writer8/main.py | 25 +++- src/calibre/ebooks/mobi/writer8/skeleton.py | 136 ++++++++++++++++++-- 6 files changed, 201 insertions(+), 76 deletions(-) diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index 0ae992f438..fe5cd7eaf2 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -14,6 +14,7 @@ from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail from calibre.ebooks import normalize IMAGE_MAX_SIZE = 10 * 1024 * 1024 +RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed)) def decode_string(raw, codec='utf-8', ordt_map=''): length, = struct.unpack(b'>B', raw[0]) @@ -498,3 +499,53 @@ def write_font_record(data, obfuscate=True, compress=True): # }}} +def create_text_record(text): + ''' + Return a Palmdoc record of size RECORD_SIZE from the text file object. + In case the record ends in the middle of a multibyte character return + the overlap as well. + + Returns data, overlap: where both are byte strings. overlap is the + extra bytes needed to complete the truncated multibyte character. + ''' + opos = text.tell() + text.seek(0, 2) + # npos is the position of the next record + npos = min((opos + RECORD_SIZE, text.tell())) + # Number of bytes from the next record needed to complete the last + # character in this record + extra = 0 + + last = b'' + while not last.decode('utf-8', 'ignore'): + # last contains no valid utf-8 characters + size = len(last) + 1 + text.seek(npos - size) + last = text.read(size) + + # last now has one valid utf-8 char and possibly some bytes that belong + # to a truncated char + + try: + last.decode('utf-8', 'strict') + except UnicodeDecodeError: + # There are some truncated bytes in last + prev = len(last) + while True: + text.seek(npos - prev) + last = text.read(len(last) + 1) + try: + last.decode('utf-8') + except UnicodeDecodeError: + pass + else: + break + extra = len(last) - prev + + text.seek(opos) + data = text.read(RECORD_SIZE) + overlap = text.read(extra) + text.seek(npos) + + return data, overlap + diff --git a/src/calibre/ebooks/mobi/writer2/__init__.py b/src/calibre/ebooks/mobi/writer2/__init__.py index bc8dbbf7de..df3dcefb94 100644 --- a/src/calibre/ebooks/mobi/writer2/__init__.py +++ b/src/calibre/ebooks/mobi/writer2/__init__.py @@ -12,5 +12,4 @@ UNCOMPRESSED = 1 PALMDOC = 2 HUFFDIC = 17480 PALM_MAX_IMAGE_SIZE = 63 * 1024 -RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed)) diff --git a/src/calibre/ebooks/mobi/writer2/indexer.py b/src/calibre/ebooks/mobi/writer2/indexer.py index e349172d95..134fbadc60 100644 --- a/src/calibre/ebooks/mobi/writer2/indexer.py +++ b/src/calibre/ebooks/mobi/writer2/indexer.py @@ -12,9 +12,8 @@ from struct import pack from cStringIO import StringIO from collections import OrderedDict, defaultdict -from calibre.ebooks.mobi.writer2 import RECORD_SIZE from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex, - encode_tbs, align_block, utf8_text) + encode_tbs, align_block, utf8_text, RECORD_SIZE) class CNCX(object): # {{{ diff --git a/src/calibre/ebooks/mobi/writer2/main.py b/src/calibre/ebooks/mobi/writer2/main.py index b7a0d76424..c930609489 100644 --- a/src/calibre/ebooks/mobi/writer2/main.py +++ b/src/calibre/ebooks/mobi/writer2/main.py @@ -16,9 +16,9 @@ from calibre.ebooks.mobi.writer2.serializer import Serializer from calibre.ebooks.compression.palmdoc import compress_doc from calibre.ebooks.mobi.langcodes import iana2mobi from calibre.utils.filenames import ascii_filename -from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED, RECORD_SIZE) +from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED) from calibre.ebooks.mobi.utils import (encint, encode_trailing_data, - align_block, detect_periodical) + align_block, detect_periodical, RECORD_SIZE, create_text_record) from calibre.ebooks.mobi.writer2.indexer import Indexer EXTH_CODES = { @@ -163,9 +163,7 @@ class MobiWriter(object): # }}} - # Text {{{ - - def generate_text(self): + def generate_text(self): # {{{ self.oeb.logger.info('Serializing markup content...') self.serializer = Serializer(self.oeb, self.image_map, self.is_periodical, @@ -180,7 +178,7 @@ class MobiWriter(object): self.oeb.logger.info(' Compressing markup content...') while text.tell() < self.text_length: - data, overlap = self.read_text_record(text) + data, overlap = create_text_record(text) if self.compression == PALMDOC: data = compress_doc(data) @@ -197,57 +195,6 @@ class MobiWriter(object): if records_size % 4 != 0: self.records.append(b'\x00'*(records_size % 4)) self.first_non_text_record_idx += 1 - - def read_text_record(self, text): - ''' - Return a Palmdoc record of size RECORD_SIZE from the text file object. - In case the record ends in the middle of a multibyte character return - the overlap as well. - - Returns data, overlap: where both are byte strings. overlap is the - extra bytes needed to complete the truncated multibyte character. - ''' - opos = text.tell() - text.seek(0, 2) - # npos is the position of the next record - npos = min((opos + RECORD_SIZE, text.tell())) - # Number of bytes from the next record needed to complete the last - # character in this record - extra = 0 - - last = b'' - while not last.decode('utf-8', 'ignore'): - # last contains no valid utf-8 characters - size = len(last) + 1 - text.seek(npos - size) - last = text.read(size) - - # last now has one valid utf-8 char and possibly some bytes that belong - # to a truncated char - - try: - last.decode('utf-8', 'strict') - except UnicodeDecodeError: - # There are some truncated bytes in last - prev = len(last) - while True: - text.seek(npos - prev) - last = text.read(len(last) + 1) - try: - last.decode('utf-8') - except UnicodeDecodeError: - pass - else: - break - extra = len(last) - prev - - text.seek(opos) - data = text.read(RECORD_SIZE) - overlap = text.read(extra) - text.seek(npos) - - return data, overlap - # }}} def generate_record0(self): # MOBI header {{{ diff --git a/src/calibre/ebooks/mobi/writer8/main.py b/src/calibre/ebooks/mobi/writer8/main.py index 79ff7c3d96..b924a4df7c 100644 --- a/src/calibre/ebooks/mobi/writer8/main.py +++ b/src/calibre/ebooks/mobi/writer8/main.py @@ -19,15 +19,13 @@ from calibre.ebooks.mobi.utils import to_base from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath, extract, XHTML, urlnormalize) from calibre.ebooks.oeb.parse_utils import barename -from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags +from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href XML_DOCS = OEB_DOCS | {SVG_MIME} # References to record numbers in KF8 are stored as base-32 encoded integers, # with 4 digits to_ref = partial(to_base, base=32, min_num_digits=4) -# References in links are stored with 10 digits -to_href = partial(to_base, base=32, min_num_digits=10) class KF8Writer(object): @@ -167,7 +165,7 @@ class KF8Writer(object): self.link_map = {} count = 0 hrefs = {item.href for item in self.oeb.spine} - for item in self.oeb.spine: + for i, item in enumerate(self.oeb.spine): root = self.data(item) for a in XPath('//h:a[@href]')(root): @@ -176,7 +174,8 @@ class KF8Writer(object): href, _, frag = ref.partition('#') href = urlnormalize(href) if href in hrefs: - placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count) + placeholder = 'kindle:pos:fid:%04d:off:%s'%(i, + to_href(count)) self.link_map[placeholder] = (href, frag) a.set('href', placeholder) @@ -199,7 +198,19 @@ class KF8Writer(object): j += 1 def chunk_it_up(self): - chunker = Chunker(self.oeb, self.data) - chunker + placeholder_map = {} + for placeholder, x in self.link_map.iteritems(): + href, frag = x + aid = self.id_map.get(x, None) + if aid is None: + aid = self.id_map.get((href, '')) + placeholder_map[placeholder] = aid + chunker = Chunker(self.oeb, self.data, not self.opts.dont_compress, + placeholder_map) + + for x in ('skel_table', 'chunk_table', 'aid_offset_map', 'records', + 'last_text_record_idx', 'first_non_text_record_idx', + 'text_length'): + setattr(self, x, getattr(chunker, x)) diff --git a/src/calibre/ebooks/mobi/writer8/skeleton.py b/src/calibre/ebooks/mobi/writer8/skeleton.py index 201d2b63d4..da3b9407bd 100644 --- a/src/calibre/ebooks/mobi/writer8/skeleton.py +++ b/src/calibre/ebooks/mobi/writer8/skeleton.py @@ -9,14 +9,22 @@ __docformat__ = 'restructuredtext en' import re from collections import namedtuple +from io import BytesIO +from struct import pack +from functools import partial from lxml import etree from calibre.ebooks.oeb.base import XHTML_NS from calibre.constants import ispy3 +from calibre.ebooks.mobi.utils import create_text_record, to_base +from calibre.ebooks.compression.palmdoc import compress_doc CHUNK_SIZE = 8192 +# References in links are stored with 10 digits +to_href = partial(to_base, base=32, min_num_digits=10) + # Tags to which kindlegen adds the aid attribute aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b', 'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details', @@ -70,11 +78,15 @@ def tostring(raw, **kwargs): class Chunk(object): - def __init__(self, raw): + def __init__(self, raw, parent_tag): self.raw = raw self.starts_tags = [] self.ends_tags = [] self.insert_pos = None + self.parent_tag = parent_tag + self.parent_is_body = False + self.is_last_chunk = False + self.is_first_chunk = False def __len__(self): return len(self.raw) @@ -87,6 +99,11 @@ class Chunk(object): return 'Chunk(len=%r insert_pos=%r starts_tags=%r ends_tags=%r)'%( len(self.raw), self.insert_pos, self.starts_tags, self.ends_tags) + @property + def selector(self): + typ = 'S' if (self.is_last_chunk and not self.parent_is_body) else 'P' + return "%s-//*[@aid='%s']"%(typ, self.parent_tag) + __str__ = __repr__ class Skeleton(object): @@ -133,11 +150,20 @@ class Skeleton(object): ans = ans[:i] + chunk.raw + ans[i:] return ans + def __len__(self): + return len(self.skeleton) + sum([len(x.raw) for x in self.chunks]) + + @property + def raw_text(self): + return b''.join([self.skeleton] + [x.raw for x in self.chunks]) + class Chunker(object): - def __init__(self, oeb, data_func): + def __init__(self, oeb, data_func, compress, placeholder_map): self.oeb, self.log = oeb, oeb.log self.data = data_func + self.compress = compress + self.placeholder_map = placeholder_map self.skeletons = [] @@ -174,6 +200,19 @@ class Chunker(object): if self.orig_dumps: self.dump() + # Create the SKEL and Chunk tables + self.skel_table = [] + self.chunk_table = [] + self.create_tables() + + # Set internal links + text = b''.join(x.raw_text for x in self.skeletons) + text = self.set_internal_links(text) + + # Create text records + self.records = [] + self.create_text_records(text) + def remove_namespaces(self, root): lang = None for attr, val in root.attrib.iteritems(): @@ -206,15 +245,15 @@ class Chunker(object): return nroot - def step_into_tag(self, tag, chunks): aid = tag.get('aid') + is_body = tag.tag == 'body' first_chunk_idx = len(chunks) # First handle any text if tag.text and tag.text.strip(): # Leave pure whitespace in the skel - chunks.extend(self.chunk_up_text(tag.text)) + chunks.extend(self.chunk_up_text(tag.text, aid)) tag.text = None # Now loop over children @@ -224,15 +263,15 @@ class Chunker(object): if len(raw) > CHUNK_SIZE and child.get('aid', None): self.step_into_tag(child, chunks) if child.tail and child.tail.strip(): # Leave pure whitespace - chunks.extend(self.chunk_up_text(child.tail)) + chunks.extend(self.chunk_up_text(child.tail, aid)) child.tail = None else: if len(raw) > CHUNK_SIZE: self.log.warn('Tag %s has no aid and a too large chunk' ' size. Adding anyway.'%child.tag) - chunks.append(Chunk(raw)) + chunks.append(Chunk(raw, aid)) if child.tail: - chunks.extend(self.chunk_up_text(child.tail)) + chunks.extend(self.chunk_up_text(child.tail, aid)) tag.remove(child) if len(chunks) <= first_chunk_idx and chunks: @@ -242,8 +281,15 @@ class Chunker(object): if chunks: chunks[first_chunk_idx].starts_tags.append(aid) chunks[-1].ends_tags.append(aid) + my_chunks = chunks[first_chunk_idx:] + if my_chunks: + my_chunks[0].is_first_chunk = True + my_chunks[-1].is_last_chunk = True + if is_body: + for chunk in my_chunks: + chunk.parent_is_body = True - def chunk_up_text(self, text): + def chunk_up_text(self, text, parent_tag): text = text.encode('utf-8') ans = [] @@ -259,7 +305,7 @@ class Chunker(object): while rest: start, rest = split_multibyte_text(rest) ans.append(b'' + start + '') - return [Chunk(x) for x in ans] + return [Chunk(x, parent_tag) for x in ans] def merge_small_chunks(self, chunks): ans = chunks[:1] @@ -275,6 +321,77 @@ class Chunker(object): prev.merge(chunk) return ans + def create_tables(self): + Skel = namedtuple('Skel', + 'file_number name chunk_count start_pos length') + sp = 0 + for s in self.skeletons: + s.start_pos = sp + sp += len(s) + self.skel_table = [Skel(s.file_number, 'SKEL%010d'%s.file_number, + len(s.chunks), s.start_pos, len(s.skeleton)) for x in self.skeletons] + + Chunk = namedtuple('Chunk', + 'insert_pos selector file_number sequence_number start_pos length') + num = cp = 0 + for skel in self.skeletons: + cp = skel.start_pos + for chunk in skel.chunks: + self.chunk_table.append( + Chunk(chunk.insert_pos + skel.start_pos, chunk.selector, + skel.file_number, num, cp, len(chunk.raw))) + cp += len(chunk.raw) + num += 1 + + def set_internal_links(self, text): + # First find the start pos of all tags with aids + aid_map = {} + for match in re.finditer(br'<[^>]+? aid=[\'"]([A-Z0-9]+)[\'"]', text): + aid_map[match.group(1)] = match.start() + self.aid_offset_map = aid_map + placeholder_map = {bytes(k):bytes(to_href(aid_map[v])) for k, v in + self.placeholder_map.iteritems()} + + # Now update the links + def sub(match): + raw = match.group() + pl = match.group(1) + try: + return raw[:-10] + placeholder_map[pl] + except KeyError: + pass + return raw + + return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text) + + def create_text_records(self, text): + self.text_length = len(text) + text = BytesIO(text) + nrecords = 0 + records_size = 0 + + if self.compress: + self.oeb.logger.info(' Compressing markup content...') + + while text.tell() < self.text_length: + data, overlap = create_text_record(text) + if self.compress: + data = compress_doc(data) + + data += overlap + data += pack(b'>B', len(overlap)) + + self.records.append(data) + records_size += len(data) + nrecords += 1 + + self.last_text_record_idx = nrecords + self.first_non_text_record_idx = nrecords + 1 + # Pad so that the next records starts at a 4 byte boundary + if records_size % 4 != 0: + self.records.append(b'\x00'*(records_size % 4)) + self.first_non_text_record_idx += 1 + def dump(self): import tempfile, shutil, os tdir = os.path.join(tempfile.gettempdir(), 'skeleton') @@ -291,3 +408,4 @@ class Chunker(object): with open(os.path.join(rebuilt, '%04d.html'%i), 'wb') as f: f.write(skeleton.rebuild()) +