This commit is contained in:
Kovid Goyal 2012-04-20 20:04:13 +05:30
parent fcbef661ca
commit 7800024bac
2 changed files with 57 additions and 46 deletions

View File

@ -9,13 +9,16 @@ __docformat__ = 'restructuredtext en'
import copy import copy
from functools import partial from functools import partial
from collections import defaultdict from collections import defaultdict, namedtuple
from io import BytesIO
from struct import pack
import cssutils import cssutils
from lxml import etree from lxml import etree
from calibre import isbytestring, force_unicode from calibre import isbytestring, force_unicode
from calibre.ebooks.mobi.utils import to_base from calibre.ebooks.mobi.utils import create_text_record, to_base
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath, from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
extract, XHTML, urlnormalize) extract, XHTML, urlnormalize)
from calibre.ebooks.oeb.parse_utils import barename from calibre.ebooks.oeb.parse_utils import barename
@ -31,11 +34,14 @@ class KF8Writer(object):
def __init__(self, oeb, opts, resources): def __init__(self, oeb, opts, resources):
self.oeb, self.opts, self.log = oeb, opts, oeb.log self.oeb, self.opts, self.log = oeb, opts, oeb.log
self.compress = not self.opts.dont_compress
self.log.info('Creating KF8 output') self.log.info('Creating KF8 output')
self.used_images = set() self.used_images = set()
self.resources = resources self.resources = resources
self.dup_data() self.dup_data()
self.flows = [None] # First flow item is reserved for the text self.flows = [None] # First flow item is reserved for the text
self.records = []
self.fdst_table = []
self.replace_resource_links() self.replace_resource_links()
self.extract_css_into_flows() self.extract_css_into_flows()
@ -43,6 +49,10 @@ class KF8Writer(object):
self.replace_internal_links_with_placeholders() self.replace_internal_links_with_placeholders()
self.insert_aid_attributes() self.insert_aid_attributes()
self.chunk_it_up() self.chunk_it_up()
# Dump the cloned data as it is no longer needed
del self._data_cache
self.create_text_records()
self.create_fdst_table()
def dup_data(self): def dup_data(self):
''' Duplicate data so that any changes we make to markup/CSS only ''' Duplicate data so that any changes we make to markup/CSS only
@ -205,12 +215,49 @@ class KF8Writer(object):
if aid is None: if aid is None:
aid = self.id_map.get((href, '')) aid = self.id_map.get((href, ''))
placeholder_map[placeholder] = aid placeholder_map[placeholder] = aid
chunker = Chunker(self.oeb, self.data, not self.opts.dont_compress, chunker = Chunker(self.oeb, self.data, placeholder_map)
placeholder_map)
for x in ('skel_table', 'chunk_table', 'aid_offset_map', 'records', for x in ('skel_table', 'chunk_table', 'aid_offset_map'):
'last_text_record_idx', 'first_non_text_record_idx',
'text_length'):
setattr(self, x, getattr(chunker, x)) setattr(self, x, getattr(chunker, x))
self.flows[0] = chunker.text
def create_text_records(self):
    '''Serialize all flows into MOBI text records.

    Joins self.flows into one byte stream, splits it into fixed-size
    text records (optionally PalmDoc-compressed), appends each record's
    multibyte overlap bytes plus a one-byte overlap count, and records
    the resulting indices/sizes on self for the MOBI header.
    '''
    # Normalize every flow to bytes; markup flows may still be unicode
    # at this point (py2 text type).
    self.flows = [x.encode('utf-8') if isinstance(x, unicode) else x for x
            in self.flows]
    text = b''.join(self.flows)
    self.text_length = len(text)
    text = BytesIO(text)
    nrecords = 0
    records_size = 0
    if self.compress:
        self.oeb.logger.info(' Compressing markup content...')
    while text.tell() < self.text_length:
        # One record's worth of text plus the multibyte-character
        # overlap that spills into the next record.
        data, overlap = create_text_record(text)
        if self.compress:
            data = compress_doc(data)
        # Record layout: payload, overlap bytes, one-byte overlap count.
        data += overlap
        data += pack(b'>B', len(overlap))
        self.records.append(data)
        records_size += len(data)
        nrecords += 1
    self.last_text_record_idx = nrecords
    self.first_non_text_record_idx = nrecords + 1
    # Pad so that the next record starts at a 4 byte boundary.
    # FIX: pad by the complement of the remainder; padding by the
    # remainder itself (the old code) only aligned when it was 2.
    if records_size % 4 != 0:
        self.records.append(b'\x00' * (4 - records_size % 4))
        self.first_non_text_record_idx += 1
def create_fdst_table(self):
    '''Fill self.fdst_table with (start, end) byte offsets, one entry
    per flow, laid out contiguously in flow order starting at 0.'''
    Flow = namedtuple('Flow', 'start end')
    offset = 0
    for flow in self.flows:
        nxt = offset + len(flow)
        self.fdst_table.append(Flow(offset, nxt))
        offset = nxt

View File

@ -9,16 +9,13 @@ __docformat__ = 'restructuredtext en'
import re import re
from collections import namedtuple from collections import namedtuple
from io import BytesIO
from struct import pack
from functools import partial from functools import partial
from lxml import etree from lxml import etree
from calibre.ebooks.oeb.base import XHTML_NS from calibre.ebooks.oeb.base import XHTML_NS
from calibre.constants import ispy3 from calibre.constants import ispy3
from calibre.ebooks.mobi.utils import create_text_record, to_base from calibre.ebooks.mobi.utils import to_base
from calibre.ebooks.compression.palmdoc import compress_doc
CHUNK_SIZE = 8192 CHUNK_SIZE = 8192
@ -159,10 +156,9 @@ class Skeleton(object):
class Chunker(object): class Chunker(object):
def __init__(self, oeb, data_func, compress, placeholder_map): def __init__(self, oeb, data_func, placeholder_map):
self.oeb, self.log = oeb, oeb.log self.oeb, self.log = oeb, oeb.log
self.data = data_func self.data = data_func
self.compress = compress
self.placeholder_map = placeholder_map self.placeholder_map = placeholder_map
self.skeletons = [] self.skeletons = []
@ -207,11 +203,7 @@ class Chunker(object):
# Set internal links # Set internal links
text = b''.join(x.raw_text for x in self.skeletons) text = b''.join(x.raw_text for x in self.skeletons)
text = self.set_internal_links(text) self.text = self.set_internal_links(text)
# Create text records
self.records = []
self.create_text_records(text)
def remove_namespaces(self, root): def remove_namespaces(self, root):
lang = None lang = None
@ -364,34 +356,6 @@ class Chunker(object):
return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text) return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text)
def create_text_records(self, text):
    # Split *text* (bytes) into MOBI text records, optionally
    # PalmDoc-compressing each one, and record index bookkeeping on self.
    self.text_length = len(text)
    text = BytesIO(text)
    nrecords = 0
    records_size = 0
    if self.compress:
        self.oeb.logger.info(' Compressing markup content...')
    while text.tell() < self.text_length:
        # One record's worth of text plus the multibyte-character
        # overlap spilling into the next record.
        data, overlap = create_text_record(text)
        if self.compress:
            data = compress_doc(data)
        # Trailing overlap bytes followed by a one-byte count of them.
        data += overlap
        data += pack(b'>B', len(overlap))
        self.records.append(data)
        records_size += len(data)
        nrecords += 1
    self.last_text_record_idx = nrecords
    self.first_non_text_record_idx = nrecords + 1
    # Pad so that the next records starts at a 4 byte boundary
    # NOTE(review): padding by records_size % 4 only reaches a 4-byte
    # boundary when the remainder is 2; (4 - remainder) looks intended —
    # confirm against the record consumers before changing.
    if records_size % 4 != 0:
        self.records.append(b'\x00'*(records_size % 4))
        self.first_non_text_record_idx += 1
def dump(self): def dump(self):
import tempfile, shutil, os import tempfile, shutil, os
tdir = os.path.join(tempfile.gettempdir(), 'skeleton') tdir = os.path.join(tempfile.gettempdir(), 'skeleton')