mirror of https://github.com/kovidgoyal/calibre.git

commit 13abe2bb6e
parent 4b93ebc990

    KF8 Output: Text processing layer is complete
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -14,6 +14,7 @@ from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
 from calibre.ebooks import normalize
 
 IMAGE_MAX_SIZE = 10 * 1024 * 1024
+RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
 
 def decode_string(raw, codec='utf-8', ordt_map=''):
     length, = struct.unpack(b'>B', raw[0])
@@ -498,3 +499,53 @@ def write_font_record(data, obfuscate=True, compress=True):
 
 # }}}
 
+
+def create_text_record(text):
+    '''
+    Return a Palmdoc record of size RECORD_SIZE from the text file object.
+    In case the record ends in the middle of a multibyte character return
+    the overlap as well.
+
+    Returns data, overlap: where both are byte strings. overlap is the
+    extra bytes needed to complete the truncated multibyte character.
+    '''
+    opos = text.tell()
+    text.seek(0, 2)
+    # npos is the position of the next record
+    npos = min((opos + RECORD_SIZE, text.tell()))
+    # Number of bytes from the next record needed to complete the last
+    # character in this record
+    extra = 0
+
+    last = b''
+    while not last.decode('utf-8', 'ignore'):
+        # last contains no valid utf-8 characters
+        size = len(last) + 1
+        text.seek(npos - size)
+        last = text.read(size)
+
+    # last now has one valid utf-8 char and possibly some bytes that belong
+    # to a truncated char
+
+    try:
+        last.decode('utf-8', 'strict')
+    except UnicodeDecodeError:
+        # There are some truncated bytes in last
+        prev = len(last)
+        while True:
+            text.seek(npos - prev)
+            last = text.read(len(last) + 1)
+            try:
+                last.decode('utf-8')
+            except UnicodeDecodeError:
+                pass
+            else:
+                break
+        extra = len(last) - prev
+
+    text.seek(opos)
+    data = text.read(RECORD_SIZE)
+    overlap = text.read(extra)
+    text.seek(npos)
+
+    return data, overlap
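
The overlap returned above is what keeps each text record independently decodable: when a record boundary falls inside a UTF-8 multibyte character, the bytes that complete the character are handed back separately so the writer can append them as trailing data (see create_text_records further down in this commit). A minimal sketch of calling create_text_record directly, with hypothetical demo data, assuming the import path added here:

    from io import BytesIO
    from calibre.ebooks.mobi.utils import create_text_record

    # The Euro sign is 3 bytes in UTF-8, so a 4096-byte record cannot end
    # on a character boundary.
    text = BytesIO(u'\u20ac'.encode('utf-8') * 3000)  # 9000 bytes

    data, overlap = create_text_record(text)
    # data is 4096 bytes and ends one byte into a character, so
    # data.decode('utf-8') would raise UnicodeDecodeError, while
    (data + overlap).decode('utf-8')  # decodes cleanly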
--- a/src/calibre/ebooks/mobi/writer2/__init__.py
+++ b/src/calibre/ebooks/mobi/writer2/__init__.py
@@ -12,5 +12,4 @@ UNCOMPRESSED = 1
 PALMDOC = 2
 HUFFDIC = 17480
 PALM_MAX_IMAGE_SIZE = 63 * 1024
-RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
 
--- a/src/calibre/ebooks/mobi/writer2/indexer.py
+++ b/src/calibre/ebooks/mobi/writer2/indexer.py
@@ -12,9 +12,8 @@ from struct import pack
 from cStringIO import StringIO
 from collections import OrderedDict, defaultdict
 
-from calibre.ebooks.mobi.writer2 import RECORD_SIZE
 from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
-        encode_tbs, align_block, utf8_text)
+        encode_tbs, align_block, utf8_text, RECORD_SIZE)
 
 class CNCX(object): # {{{
--- a/src/calibre/ebooks/mobi/writer2/main.py
+++ b/src/calibre/ebooks/mobi/writer2/main.py
@@ -16,9 +16,9 @@ from calibre.ebooks.mobi.writer2.serializer import Serializer
 from calibre.ebooks.compression.palmdoc import compress_doc
 from calibre.ebooks.mobi.langcodes import iana2mobi
 from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED, RECORD_SIZE)
+from calibre.ebooks.mobi.writer2 import (PALMDOC, UNCOMPRESSED)
 from calibre.ebooks.mobi.utils import (encint, encode_trailing_data,
-        align_block, detect_periodical)
+        align_block, detect_periodical, RECORD_SIZE, create_text_record)
 from calibre.ebooks.mobi.writer2.indexer import Indexer
 
 EXTH_CODES = {
@@ -163,9 +163,7 @@ class MobiWriter(object):
 
     # }}}
 
-    # Text {{{
-
-    def generate_text(self):
+    def generate_text(self): # {{{
         self.oeb.logger.info('Serializing markup content...')
         self.serializer = Serializer(self.oeb, self.image_map,
                 self.is_periodical,
@@ -180,7 +178,7 @@ class MobiWriter(object):
         self.oeb.logger.info('  Compressing markup content...')
 
         while text.tell() < self.text_length:
-            data, overlap = self.read_text_record(text)
+            data, overlap = create_text_record(text)
             if self.compression == PALMDOC:
                 data = compress_doc(data)
 
@@ -197,57 +195,6 @@ class MobiWriter(object):
         if records_size % 4 != 0:
             self.records.append(b'\x00'*(records_size % 4))
             self.first_non_text_record_idx += 1
 
-    def read_text_record(self, text):
-        '''
-        Return a Palmdoc record of size RECORD_SIZE from the text file object.
-        In case the record ends in the middle of a multibyte character return
-        the overlap as well.
-
-        Returns data, overlap: where both are byte strings. overlap is the
-        extra bytes needed to complete the truncated multibyte character.
-        '''
-        opos = text.tell()
-        text.seek(0, 2)
-        # npos is the position of the next record
-        npos = min((opos + RECORD_SIZE, text.tell()))
-        # Number of bytes from the next record needed to complete the last
-        # character in this record
-        extra = 0
-
-        last = b''
-        while not last.decode('utf-8', 'ignore'):
-            # last contains no valid utf-8 characters
-            size = len(last) + 1
-            text.seek(npos - size)
-            last = text.read(size)
-
-        # last now has one valid utf-8 char and possibly some bytes that belong
-        # to a truncated char
-
-        try:
-            last.decode('utf-8', 'strict')
-        except UnicodeDecodeError:
-            # There are some truncated bytes in last
-            prev = len(last)
-            while True:
-                text.seek(npos - prev)
-                last = text.read(len(last) + 1)
-                try:
-                    last.decode('utf-8')
-                except UnicodeDecodeError:
-                    pass
-                else:
-                    break
-            extra = len(last) - prev
-
-        text.seek(opos)
-        data = text.read(RECORD_SIZE)
-        overlap = text.read(extra)
-        text.seek(npos)
-
-        return data, overlap
-
     # }}}
 
     def generate_record0(self): # MOBI header {{{
--- a/src/calibre/ebooks/mobi/writer8/main.py
+++ b/src/calibre/ebooks/mobi/writer8/main.py
@@ -19,15 +19,13 @@ from calibre.ebooks.mobi.utils import to_base
 from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
         extract, XHTML, urlnormalize)
 from calibre.ebooks.oeb.parse_utils import barename
-from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags
+from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
 
 XML_DOCS = OEB_DOCS | {SVG_MIME}
 
 # References to record numbers in KF8 are stored as base-32 encoded integers,
 # with 4 digits
 to_ref = partial(to_base, base=32, min_num_digits=4)
-# References in links are stored with 10 digits
-to_href = partial(to_base, base=32, min_num_digits=10)
 
 class KF8Writer(object):
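
to_ref and to_href both lean on to_base from calibre.ebooks.mobi.utils. A rough sketch of the behaviour being relied on here, not the real implementation: digits run 0-9 then A-V for base 32, zero-padded on the left.

    import string

    def to_base_sketch(num, base=32, min_num_digits=None):
        # illustrative only; the real to_base lives in calibre.ebooks.mobi.utils
        digits = string.digits + string.ascii_uppercase
        ans = []
        while num:
            ans.append(digits[num % base])
            num //= base
        raw = ''.join(reversed(ans)) or '0'
        if min_num_digits is not None:
            raw = raw.rjust(min_num_digits, '0')
        return raw

    # to_ref(41)  -> '0019'        (4-digit record reference)
    # to_href(41) -> '0000000019'  (10-digit link offset)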
@@ -167,7 +165,7 @@ class KF8Writer(object):
         self.link_map = {}
         count = 0
         hrefs = {item.href for item in self.oeb.spine}
-        for item in self.oeb.spine:
+        for i, item in enumerate(self.oeb.spine):
             root = self.data(item)
 
             for a in XPath('//h:a[@href]')(root):
@@ -176,7 +174,8 @@ class KF8Writer(object):
                 href, _, frag = ref.partition('#')
                 href = urlnormalize(href)
                 if href in hrefs:
-                    placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count)
+                    placeholder = 'kindle:pos:fid:%04d:off:%s'%(i,
+                            to_href(count))
                     self.link_map[placeholder] = (href, frag)
                     a.set('href', placeholder)
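
The placeholder now carries the spine position of the target file as a 4-digit decimal fid plus a running link counter as a 10-digit base-32 off; set_internal_links in skeleton.py (below) later rewrites the tail of the placeholder to the real aid-based offset. With hypothetical values i == 2 and count == 41:

    placeholder = 'kindle:pos:fid:%04d:off:%s' % (2, to_href(41))
    # -> 'kindle:pos:fid:0002:off:0000000019'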
@@ -199,7 +198,19 @@ class KF8Writer(object):
             j += 1
 
     def chunk_it_up(self):
-        chunker = Chunker(self.oeb, self.data)
-        chunker
+        placeholder_map = {}
+        for placeholder, x in self.link_map.iteritems():
+            href, frag = x
+            aid = self.id_map.get(x, None)
+            if aid is None:
+                aid = self.id_map.get((href, ''))
+            placeholder_map[placeholder] = aid
+        chunker = Chunker(self.oeb, self.data, not self.opts.dont_compress,
+                placeholder_map)
+
+        for x in ('skel_table', 'chunk_table', 'aid_offset_map', 'records',
+                'last_text_record_idx', 'first_non_text_record_idx',
+                'text_length'):
+            setattr(self, x, getattr(chunker, x))
 
--- a/src/calibre/ebooks/mobi/writer8/skeleton.py
+++ b/src/calibre/ebooks/mobi/writer8/skeleton.py
@@ -9,14 +9,22 @@ __docformat__ = 'restructuredtext en'
 
 import re
 from collections import namedtuple
+from io import BytesIO
+from struct import pack
+from functools import partial
 
 from lxml import etree
 
 from calibre.ebooks.oeb.base import XHTML_NS
 from calibre.constants import ispy3
+from calibre.ebooks.mobi.utils import create_text_record, to_base
+from calibre.ebooks.compression.palmdoc import compress_doc
 
 CHUNK_SIZE = 8192
 
+# References in links are stored with 10 digits
+to_href = partial(to_base, base=32, min_num_digits=10)
+
 # Tags to which kindlegen adds the aid attribute
 aid_able_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
     'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details',
@@ -70,11 +78,15 @@ def tostring(raw, **kwargs):
 
 class Chunk(object):
 
-    def __init__(self, raw):
+    def __init__(self, raw, parent_tag):
         self.raw = raw
         self.starts_tags = []
         self.ends_tags = []
         self.insert_pos = None
+        self.parent_tag = parent_tag
+        self.parent_is_body = False
+        self.is_last_chunk = False
+        self.is_first_chunk = False
 
     def __len__(self):
         return len(self.raw)
@@ -87,6 +99,11 @@ class Chunk(object):
         return 'Chunk(len=%r insert_pos=%r starts_tags=%r ends_tags=%r)'%(
             len(self.raw), self.insert_pos, self.starts_tags, self.ends_tags)
 
+    @property
+    def selector(self):
+        typ = 'S' if (self.is_last_chunk and not self.parent_is_body) else 'P'
+        return "%s-//*[@aid='%s']"%(typ, self.parent_tag)
+
     __str__ = __repr__
 
 class Skeleton(object):
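
The selector property is what the chunk table (further down) stores to locate a chunk: an XPath-like expression keyed on the parent tag's aid, prefixed 'S' for the last chunk of a non-body parent and 'P' otherwise. For a hypothetical chunk:

    chunk = Chunk(b'<p>...</p>', 'C2')   # parent tag carries aid='C2'
    chunk.is_last_chunk = True           # normally set by step_into_tag
    chunk.selector                       # -> "S-//*[@aid='C2']"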
@@ -133,11 +150,20 @@ class Skeleton(object):
             ans = ans[:i] + chunk.raw + ans[i:]
         return ans
 
+    def __len__(self):
+        return len(self.skeleton) + sum([len(x.raw) for x in self.chunks])
+
+    @property
+    def raw_text(self):
+        return b''.join([self.skeleton] + [x.raw for x in self.chunks])
+
 class Chunker(object):
 
-    def __init__(self, oeb, data_func):
+    def __init__(self, oeb, data_func, compress, placeholder_map):
         self.oeb, self.log = oeb, oeb.log
         self.data = data_func
+        self.compress = compress
+        self.placeholder_map = placeholder_map
 
         self.skeletons = []
@@ -174,6 +200,19 @@ class Chunker(object):
         if self.orig_dumps:
             self.dump()
 
+        # Create the SKEL and Chunk tables
+        self.skel_table = []
+        self.chunk_table = []
+        self.create_tables()
+
+        # Set internal links
+        text = b''.join(x.raw_text for x in self.skeletons)
+        text = self.set_internal_links(text)
+
+        # Create text records
+        self.records = []
+        self.create_text_records(text)
+
     def remove_namespaces(self, root):
         lang = None
         for attr, val in root.attrib.iteritems():
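
With this, the Chunker constructor runs the whole KF8 text pipeline: chunk the markup, build the SKEL and chunk tables, resolve internal links, then cut the joined text into records. A sketch of how KF8Writer.chunk_it_up (earlier in this commit) consumes it, assuming oeb, data_func and placeholder_map are bound as they are there and compression is enabled:

    chunker = Chunker(oeb, data_func, True, placeholder_map)
    records = chunker.records          # trailing-data-encoded text records
    skel_table = chunker.skel_table    # one entry per skeleton (file)
    chunk_table = chunker.chunk_table  # one entry per chunk
    offsets = chunker.aid_offset_map   # aid -> byte offset into the text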
@@ -206,15 +245,15 @@ class Chunker(object):
 
         return nroot
 
-
     def step_into_tag(self, tag, chunks):
         aid = tag.get('aid')
+        is_body = tag.tag == 'body'
 
         first_chunk_idx = len(chunks)
 
         # First handle any text
         if tag.text and tag.text.strip(): # Leave pure whitespace in the skel
-            chunks.extend(self.chunk_up_text(tag.text))
+            chunks.extend(self.chunk_up_text(tag.text, aid))
             tag.text = None
 
         # Now loop over children
@@ -224,15 +263,15 @@ class Chunker(object):
             if len(raw) > CHUNK_SIZE and child.get('aid', None):
                 self.step_into_tag(child, chunks)
                 if child.tail and child.tail.strip(): # Leave pure whitespace
-                    chunks.extend(self.chunk_up_text(child.tail))
+                    chunks.extend(self.chunk_up_text(child.tail, aid))
                     child.tail = None
             else:
                 if len(raw) > CHUNK_SIZE:
                     self.log.warn('Tag %s has no aid and a too large chunk'
                             ' size. Adding anyway.'%child.tag)
-                chunks.append(Chunk(raw))
+                chunks.append(Chunk(raw, aid))
                 if child.tail:
-                    chunks.extend(self.chunk_up_text(child.tail))
+                    chunks.extend(self.chunk_up_text(child.tail, aid))
                 tag.remove(child)
 
         if len(chunks) <= first_chunk_idx and chunks:
@@ -242,8 +281,15 @@ class Chunker(object):
         if chunks:
             chunks[first_chunk_idx].starts_tags.append(aid)
             chunks[-1].ends_tags.append(aid)
+            my_chunks = chunks[first_chunk_idx:]
+            if my_chunks:
+                my_chunks[0].is_first_chunk = True
+                my_chunks[-1].is_last_chunk = True
+                if is_body:
+                    for chunk in my_chunks:
+                        chunk.parent_is_body = True
 
-    def chunk_up_text(self, text):
+    def chunk_up_text(self, text, parent_tag):
         text = text.encode('utf-8')
         ans = []
 
@@ -259,7 +305,7 @@ class Chunker(object):
         while rest:
             start, rest = split_multibyte_text(rest)
             ans.append(b'<span class="AmznBigTextBlock">' + start + '</span>')
-        return [Chunk(x) for x in ans]
+        return [Chunk(x, parent_tag) for x in ans]
 
     def merge_small_chunks(self, chunks):
         ans = chunks[:1]
@@ -275,6 +321,77 @@ class Chunker(object):
             prev.merge(chunk)
         return ans
 
+    def create_tables(self):
+        Skel = namedtuple('Skel',
+                'file_number name chunk_count start_pos length')
+        sp = 0
+        for s in self.skeletons:
+            s.start_pos = sp
+            sp += len(s)
+        self.skel_table = [Skel(s.file_number, 'SKEL%010d'%s.file_number,
+            len(s.chunks), s.start_pos, len(s.skeleton)) for s in self.skeletons]
+
+        Chunk = namedtuple('Chunk',
+            'insert_pos selector file_number sequence_number start_pos length')
+        num = cp = 0
+        for skel in self.skeletons:
+            cp = skel.start_pos
+            for chunk in skel.chunks:
+                self.chunk_table.append(
+                    Chunk(chunk.insert_pos + skel.start_pos, chunk.selector,
+                        skel.file_number, num, cp, len(chunk.raw)))
+                cp += len(chunk.raw)
+                num += 1
+
+    def set_internal_links(self, text):
+        # First find the start pos of all tags with aids
+        aid_map = {}
+        for match in re.finditer(br'<[^>]+? aid=[\'"]([A-Z0-9]+)[\'"]', text):
+            aid_map[match.group(1)] = match.start()
+        self.aid_offset_map = aid_map
+        placeholder_map = {bytes(k):bytes(to_href(aid_map[v])) for k, v in
+                self.placeholder_map.iteritems()}
+
+        # Now update the links
+        def sub(match):
+            raw = match.group()
+            pl = match.group(1)
+            try:
+                return raw[:-10] + placeholder_map[pl]
+            except KeyError:
+                pass
+            return raw
+
+        return re.sub(br'<[^>]+(kindle:pos:fid:\d{4}:\d{10})', sub, text)
+
+    def create_text_records(self, text):
+        self.text_length = len(text)
+        text = BytesIO(text)
+        nrecords = 0
+        records_size = 0
+
+        if self.compress:
+            self.oeb.logger.info('  Compressing markup content...')
+
+        while text.tell() < self.text_length:
+            data, overlap = create_text_record(text)
+            if self.compress:
+                data = compress_doc(data)
+
+            data += overlap
+            data += pack(b'>B', len(overlap))
+
+            self.records.append(data)
+            records_size += len(data)
+            nrecords += 1
+
+        self.last_text_record_idx = nrecords
+        self.first_non_text_record_idx = nrecords + 1
+        # Pad so that the next record starts at a 4 byte boundary
+        if records_size % 4 != 0:
+            self.records.append(b'\x00'*(records_size % 4))
+            self.first_non_text_record_idx += 1
+
     def dump(self):
         import tempfile, shutil, os
         tdir = os.path.join(tempfile.gettempdir(), 'skeleton')
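
Each record that create_text_records emits is data + overlap + a single byte giving the overlap length, following the MOBI trailing-data convention, so a reader peels the framing off from the end. A minimal sketch of undoing it for an uncompressed record, assuming the overlap entry is the only trailing data present:

    def split_record(record):
        # the last byte counts the overlap bytes that precede it
        n = ord(record[-1:])
        return record[:-(n + 1)], record[-(n + 1):-1]

    # split_record(data + overlap + pack(b'>B', len(overlap)))
    # returns (data, overlap)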
@@ -291,3 +408,4 @@ class Chunker(object):
         with open(os.path.join(rebuilt, '%04d.html'%i), 'wb') as f:
             f.write(skeleton.rebuild())
 
+