mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
AZW3 Output: Add support for converting documents with very large table of contents (with more than 2000 entries). Fixes #1250475 [epub conversion error to azw3 - multiple index](https://bugs.launchpad.net/calibre/+bug/1250475)
This commit is contained in:
parent
f4e1ea020d
commit
30bbd17481
@ -11,6 +11,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
from struct import pack
|
from struct import pack
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
from future_builtins import zip
|
||||||
|
|
||||||
from calibre.ebooks.mobi.utils import CNCX, encint, align_block
|
from calibre.ebooks.mobi.utils import CNCX, encint, align_block
|
||||||
from calibre.ebooks.mobi.writer8.header import Header
|
from calibre.ebooks.mobi.writer8.header import Header
|
||||||
@ -22,10 +23,10 @@ EndTagTable = TagMeta(('eof', 0, 0, 0, 1))
|
|||||||
|
|
||||||
# map of mask to number of shifts needed, works with 1 bit and two-bit wide masks
|
# map of mask to number of shifts needed, works with 1 bit and two-bit wide masks
|
||||||
# could also be extended to 4 bit wide ones as well
|
# could also be extended to 4 bit wide ones as well
|
||||||
mask_to_bit_shifts = { 1:0, 2:1, 3:0, 4:2, 8:3, 12:2, 16:4, 32:5, 48:4, 64:6,
|
mask_to_bit_shifts = {1:0, 2:1, 3:0, 4:2, 8:3, 12:2, 16:4, 32:5, 48:4, 64:6,
|
||||||
128:7, 192: 6 }
|
128:7, 192: 6}
|
||||||
|
|
||||||
class IndexHeader(Header): # {{{
|
class IndexHeader(Header): # {{{
|
||||||
|
|
||||||
HEADER_NAME = b'INDX'
|
HEADER_NAME = b'INDX'
|
||||||
ALIGN_BLOCK = True
|
ALIGN_BLOCK = True
|
||||||
@ -45,7 +46,7 @@ class IndexHeader(Header): # {{{
|
|||||||
idxt_offset
|
idxt_offset
|
||||||
|
|
||||||
# 24 - 28: Number of index records
|
# 24 - 28: Number of index records
|
||||||
num_of_records = 1
|
num_of_records = DYN
|
||||||
|
|
||||||
# 28 - 32: Index encoding (65001 = utf-8)
|
# 28 - 32: Index encoding (65001 = utf-8)
|
||||||
encoding = 65001
|
encoding = 65001
|
||||||
@ -80,8 +81,8 @@ class IndexHeader(Header): # {{{
|
|||||||
# TAGX
|
# TAGX
|
||||||
tagx = DYN
|
tagx = DYN
|
||||||
|
|
||||||
# Last Index entry
|
# Geometry of index records
|
||||||
last_index = DYN
|
geometry = DYN
|
||||||
|
|
||||||
# IDXT
|
# IDXT
|
||||||
idxt = DYN
|
idxt = DYN
|
||||||
@ -90,7 +91,7 @@ class IndexHeader(Header): # {{{
|
|||||||
POSITIONS = {'idxt_offset':'idxt'}
|
POSITIONS = {'idxt_offset':'idxt'}
|
||||||
# }}}
|
# }}}
|
||||||
|
|
||||||
class Index(object): # {{{
|
class Index(object): # {{{
|
||||||
|
|
||||||
control_byte_count = 1
|
control_byte_count = 1
|
||||||
cncx = CNCX()
|
cncx = CNCX()
|
||||||
@ -135,27 +136,23 @@ class Index(object): # {{{
|
|||||||
self.control_bytes = self.calculate_control_bytes_for_each_entry(
|
self.control_bytes = self.calculate_control_bytes_for_each_entry(
|
||||||
self.entries)
|
self.entries)
|
||||||
|
|
||||||
rendered_entries = []
|
index_blocks, idxt_blocks, record_counts, last_indices = [BytesIO()], [BytesIO()], [0], [b'']
|
||||||
index, idxt, buf = BytesIO(), BytesIO(), BytesIO()
|
buf = BytesIO()
|
||||||
IndexEntry = namedtuple('IndexEntry', 'offset length raw')
|
|
||||||
last_lead_text = b''
|
|
||||||
too_large = ValueError('Index has too many entries, calibre does not'
|
|
||||||
' support generating multiple index records at this'
|
|
||||||
' time.')
|
|
||||||
|
|
||||||
for i, x in enumerate(self.entries):
|
RECORD_LIMIT = 0x10000 - self.HEADER_LENGTH - 1048 # kindlegen uses 1048 (there has to be some margin because of block alignment)
|
||||||
|
|
||||||
|
for i, (index_num, tags) in enumerate(self.entries):
|
||||||
control_bytes = self.control_bytes[i]
|
control_bytes = self.control_bytes[i]
|
||||||
leading_text, tags = x
|
|
||||||
buf.seek(0), buf.truncate(0)
|
buf.seek(0), buf.truncate(0)
|
||||||
leading_text = (leading_text.encode('utf-8') if
|
index_num = (index_num.encode('utf-8') if isinstance(index_num, unicode) else index_num)
|
||||||
isinstance(leading_text, unicode) else leading_text)
|
raw = bytearray(index_num)
|
||||||
raw = bytearray(leading_text)
|
raw.insert(0, len(index_num))
|
||||||
raw.insert(0, len(leading_text))
|
|
||||||
buf.write(bytes(raw))
|
buf.write(bytes(raw))
|
||||||
buf.write(bytes(bytearray(control_bytes)))
|
buf.write(bytes(bytearray(control_bytes)))
|
||||||
for tag in self.tag_types:
|
for tag in self.tag_types:
|
||||||
values = tags.get(tag.name, None)
|
values = tags.get(tag.name, None)
|
||||||
if values is None: continue
|
if values is None:
|
||||||
|
continue
|
||||||
try:
|
try:
|
||||||
len(values)
|
len(values)
|
||||||
except TypeError:
|
except TypeError:
|
||||||
@ -168,55 +165,71 @@ class Index(object): # {{{
|
|||||||
raise ValueError('Invalid values for %r: %r'%(
|
raise ValueError('Invalid values for %r: %r'%(
|
||||||
tag, values))
|
tag, values))
|
||||||
raw = buf.getvalue()
|
raw = buf.getvalue()
|
||||||
offset = index.tell()
|
offset = index_blocks[-1].tell()
|
||||||
if offset + self.HEADER_LENGTH >= 0x10000:
|
idxt_pos = idxt_blocks[-1].tell()
|
||||||
raise too_large
|
if offset + idxt_pos + len(raw) + 2 > RECORD_LIMIT:
|
||||||
rendered_entries.append(IndexEntry(offset, len(raw), raw))
|
index_blocks.append(BytesIO())
|
||||||
idxt.write(pack(b'>H', self.HEADER_LENGTH+offset))
|
idxt_blocks.append(BytesIO())
|
||||||
index.write(raw)
|
record_counts.append(0)
|
||||||
last_lead_text = leading_text
|
offset = idxt_pos = 0
|
||||||
|
last_indices.append(b'')
|
||||||
|
record_counts[-1] += 1
|
||||||
|
idxt_blocks[-1].write(pack(b'>H', self.HEADER_LENGTH+offset))
|
||||||
|
index_blocks[-1].write(raw)
|
||||||
|
last_indices[-1] = index_num
|
||||||
|
|
||||||
index_block = align_block(index.getvalue())
|
index_records = []
|
||||||
idxt_block = align_block(b'IDXT' + idxt.getvalue())
|
for index_block, idxt_block, record_count in zip(index_blocks, idxt_blocks, record_counts):
|
||||||
body = index_block + idxt_block
|
index_block = align_block(index_block.getvalue())
|
||||||
if len(body) + self.HEADER_LENGTH >= 0x10000:
|
idxt_block = align_block(b'IDXT' + idxt_block.getvalue())
|
||||||
raise too_large
|
# Create header for this index record
|
||||||
header = b'INDX'
|
header = b'INDX'
|
||||||
buf.seek(0), buf.truncate(0)
|
buf.seek(0), buf.truncate(0)
|
||||||
buf.write(pack(b'>I', self.HEADER_LENGTH))
|
buf.write(pack(b'>I', self.HEADER_LENGTH))
|
||||||
buf.write(b'\0'*4) # Unknown
|
buf.write(b'\0'*4) # Unknown
|
||||||
buf.write(pack(b'>I', 1)) # Header type? Or index record number?
|
buf.write(pack(b'>I', 1)) # Header type (0 for Index header record and 1 for Index records)
|
||||||
buf.write(b'\0'*4) # Unknown
|
buf.write(b'\0'*4) # Unknown
|
||||||
|
|
||||||
# IDXT block offset
|
# IDXT block offset
|
||||||
buf.write(pack(b'>I', self.HEADER_LENGTH + len(index_block)))
|
buf.write(pack(b'>I', self.HEADER_LENGTH + len(index_block)))
|
||||||
|
|
||||||
# Number of index entries
|
# Number of index entries in this record
|
||||||
buf.write(pack(b'>I', len(rendered_entries)))
|
buf.write(pack(b'>I', record_count))
|
||||||
|
|
||||||
buf.write(b'\xff'*8) # Unknown
|
buf.write(b'\xff'*8) # Unknown
|
||||||
|
|
||||||
buf.write(b'\0'*156) # Unknown
|
buf.write(b'\0'*156) # Unknown
|
||||||
|
|
||||||
header += buf.getvalue()
|
header += buf.getvalue()
|
||||||
index_record = header + body
|
index_records.append(header + index_block + idxt_block)
|
||||||
|
if len(index_records[-1]) > 0x10000:
|
||||||
|
raise ValueError('Failed to rollover index blocks for very large index.')
|
||||||
|
|
||||||
|
# Create the Index Header record
|
||||||
tagx = self.generate_tagx()
|
tagx = self.generate_tagx()
|
||||||
idxt = (b'IDXT' + pack(b'>H', IndexHeader.HEADER_LENGTH + len(tagx)) +
|
|
||||||
b'\0')
|
# Geometry of the index records is written as index entries pointed to
|
||||||
# Last index
|
# by the IDXT records
|
||||||
idx = bytes(bytearray([len(last_lead_text)])) + last_lead_text
|
buf.seek(0), buf.truncate()
|
||||||
idx += pack(b'>H', len(rendered_entries))
|
idxt = [b'IDXT']
|
||||||
|
pos = IndexHeader.HEADER_LENGTH + len(tagx)
|
||||||
|
for last_idx, num in zip(last_indices, record_counts):
|
||||||
|
start = buf.tell()
|
||||||
|
idxt.append(pack(b'>H', pos))
|
||||||
|
buf.write(bytes(bytearray([len(last_idx)])) + last_idx)
|
||||||
|
buf.write(pack(b'>H', num))
|
||||||
|
pos += buf.tell() - start
|
||||||
|
|
||||||
header = {
|
header = {
|
||||||
'num_of_entries': len(rendered_entries),
|
'num_of_entries': sum(r for r in record_counts),
|
||||||
|
'num_of_records': len(index_records),
|
||||||
'num_of_cncx': len(self.cncx),
|
'num_of_cncx': len(self.cncx),
|
||||||
'tagx':tagx,
|
'tagx':align_block(tagx),
|
||||||
'last_index':align_block(idx),
|
'geometry':align_block(buf.getvalue()),
|
||||||
'idxt':idxt
|
'idxt':align_block(b''.join(idxt)),
|
||||||
}
|
}
|
||||||
header = IndexHeader()(**header)
|
header = IndexHeader()(**header)
|
||||||
self.records = [header, index_record]
|
self.records = [header] + index_records
|
||||||
self.records.extend(self.cncx.records)
|
self.records.extend(self.cncx.records)
|
||||||
return self.records
|
return self.records
|
||||||
# }}}
|
# }}}
|
||||||
@ -321,6 +334,12 @@ class NCXIndex(Index):
|
|||||||
strings.append(kind)
|
strings.append(kind)
|
||||||
self.cncx = CNCX(strings)
|
self.cncx = CNCX(strings)
|
||||||
|
|
||||||
|
try:
|
||||||
|
largest = max(x['index'] for x in toc_table)
|
||||||
|
except ValueError:
|
||||||
|
largest = 0
|
||||||
|
fmt = '%0{0}X'.format(max(2, len('%X' % largest)))
|
||||||
|
|
||||||
def to_entry(x):
|
def to_entry(x):
|
||||||
ans = {}
|
ans = {}
|
||||||
for f in ('offset', 'length', 'depth', 'pos_fid', 'parent',
|
for f in ('offset', 'length', 'depth', 'pos_fid', 'parent',
|
||||||
@ -330,12 +349,11 @@ class NCXIndex(Index):
|
|||||||
for f in ('label', 'description', 'author', 'kind'):
|
for f in ('label', 'description', 'author', 'kind'):
|
||||||
if f in x:
|
if f in x:
|
||||||
ans[f] = self.cncx[x[f]]
|
ans[f] = self.cncx[x[f]]
|
||||||
return ('%02x'%x['index'], ans)
|
return (fmt % x['index'], ans)
|
||||||
|
|
||||||
self.entries = list(map(to_entry, toc_table))
|
self.entries = list(map(to_entry, toc_table))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class NonLinearNCXIndex(NCXIndex):
|
class NonLinearNCXIndex(NCXIndex):
|
||||||
control_byte_count = 2
|
control_byte_count = 2
|
||||||
tag_types = tuple(map(TagMeta, (
|
tag_types = tuple(map(TagMeta, (
|
||||||
@ -352,4 +370,23 @@ class NonLinearNCXIndex(NCXIndex):
|
|||||||
EndTagTable
|
EndTagTable
|
||||||
)))
|
)))
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
# Generate a document with a large number of index entries using both
|
||||||
|
# calibre and kindlegen and compare the output
|
||||||
|
import os, subprocess
|
||||||
|
os.chdir('/t')
|
||||||
|
paras = ['<p>%d</p>' % i for i in xrange(4000)]
|
||||||
|
raw = '<html><body>' + '\n\n'.join(paras) + '</body></html>'
|
||||||
|
|
||||||
|
src = 'index.html'
|
||||||
|
with open(src, 'wb') as f:
|
||||||
|
f.write(raw.encode('utf-8'))
|
||||||
|
|
||||||
|
subprocess.check_call(['ebook-convert', src, '.epub', '--level1-toc', '//h:p', '--no-default-epub-cover', '--flow-size', '1000000'])
|
||||||
|
subprocess.check_call(['ebook-convert', src, '.azw3', '--level1-toc', '//h:p', '--no-inline-toc', '--extract-to=x'])
|
||||||
|
subprocess.call(['kindlegen', 'index.epub']) # kindlegen exit code is not 0 as we dont have a cover
|
||||||
|
subprocess.check_call(['calibre-debug', 'index.mobi'])
|
||||||
|
|
||||||
|
from calibre.gui2.tweak_book.diff.main import main
|
||||||
|
main(['cdiff', 'decompiled_index/mobi8/ncx.record', 'x/ncx.record'])
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user