Handle more TPZ file layouts when setting metadata

This commit is contained in:
Kovid Goyal 2010-03-11 09:50:48 -07:00
commit 8c0cadae9f
2 changed files with 187 additions and 205 deletions

View File

@ -429,33 +429,30 @@ class Bookmark():
self.book_length = 0 self.book_length = 0
if self.bookmark_extension == 'mbp': if self.bookmark_extension == 'mbp':
# Read the book len from the header # Read the book len from the header
with open(book_fs,'rb') as f: try:
self.stream = StringIO(f.read()) with open(book_fs,'rb') as f:
self.data = StreamSlicer(self.stream) self.stream = StringIO(f.read())
self.nrecs, = unpack('>H', self.data[76:78]) self.data = StreamSlicer(self.stream)
record0 = self.record(0) self.nrecs, = unpack('>H', self.data[76:78])
self.book_length = int(unpack('>I', record0[0x04:0x08])[0]) record0 = self.record(0)
self.book_length = int(unpack('>I', record0[0x04:0x08])[0])
except:
pass
elif self.bookmark_extension == 'tan': elif self.bookmark_extension == 'tan':
# Read bookLength from metadata # Read bookLength from metadata
with open(book_fs,'rb') as f: from calibre.ebooks.metadata.topaz import MetadataUpdater
stream = StringIO(f.read()) try:
raw = stream.read(8*1024) with open(book_fs,'rb') as f:
if not raw.startswith('TPZ'): mu = MetadataUpdater(f)
raise ValueError('Not a Topaz file') self.book_length = mu.book_length
first = raw.find('metadata') except:
if first < 0: pass
raise ValueError('Invalid Topaz file')
second = raw.find('metadata', first+10)
if second < 0:
raise ValueError('Invalid Topaz file')
raw = raw[second:second+1000]
idx = raw.find('bookLength')
if idx > -1:
length = ord(raw[idx+len('bookLength')])
self.book_length = int(raw[idx+len('bookLength')+1:idx+len('bookLength')+1+length])
elif self.bookmark_extension == 'pdr': elif self.bookmark_extension == 'pdr':
# Book length not yet implemented for PDF files # Book length not yet implemented for PDF files
# After 0.6.45:
# from calibre import plugins
# self.book_length = plugins['pdfreflow'][0].get_numpages(open(book_fs).read())
self.book_length = 0 self.book_length = 0
else: else:

View File

@ -10,36 +10,6 @@ from struct import pack, unpack
from calibre import prints from calibre import prints
from calibre.ebooks.metadata import MetaInformation from calibre.ebooks.metadata import MetaInformation
def read_record(raw, name):
    """Return the payload of the first length-prefixed record tagged ``name``.

    ``raw`` is searched for ``name``. The single byte that follows the tag is
    read as the payload length, and that many bytes after it are returned.

    Returns None when ``name`` does not occur in ``raw``, or when ``raw`` ends
    immediately after the tag so that there is no length byte to read.
    """
    idx = raw.find(name)
    if idx < 0:
        return None
    start = idx + len(name)
    # Slice rather than index: a tag sitting at the very end of the buffer
    # gives an empty slice here instead of raising IndexError.
    size_byte = raw[start:start + 1]
    if not size_byte:
        return None
    length = ord(size_byte)
    return raw[start + 1:start + 1 + length]
def get_metadata(stream):
    """Read the title and authors from the start of a Topaz stream.

    Only the first 8 KB of ``stream`` are examined. The data must begin with
    ``TPZ`` and contain the string ``metadata`` twice; the ``Authors`` and
    ``Title`` records are looked up in the 1000 bytes starting at the second
    occurrence.

    Returns a MetaInformation built from the title and the list of authors.
    When no usable author is found the list is ``[_('Unknown')]``.

    Raises ValueError if the ``TPZ`` signature or either ``metadata`` marker
    is missing, or if no ``Title`` record is found.
    """
    raw = stream.read(8*1024)
    if not raw.startswith('TPZ'):
        raise ValueError('Not a Topaz file')
    first = raw.find('metadata')
    if first < 0:
        raise ValueError('Invalid Topaz file')
    second = raw.find('metadata', first+10)
    if second < 0:
        raise ValueError('Invalid Topaz file')
    raw = raw[second:second+1000]

    authors = read_record(raw, 'Authors')
    if authors:
        # Authors are stored joined by ';' (optionally followed by a space),
        # so strip each name and drop empty fragments.
        names = authors.decode('utf-8', 'replace').split(';')
        authors = [name.strip() for name in names if name.strip()]
    if not authors:
        authors = [_('Unknown')]

    title = read_record(raw, 'Title')
    if not title:
        raise ValueError('No metadata in file')
    title = title.decode('utf-8', 'replace')
    return MetaInformation(title, authors)
class StreamSlicer(object): class StreamSlicer(object):
def __init__(self, stream, start=0, stop=None): def __init__(self, stream, start=0, stop=None):
@ -110,29 +80,34 @@ class StreamSlicer(object):
class MetadataUpdater(object): class MetadataUpdater(object):
def __init__(self, stream): def __init__(self, stream):
self.stream = stream self.stream = stream
raw = stream.read(8*1024)
if not raw.startswith('TPZ'):
raise ValueError('Not a Topaz file')
first = raw.find('metadata')
if first < 0:
raise ValueError('Invalid Topaz file')
self.data = StreamSlicer(stream) self.data = StreamSlicer(stream)
self.header_records, = unpack('>B',self.data[4])
self.get_topaz_headers()
# Seek the metadata block sig = self.data[:4]
md_block_offset, spam = self.decode_vwi(self.data[first+9:first+13]) if not sig.startswith('TPZ'):
md_block_offset += self.base raise ValueError('Not a Topaz file')
if self.data[md_block_offset+1:md_block_offset+9] != 'metadata': offset = 4
raise ValueError('Invalid Topaz file')
self.header_records, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
self.topaz_headers = self.get_headers(offset)
# First integrity test - metadata header
if not 'metadata' in self.topaz_headers:
raise ValueError('Invalid Topaz format - no metadata record')
# Second integrity test - metadata body
md_offset = self.topaz_headers['metadata']['blocks'][0]['offset']
md_offset += self.base
if self.data[md_offset+1:md_offset+9] != 'metadata':
raise ValueError('Damaged metadata record')
def book_length(self):
''' convenience method for retrieving book length '''
self.get_original_metadata()
if 'bookLength' in self.metadata:
return int(self.metadata['bookLength'])
else: else:
self.md_start = md_block_offset return 0
offset = self.get_md_header(self.md_start)
self.metadata = {}
self.md_end = self.get_original_metadata(offset)
self.orig_md_len = self.md_end - self.md_start
def decode_vwi(self,bytes): def decode_vwi(self,bytes):
pos, val = 0, 0 pos, val = 0, 0
@ -148,6 +123,33 @@ class MetadataUpdater(object):
if done: break if done: break
return val, pos return val, pos
def dump_headers(self):
''' Diagnostic '''
print "\ndump_headers():"
for tag in self.topaz_headers:
print "%s: " % (tag)
num_recs = len(self.topaz_headers[tag]['blocks'])
print " num_recs: %d" % num_recs
if num_recs:
print " starting offset: 0x%x" % self.topaz_headers[tag]['blocks'][0]['offset']
def dump_hex(self, src, length=16):
    ''' Diagnostic: print src in rows of `length` characters, hex values then text '''
    # Characters whose repr() is just the quoted character pass through
    # unchanged; everything else is shown as '.'
    table = ''.join([chr(code) if len(repr(chr(code))) == 3 else '.'
                     for code in range(256)])
    rows = []
    position = 0
    remaining = src
    while remaining:
        chunk, remaining = remaining[:length], remaining[length:]
        hexa = ' '.join(["%02X" % ord(ch) for ch in chunk])
        text = chunk.translate(table)
        rows.append("%04X %-*s %s\n" % (position, length*3, hexa, text))
        position += length
    print(''.join(rows))
def dump_metadata(self):
''' Diagnostic '''
for tag in self.metadata:
print '%s: %s' % (tag, repr(self.metadata[tag]))
def encode_vwi(self,value): def encode_vwi(self,value):
bytes = [] bytes = []
multi_byte = (value > 0x7f) multi_byte = (value > 0x7f)
@ -174,39 +176,11 @@ class MetadataUpdater(object):
# If value == 0, return 0 # If value == 0, return 0
return pack('>B', 0x0).decode('iso-8859-1') return pack('>B', 0x0).decode('iso-8859-1')
def fixup_topaz_headers(self, size_delta):
    '''
    Rebuild the Topaz header after the metadata block has changed size.

    size_delta is added to every block offset greater than
    self.md_header_offset, and to len_uncomp of each block belonging to the
    'metadata' header. Returns the rebuilt header encoded as iso-8859-1.
    '''
    out = StringIO.StringIO()
    threshold = self.md_header_offset

    # Copy the first 5 bytes
    out.write(self.data[:5])

    for key in self.topaz_headers:
        header = self.topaz_headers[key]
        name = header['tag']
        blocks = header['blocks']

        out.write('c')
        out.write(self.encode_vwi(len(name)))
        out.write(name)
        out.write(self.encode_vwi(len(blocks)))

        for idx in blocks:
            entry = blocks[idx]

            # Any offset past the metadata header moves by size_delta
            new_offset = entry['hdr_offset']
            if new_offset > threshold:
                new_offset += size_delta
            out.write(self.encode_vwi(new_offset))

            # Only the metadata record itself changes length
            uncomp = entry['len_uncomp']
            if name == 'metadata':
                uncomp += size_delta
            out.write(self.encode_vwi(uncomp))

            out.write(self.encode_vwi(entry['len_comp']))

    return out.getvalue().encode('iso-8859-1')
def generate_dkey(self): def generate_dkey(self):
for x in self.topaz_headers: for x in self.topaz_headers:
#print "dkey['blocks']: %s" % self.topaz_headers[x]['blocks']
if self.topaz_headers[x]['tag'] == 'dkey': if self.topaz_headers[x]['tag'] == 'dkey':
if self.topaz_headers[x]['blocks']: if self.topaz_headers[x]['blocks']:
offset = self.base + self.topaz_headers[x]['blocks'][0]['hdr_offset'] offset = self.base + self.topaz_headers[x]['blocks'][0]['offset']
len_uncomp = self.topaz_headers[x]['blocks'][0]['len_uncomp'] len_uncomp = self.topaz_headers[x]['blocks'][0]['len_uncomp']
break break
else: else:
@ -222,42 +196,11 @@ class MetadataUpdater(object):
dks.write(self.data[offset:offset + len_uncomp].decode('iso-8859-1')) dks.write(self.data[offset:offset + len_uncomp].decode('iso-8859-1'))
return dks.getvalue().encode('iso-8859-1') return dks.getvalue().encode('iso-8859-1')
def get_topaz_headers(self): def get_headers(self, offset):
offset = 5 # Build a dict of topaz_header records
md_header_offset = 0
dkey_len = 0
# Find the offset of the metadata header record
for hr in range(self.header_records):
marker = self.data[offset]
offset += 1
taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
tag = self.data[offset:offset+taglen]
offset += taglen
if not tag == 'metadata':
num_vals, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
for val in range(num_vals):
foo, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
foo, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
foo, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
continue
num_vals, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
md_header_offset, consumed = self.decode_vwi(self.data[offset:offset+4])
break
self.md_header_offset = md_header_offset
offset = 5
topaz_headers = {} topaz_headers = {}
dkey_offset = 0
lowest_payload_offset = sys.maxint
lowest_offset_err = None
for x in range(self.header_records): for x in range(self.header_records):
marker = self.data[offset] c_marker = self.data[offset]
offset += 1 offset += 1
taglen, consumed = self.decode_vwi(self.data[offset:offset+4]) taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed offset += consumed
@ -268,97 +211,131 @@ class MetadataUpdater(object):
blocks = {} blocks = {}
for val in range(num_vals): for val in range(num_vals):
hdr_offset, consumed = self.decode_vwi(self.data[offset:offset+4]) hdr_offset, consumed = self.decode_vwi(self.data[offset:offset+4])
if tag == 'dkey':
dkey_offset = hdr_offset
if tag not in ['dkey','metadata']:
if hdr_offset < lowest_payload_offset:
lowest_payload_offset = hdr_offset
lowest_offset_err = "lowest_payload_offset: 0x%x (%s)" % (hdr_offset,tag)
offset += consumed offset += consumed
len_uncomp, consumed = self.decode_vwi(self.data[offset:offset+4]) len_uncomp, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed offset += consumed
len_comp, consumed = self.decode_vwi(self.data[offset:offset+4]) len_comp, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed offset += consumed
blocks[val] = dict(hdr_offset=hdr_offset,len_uncomp=len_uncomp,len_comp=len_comp) blocks[val] = dict(offset=hdr_offset,len_uncomp=len_uncomp,len_comp=len_comp)
topaz_headers[x] = dict(tag=tag,blocks=blocks) topaz_headers[tag] = dict(blocks=blocks)
self.topaz_headers = topaz_headers self.eoth = self.data[offset]
self.eod = self.data[offset]
offset += 1 offset += 1
self.base = offset self.base = offset
self.lowest_payload_offset = lowest_payload_offset + self.base return topaz_headers
if self.lowest_payload_offset < self.md_header_offset:
prints("Unexpected TPZ file layout:\n %s\n metadata_offset: 0x%x" % (lowest_offset_err, self.md_header_offset))
prints("metadata needs to be before payload")
self.base_value = None
if dkey_offset:
self.base_value = self.data[offset:offset + dkey_offset]
return md_header_offset, topaz_headers
def generate_metadata_stream(self): def generate_metadata_stream(self):
ms = StringIO.StringIO() ms = StringIO.StringIO()
# Generate the header
ms.write(self.encode_vwi(len(self.md_header['tag'])).encode('iso-8859-1')) ms.write(self.encode_vwi(len(self.md_header['tag'])).encode('iso-8859-1'))
ms.write(self.md_header['tag']) ms.write(self.md_header['tag'])
ms.write(chr(self.md_header['flags'])) ms.write(chr(self.md_header['flags']))
ms.write(chr(len(self.metadata))) ms.write(chr(len(self.metadata)))
# Add the metadata fields. # Add the metadata fields.
for item in self.metadata: for tag in self.metadata:
ms.write(self.encode_vwi(len(self.metadata[item]['tag'])).encode('iso-8859-1')) ms.write(self.encode_vwi(len(tag)).encode('iso-8859-1'))
ms.write(self.metadata[item]['tag']) ms.write(tag)
ms.write(self.encode_vwi(len(self.metadata[item]['metadata'])).encode('iso-8859-1')) ms.write(self.encode_vwi(len(self.metadata[tag])).encode('iso-8859-1'))
ms.write(self.metadata[item]['metadata']) ms.write(self.metadata[tag])
return ms.getvalue() return ms.getvalue()
def get_md_header(self,offset): def get_metadata(self):
md_header = {} ''' Return MetaInformation with title, author'''
self.get_original_metadata()
return MetaInformation(self.metadata['Title'], [self.metadata['Authors']])
def get_original_metadata(self):
offset = self.base + self.topaz_headers['metadata']['blocks'][0]['offset']
self.md_header = {}
taglen, consumed = self.decode_vwi(self.data[offset:offset+4]) taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed offset += consumed
md_header['tag'] = self.data[offset:offset+taglen] self.md_header['tag'] = self.data[offset:offset+taglen]
offset += taglen offset += taglen
md_header['flags'] = ord(self.data[offset]) self.md_header['flags'] = ord(self.data[offset])
offset += 1 offset += 1
md_header['records'] = ord(self.data[offset]) self.md_header['num_recs'] = ord(self.data[offset])
offset += 1 offset += 1
self.md_header = md_header #print "self.md_header: %s" % self.md_header
return offset
def get_original_metadata(self,offset): self.metadata = {}
for x in range(self.md_header['records']): for x in range(self.md_header['num_recs']):
md_record = {} md_record = {}
taglen, consumed = self.decode_vwi(self.data[offset:offset+4]) taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed offset += consumed
md_record['tag'] = self.data[offset:offset+taglen] tag = self.data[offset:offset+taglen]
offset += taglen offset += taglen
md_len, consumed = self.decode_vwi(self.data[offset:offset+4]) md_len, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed offset += consumed
md_record['metadata'] = self.data[offset:offset + md_len] metadata = self.data[offset:offset + md_len]
offset += md_len offset += md_len
self.metadata[x] = md_record self.metadata[tag] = metadata
return offset
def hexdump(self, src, length=16): def regenerate_headers(self, len_updated_metadata):
# Diagnostic
FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)]) headers = {}
N=0; result='' for tag in self.topaz_headers:
while src: if self.topaz_headers[tag]['blocks']:
s,src = src[:length],src[length:] headers[tag] = self.topaz_headers[tag]['blocks'][0]['offset']
hexa = ' '.join(["%02X"%ord(x) for x in s]) else:
s = s.translate(FILTER) headers[tag] = None
result += "%04X %-*s %s\n" % (N, length*3, hexa, s)
N+=length # Sort headers based on initial offset
print result sh = sorted(headers,key=lambda x:(headers[x],headers[x]))
# Metadata goes last
sh.remove('metadata')
sh.append('metadata')
original_md_len = self.topaz_headers['metadata']['blocks'][0]['len_uncomp']
original_md_offset = self.topaz_headers['metadata']['blocks'][0]['offset']
# Copy the first 5 bytes of the file: sig + num_recs
ths = StringIO.StringIO()
ths.write(self.data[:5])
# Rewrite the offsets for hdr_offsets > metadata original location
for tag in sh[:-1]:
ths.write('c')
ths.write(self.encode_vwi(len(tag)))
ths.write(tag)
if self.topaz_headers[tag]['blocks']:
ths.write(self.encode_vwi(len(self.topaz_headers[tag]['blocks'])))
for block in self.topaz_headers[tag]['blocks']:
b = self.topaz_headers[tag]['blocks'][block]
if b['offset'] < original_md_offset:
ths.write(self.encode_vwi(b['offset']))
else:
ths.write(self.encode_vwi(b['offset'] - original_md_len))
ths.write(self.encode_vwi(b['len_uncomp']))
ths.write(self.encode_vwi(b['len_comp']))
else:
ths.write(self.encode_vwi(0))
# Adjust metadata offset to end
new_md_offset = (len(self.data) - self.base - original_md_len)
new_md_len = len_updated_metadata - 1 - len('metadata') - 1
# Write the metadata header
ths.write('c')
ths.write(self.encode_vwi(len('metadata')))
ths.write('metadata')
ths.write(self.encode_vwi(1))
ths.write(self.encode_vwi(new_md_offset))
ths.write(self.encode_vwi(new_md_len))
ths.write(self.encode_vwi(0))
self.sorted_headers = sh
self.original_md_start = original_md_offset + self.base
self.original_md_len = original_md_len
return ths.getvalue().encode('iso-8859-1')
def update(self,mi): def update(self,mi):
def update_metadata(tag,value): # Collect the original metadata
for item in self.metadata: self.get_original_metadata()
if self.metadata[item]['tag'] == tag:
self.metadata[item]['metadata'] = value
return
if self.md_start > self.lowest_payload_offset:
raise ValueError('Unable to update metadata:')
try: try:
from calibre.ebooks.conversion.config import load_defaults from calibre.ebooks.conversion.config import load_defaults
@ -369,27 +346,33 @@ class MetadataUpdater(object):
if mi.author_sort and pas: if mi.author_sort and pas:
authors = mi.author_sort authors = mi.author_sort
update_metadata('Authors',authors.encode('utf-8')) self.metadata['Authors'] = authors.encode('utf-8')
elif mi.authors: elif mi.authors:
authors = '; '.join(mi.authors) authors = '; '.join(mi.authors)
update_metadata('Authors',authors) self.metadata['Authors'] = authors.encode('utf-8')
update_metadata('Title',mi.title.encode('utf-8')) self.metadata['Title'] = mi.title.encode('utf-8')
updated_metadata = self.generate_metadata_stream() updated_metadata = self.generate_metadata_stream()
head = self.fixup_topaz_headers(len(updated_metadata) - self.orig_md_len) head = self.regenerate_headers(len(updated_metadata))
dkey = self.generate_dkey()
tail = copy.copy(self.data[self.md_end:]) # Chunk1: self.base -> original metadata start
# Chunk2: original metadata end -> eof
chunk1 = self.data[self.base:self.original_md_start]
chunk2 = self.data[self.original_md_start + self.original_md_len:]
self.stream.seek(0) self.stream.seek(0)
self.stream.truncate(0) self.stream.truncate(0)
# Write the revised stream
self.stream.write(head) self.stream.write(head)
self.stream.write(self.eod) self.stream.write('d')
if self.base_value: self.stream.write(chunk1)
self.stream.write(self.base_value) self.stream.write(chunk2)
if dkey:
self.stream.write(dkey)
self.stream.write(updated_metadata) self.stream.write(updated_metadata)
self.stream.write(tail)
def get_metadata(stream):
    ''' Return the MetaInformation that MetadataUpdater reads from stream '''
    return MetadataUpdater(stream).get_metadata()
def set_metadata(stream, mi): def set_metadata(stream, mi):
mu = MetadataUpdater(stream) mu = MetadataUpdater(stream)
@ -398,4 +381,6 @@ def set_metadata(stream, mi):
if __name__ == '__main__':
    # Manual check: rewrite the metadata of the Topaz file named on the
    # command line with a fixed title and author.
    import sys
    mi = MetaInformation(title="My New Title", authors=['Smith, John'])
    # MetadataUpdater.update() truncates the stream and writes the revised
    # file back, so the file has to be opened for update ('r+b'); a
    # read-only handle cannot be rewritten. The with-block also closes it.
    # NOTE(review): assumes set_metadata() ends up calling update() - confirm.
    with open(sys.argv[1], 'r+b') as stream:
        set_metadata(stream, mi)