diff --git a/src/calibre/devices/kindle/driver.py b/src/calibre/devices/kindle/driver.py index 74d188e882..4f78b2f98e 100644 --- a/src/calibre/devices/kindle/driver.py +++ b/src/calibre/devices/kindle/driver.py @@ -429,33 +429,30 @@ class Bookmark(): self.book_length = 0 if self.bookmark_extension == 'mbp': # Read the book len from the header - with open(book_fs,'rb') as f: - self.stream = StringIO(f.read()) - self.data = StreamSlicer(self.stream) - self.nrecs, = unpack('>H', self.data[76:78]) - record0 = self.record(0) - self.book_length = int(unpack('>I', record0[0x04:0x08])[0]) + try: + with open(book_fs,'rb') as f: + self.stream = StringIO(f.read()) + self.data = StreamSlicer(self.stream) + self.nrecs, = unpack('>H', self.data[76:78]) + record0 = self.record(0) + self.book_length = int(unpack('>I', record0[0x04:0x08])[0]) + except: + pass elif self.bookmark_extension == 'tan': # Read bookLength from metadata - with open(book_fs,'rb') as f: - stream = StringIO(f.read()) - raw = stream.read(8*1024) - if not raw.startswith('TPZ'): - raise ValueError('Not a Topaz file') - first = raw.find('metadata') - if first < 0: - raise ValueError('Invalid Topaz file') - second = raw.find('metadata', first+10) - if second < 0: - raise ValueError('Invalid Topaz file') - raw = raw[second:second+1000] - idx = raw.find('bookLength') - if idx > -1: - length = ord(raw[idx+len('bookLength')]) - self.book_length = int(raw[idx+len('bookLength')+1:idx+len('bookLength')+1+length]) - + from calibre.ebooks.metadata.topaz import MetadataUpdater + try: + with open(book_fs,'rb') as f: + mu = MetadataUpdater(f) + self.book_length = mu.book_length() + except: + pass elif self.bookmark_extension == 'pdr': # Book length not yet implemented for PDF files + # After 0.6.45: + # from calibre import plugins + # self.book_length = plugins['pdfreflow'][0].get_numpages(open(book_fs).read()) + self.book_length = 0 else: diff --git a/src/calibre/ebooks/metadata/topaz.py b/src/calibre/ebooks/metadata/topaz.py 
index 8423c52524..07c7af3bf6 100644 --- a/src/calibre/ebooks/metadata/topaz.py +++ b/src/calibre/ebooks/metadata/topaz.py @@ -10,36 +10,6 @@ from struct import pack, unpack from calibre import prints from calibre.ebooks.metadata import MetaInformation -def read_record(raw, name): - idx = raw.find(name) - if idx > -1: - length = ord(raw[idx+len(name)]) - return raw[idx+len(name)+1:idx+len(name)+1+length] - -def get_metadata(stream): - raw = stream.read(8*1024) - if not raw.startswith('TPZ'): - raise ValueError('Not a Topaz file') - first = raw.find('metadata') - if first < 0: - raise ValueError('Invalid Topaz file') - second = raw.find('metadata', first+10) - if second < 0: - raise ValueError('Invalid Topaz file') - raw = raw[second:second+1000] - authors = read_record(raw, 'Authors') - if authors: - authors = authors.decode('utf-8', 'replace').split(';') - else: - authors = [_('Unknown')] - title = read_record(raw, 'Title') - if title: - title = title.decode('utf-8', 'replace') - else: - raise ValueError('No metadata in file') - #from calibre.ebooks.metadata import MetaInformation - return MetaInformation(title, authors) - class StreamSlicer(object): def __init__(self, stream, start=0, stop=None): @@ -110,29 +80,35 @@ class MetadataUpdater(object): def __init__(self, stream): self.stream = stream - raw = stream.read(8*1024) - if not raw.startswith('TPZ'): - raise ValueError('Not a Topaz file') - first = raw.find('metadata') - if first < 0: - raise ValueError('Invalid Topaz file') - self.data = StreamSlicer(stream) - self.header_records, = unpack('>B',self.data[4]) - self.get_topaz_headers() - # Seek the metadata block - md_block_offset, spam = self.decode_vwi(self.data[first+9:first+13]) - md_block_offset += self.base - if self.data[md_block_offset+1:md_block_offset+9] != 'metadata': - raise ValueError('Invalid Topaz file') + self.data = StreamSlicer(stream) + sig = self.data[:4] + if not sig.startswith('TPZ'): + raise ValueError('Not a Topaz file') + offset = 4 + + 
self.header_records, consumed = self.decode_vwi(self.data[offset:offset+4]) + offset += consumed + self.topaz_headers = self.get_headers(offset) + + # First integrity test - metadata header + if not 'metadata' in self.topaz_headers: + raise ValueError('Invalid Topaz format - no metadata record') + + # Second integrity test - metadata body + md_offset = self.topaz_headers['metadata']['blocks'][0]['offset'] + md_offset += self.base + if self.data[md_offset+1:md_offset+9] != 'metadata': + raise ValueError('Damaged metadata record') + + def book_length(self): + ''' convenience method for retrieving book length ''' + self.get_original_metadata() + if 'bookLength' in self.metadata: + return int(self.metadata['bookLength']) else: - self.md_start = md_block_offset - - offset = self.get_md_header(self.md_start) - self.metadata = {} - self.md_end = self.get_original_metadata(offset) - self.orig_md_len = self.md_end - self.md_start + return 0 def decode_vwi(self,bytes): pos, val = 0, 0 @@ -148,6 +123,33 @@ class MetadataUpdater(object): if done: break return val, pos + def dump_headers(self): + ''' Diagnostic ''' + print "\ndump_headers():" + for tag in self.topaz_headers: + print "%s: " % (tag) + num_recs = len(self.topaz_headers[tag]['blocks']) + print " num_recs: %d" % num_recs + if num_recs: + print " starting offset: 0x%x" % self.topaz_headers[tag]['blocks'][0]['offset'] + + def dump_hex(self, src, length=16): + ''' Diagnostic ''' + FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' 
for x in range(256)]) + N=0; result='' + while src: + s,src = src[:length],src[length:] + hexa = ' '.join(["%02X"%ord(x) for x in s]) + s = s.translate(FILTER) + result += "%04X %-*s %s\n" % (N, length*3, hexa, s) + N+=length + print result + + def dump_metadata(self): + ''' Diagnostic ''' + for tag in self.metadata: + print '%s: %s' % (tag, repr(self.metadata[tag])) + def encode_vwi(self,value): bytes = [] multi_byte = (value > 0x7f) @@ -174,39 +176,11 @@ class MetadataUpdater(object): # If value == 0, return 0 return pack('>B', 0x0).decode('iso-8859-1') - def fixup_topaz_headers(self, size_delta): - # Rewrite Topaz Header. Any offset > md_hdr_offset needs to be adjusted - ths = StringIO.StringIO() - md_header_offset = self.md_header_offset - # Copy the first 5 bytes - ths.write(self.data[:5]) - md_record = False - for th in self.topaz_headers: - ths.write('c') - ths.write(self.encode_vwi(len(self.topaz_headers[th]['tag']))) - ths.write(self.topaz_headers[th]['tag']) - ths.write(self.encode_vwi(len(self.topaz_headers[th]['blocks']))) - for block in self.topaz_headers[th]['blocks']: - b = self.topaz_headers[th]['blocks'][block] - if b['hdr_offset'] > md_header_offset: - vwi = self.encode_vwi(b['hdr_offset'] + size_delta) - else: - vwi = self.encode_vwi(b['hdr_offset']) - ths.write(vwi) - if self.topaz_headers[th]['tag'] == 'metadata': - ths.write(self.encode_vwi(b['len_uncomp'] + size_delta)) - else: - ths.write(self.encode_vwi(b['len_uncomp'])) - ths.write(self.encode_vwi(b['len_comp'])) - - return ths.getvalue().encode('iso-8859-1') - def generate_dkey(self): for x in self.topaz_headers: - #print "dkey['blocks']: %s" % self.topaz_headers[x]['blocks'] if self.topaz_headers[x]['tag'] == 'dkey': if self.topaz_headers[x]['blocks']: - offset = self.base + self.topaz_headers[x]['blocks'][0]['hdr_offset'] + offset = self.base + self.topaz_headers[x]['blocks'][0]['offset'] len_uncomp = self.topaz_headers[x]['blocks'][0]['len_uncomp'] break else: @@ -222,42 +196,11 @@ 
class MetadataUpdater(object): dks.write(self.data[offset:offset + len_uncomp].decode('iso-8859-1')) return dks.getvalue().encode('iso-8859-1') - def get_topaz_headers(self): - offset = 5 - md_header_offset = 0 - dkey_len = 0 - # Find the offset of the metadata header record - for hr in range(self.header_records): - marker = self.data[offset] - offset += 1 - taglen, consumed = self.decode_vwi(self.data[offset:offset+4]) - offset += consumed - tag = self.data[offset:offset+taglen] - offset += taglen - if not tag == 'metadata': - num_vals, consumed = self.decode_vwi(self.data[offset:offset+4]) - offset += consumed - for val in range(num_vals): - foo, consumed = self.decode_vwi(self.data[offset:offset+4]) - offset += consumed - foo, consumed = self.decode_vwi(self.data[offset:offset+4]) - offset += consumed - foo, consumed = self.decode_vwi(self.data[offset:offset+4]) - offset += consumed - continue - num_vals, consumed = self.decode_vwi(self.data[offset:offset+4]) - offset += consumed - md_header_offset, consumed = self.decode_vwi(self.data[offset:offset+4]) - break - self.md_header_offset = md_header_offset - - offset = 5 + def get_headers(self, offset): + # Build a dict of topaz_header records topaz_headers = {} - dkey_offset = 0 - lowest_payload_offset = sys.maxint - lowest_offset_err = None for x in range(self.header_records): - marker = self.data[offset] + c_marker = self.data[offset] offset += 1 taglen, consumed = self.decode_vwi(self.data[offset:offset+4]) offset += consumed @@ -268,97 +211,131 @@ class MetadataUpdater(object): blocks = {} for val in range(num_vals): hdr_offset, consumed = self.decode_vwi(self.data[offset:offset+4]) - if tag == 'dkey': - dkey_offset = hdr_offset - if tag not in ['dkey','metadata']: - if hdr_offset < lowest_payload_offset: - lowest_payload_offset = hdr_offset - lowest_offset_err = "lowest_payload_offset: 0x%x (%s)" % (hdr_offset,tag) offset += consumed len_uncomp, consumed = self.decode_vwi(self.data[offset:offset+4]) offset += 
consumed len_comp, consumed = self.decode_vwi(self.data[offset:offset+4]) offset += consumed - blocks[val] = dict(hdr_offset=hdr_offset,len_uncomp=len_uncomp,len_comp=len_comp) - topaz_headers[x] = dict(tag=tag,blocks=blocks) - self.topaz_headers = topaz_headers - self.eod = self.data[offset] + blocks[val] = dict(offset=hdr_offset,len_uncomp=len_uncomp,len_comp=len_comp) + topaz_headers[tag] = dict(blocks=blocks) + self.eoth = self.data[offset] offset += 1 self.base = offset - self.lowest_payload_offset = lowest_payload_offset + self.base - if self.lowest_payload_offset < self.md_header_offset: - prints("Unexpected TPZ file layout:\n %s\n metadata_offset: 0x%x" % (lowest_offset_err, self.md_header_offset)) - prints("metadata needs to be before payload") - self.base_value = None - if dkey_offset: - self.base_value = self.data[offset:offset + dkey_offset] - return md_header_offset, topaz_headers + return topaz_headers def generate_metadata_stream(self): ms = StringIO.StringIO() - # Generate the header ms.write(self.encode_vwi(len(self.md_header['tag'])).encode('iso-8859-1')) ms.write(self.md_header['tag']) ms.write(chr(self.md_header['flags'])) ms.write(chr(len(self.metadata))) # Add the metadata fields. 
- for item in self.metadata: - ms.write(self.encode_vwi(len(self.metadata[item]['tag'])).encode('iso-8859-1')) - ms.write(self.metadata[item]['tag']) - ms.write(self.encode_vwi(len(self.metadata[item]['metadata'])).encode('iso-8859-1')) - ms.write(self.metadata[item]['metadata']) + for tag in self.metadata: + ms.write(self.encode_vwi(len(tag)).encode('iso-8859-1')) + ms.write(tag) + ms.write(self.encode_vwi(len(self.metadata[tag])).encode('iso-8859-1')) + ms.write(self.metadata[tag]) return ms.getvalue() - def get_md_header(self,offset): - md_header = {} + def get_metadata(self): + ''' Return MetaInformation with title, author''' + self.get_original_metadata() + return MetaInformation(self.metadata['Title'], [self.metadata['Authors']]) + + def get_original_metadata(self): + offset = self.base + self.topaz_headers['metadata']['blocks'][0]['offset'] + self.md_header = {} taglen, consumed = self.decode_vwi(self.data[offset:offset+4]) offset += consumed - md_header['tag'] = self.data[offset:offset+taglen] + self.md_header['tag'] = self.data[offset:offset+taglen] offset += taglen - md_header['flags'] = ord(self.data[offset]) + self.md_header['flags'] = ord(self.data[offset]) offset += 1 - md_header['records'] = ord(self.data[offset]) + self.md_header['num_recs'] = ord(self.data[offset]) offset += 1 - self.md_header = md_header - return offset + #print "self.md_header: %s" % self.md_header - def get_original_metadata(self,offset): - for x in range(self.md_header['records']): + self.metadata = {} + for x in range(self.md_header['num_recs']): md_record = {} taglen, consumed = self.decode_vwi(self.data[offset:offset+4]) offset += consumed - md_record['tag'] = self.data[offset:offset+taglen] + tag = self.data[offset:offset+taglen] offset += taglen md_len, consumed = self.decode_vwi(self.data[offset:offset+4]) offset += consumed - md_record['metadata'] = self.data[offset:offset + md_len] + metadata = self.data[offset:offset + md_len] offset += md_len - self.metadata[x] = 
md_record - return offset + self.metadata[tag] = metadata - def hexdump(self, src, length=16): - # Diagnostic - FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)]) - N=0; result='' - while src: - s,src = src[:length],src[length:] - hexa = ' '.join(["%02X"%ord(x) for x in s]) - s = s.translate(FILTER) - result += "%04X %-*s %s\n" % (N, length*3, hexa, s) - N+=length - print result + def regenerate_headers(self, len_updated_metadata): + + headers = {} + for tag in self.topaz_headers: + if self.topaz_headers[tag]['blocks']: + headers[tag] = self.topaz_headers[tag]['blocks'][0]['offset'] + else: + headers[tag] = None + + # Sort headers based on initial offset + sh = sorted(headers,key=lambda x:(headers[x],headers[x])) + + # Metadata goes last + sh.remove('metadata') + sh.append('metadata') + + original_md_len = self.topaz_headers['metadata']['blocks'][0]['len_uncomp'] + original_md_offset = self.topaz_headers['metadata']['blocks'][0]['offset'] + + # Copy the first 5 bytes of the file: sig + num_recs + ths = StringIO.StringIO() + ths.write(self.data[:5]) + + # Rewrite the offsets for hdr_offsets > metadata original location + for tag in sh[:-1]: + ths.write('c') + ths.write(self.encode_vwi(len(tag))) + ths.write(tag) + if self.topaz_headers[tag]['blocks']: + ths.write(self.encode_vwi(len(self.topaz_headers[tag]['blocks']))) + for block in self.topaz_headers[tag]['blocks']: + b = self.topaz_headers[tag]['blocks'][block] + + if b['offset'] < original_md_offset: + ths.write(self.encode_vwi(b['offset'])) + else: + ths.write(self.encode_vwi(b['offset'] - original_md_len)) + + ths.write(self.encode_vwi(b['len_uncomp'])) + ths.write(self.encode_vwi(b['len_comp'])) + else: + ths.write(self.encode_vwi(0)) + + # Adjust metadata offset to end + new_md_offset = (len(self.data) - self.base - original_md_len) + + new_md_len = len_updated_metadata - 1 - len('metadata') - 1 + + # Write the metadata header + ths.write('c') + 
ths.write(self.encode_vwi(len('metadata'))) + ths.write('metadata') + ths.write(self.encode_vwi(1)) + ths.write(self.encode_vwi(new_md_offset)) + + ths.write(self.encode_vwi(new_md_len)) + ths.write(self.encode_vwi(0)) + + self.sorted_headers = sh + self.original_md_start = original_md_offset + self.base + self.original_md_len = original_md_len + return ths.getvalue().encode('iso-8859-1') def update(self,mi): - def update_metadata(tag,value): - for item in self.metadata: - if self.metadata[item]['tag'] == tag: - self.metadata[item]['metadata'] = value - return - - if self.md_start > self.lowest_payload_offset: - raise ValueError('Unable to update metadata:') + # Collect the original metadata + self.get_original_metadata() try: from calibre.ebooks.conversion.config import load_defaults @@ -369,27 +346,33 @@ class MetadataUpdater(object): if mi.author_sort and pas: authors = mi.author_sort - update_metadata('Authors',authors.encode('utf-8')) + self.metadata['Authors'] = authors.encode('utf-8') elif mi.authors: authors = '; '.join(mi.authors) - update_metadata('Authors',authors) - update_metadata('Title',mi.title.encode('utf-8')) + self.metadata['Authors'] = authors.encode('utf-8') + self.metadata['Title'] = mi.title.encode('utf-8') updated_metadata = self.generate_metadata_stream() - head = self.fixup_topaz_headers(len(updated_metadata) - self.orig_md_len) - dkey = self.generate_dkey() - tail = copy.copy(self.data[self.md_end:]) + head = self.regenerate_headers(len(updated_metadata)) + + # Chunk1: self.base -> original metadata start + # Chunk2: original metadata end -> eof + chunk1 = self.data[self.base:self.original_md_start] + chunk2 = self.data[self.original_md_start + self.original_md_len:] self.stream.seek(0) self.stream.truncate(0) + + # Write the revised stream self.stream.write(head) - self.stream.write(self.eod) - if self.base_value: - self.stream.write(self.base_value) - if dkey: - self.stream.write(dkey) + self.stream.write('d') + 
self.stream.write(chunk1) + self.stream.write(chunk2) self.stream.write(updated_metadata) - self.stream.write(tail) + +def get_metadata(stream): + mu = MetadataUpdater(stream) + return mu.get_metadata() def set_metadata(stream, mi): mu = MetadataUpdater(stream) @@ -398,4 +381,6 @@ def set_metadata(stream, mi): if __name__ == '__main__': import cStringIO, sys - print get_metadata(open(sys.argv[1], 'rb')) + #print get_metadata(open(sys.argv[1], 'rb')) + mi = MetaInformation(title="My New Title", authors=['Smith, John']) + set_metadata(open(sys.argv[1], 'rb'), mi)