Support for setting metadata in Topaz files and getting PDF annotations from a Kindle

Kovid Goyal 2010-03-09 15:36:17 -07:00
commit 1c7dae888b
6 changed files with 453 additions and 26 deletions

View File

@@ -378,6 +378,17 @@ class RTFMetadataWriter(MetadataWriterPlugin):
from calibre.ebooks.metadata.rtf import set_metadata
set_metadata(stream, mi)
class TOPAZMetadataWriter(MetadataWriterPlugin):
name = 'Set TOPAZ metadata'
file_types = set(['tpz', 'azw1'])
description = _('Set metadata in %s files')%'TOPAZ'
author = 'Greg Riker'
def set_metadata(self, stream, mi, type):
from calibre.ebooks.metadata.topaz import set_metadata
set_metadata(stream, mi)
from calibre.ebooks.comic.input import ComicInput
from calibre.ebooks.epub.input import EPUBInput
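With the writer registered, Topaz metadata can be updated through calibre's normal metadata-writing path. A minimal sketch, assuming the generic dispatcher calibre.ebooks.metadata.meta.set_metadata(stream, mi, stream_type) routes to the writer plugin registered for the file type; the file name is a placeholder:

from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.meta import set_metadata   # assumed dispatcher entry point

mi = MetaInformation('New Title', ['First Last'])
with open('book.tpz', 'r+b') as stream:   # placeholder path
    set_metadata(stream, mi, stream_type='tpz')   # should reach TOPAZMetadataWriter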

View File

@@ -41,7 +41,7 @@ class KINDLE(USBMS):
EBOOK_DIR_MAIN = 'documents'
EBOOK_DIR_CARD_A = 'documents'
DELETE_EXTS = ['.mbp']
DELETE_EXTS = ['.mbp','.tan','.pdr']
SUPPORTS_SUB_DIRS = True
SUPPORTS_ANNOTATIONS = True
@@ -63,6 +63,7 @@ class KINDLE(USBMS):
def get_annotations(self, path_map):
MBP_FORMATS = [u'azw', u'mobi', u'prc', u'txt']
TAN_FORMATS = [u'tpz', u'azw1']
PDR_FORMATS = [u'pdf']
mbp_formats = set()
for fmt in MBP_FORMATS:
@@ -70,6 +71,9 @@ class KINDLE(USBMS):
tan_formats = set()
for fmt in TAN_FORMATS:
tan_formats.add(fmt)
pdr_formats = set()
for fmt in PDR_FORMATS:
pdr_formats.add(fmt)
def get_storage():
storage = []
@@ -88,7 +92,6 @@ class KINDLE(USBMS):
file_fmts = set()
for fmt in path_map[id]['fmts']:
file_fmts.add(fmt)
bookmark_extension = None
if file_fmts.intersection(mbp_formats):
book_extension = list(file_fmts.intersection(mbp_formats))[0]
@@ -96,6 +99,9 @@ class KINDLE(USBMS):
elif file_fmts.intersection(tan_formats):
book_extension = list(file_fmts.intersection(tan_formats))[0]
bookmark_extension = 'tan'
elif file_fmts.intersection(pdr_formats):
book_extension = list(file_fmts.intersection(pdr_formats))[0]
bookmark_extension = 'pdr'
if bookmark_extension:
for vol in storage:
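The hunk above pairs each book format family with the sidecar file that carries its annotations: .mbp for MOBI-family books, .tan for Topaz, and now .pdr for PDF. The same selection as a standalone sketch (SIDECAR_MAP and sidecar_for are illustrative names, not part of the driver):

SIDECAR_MAP = [
    (frozenset(['azw', 'mobi', 'prc', 'txt']), 'mbp'),
    (frozenset(['tpz', 'azw1']), 'tan'),
    (frozenset(['pdf']), 'pdr'),
]

def sidecar_for(file_fmts):
    # Returns (book_extension, bookmark_extension), or (None, None) if no
    # format of the book has an annotation sidecar.
    for fmts, sidecar in SIDECAR_MAP:
        common = fmts & set(file_fmts)
        if common:
            return list(common)[0], sidecar
    return None, None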
@@ -165,10 +171,13 @@ class Bookmark():
self.get_bookmark_data()
self.get_book_length()
try:
self.percent_read = float(100*self.last_read / self.book_length)
except:
self.percent_read = 0
if self.book_length >= 0:
try:
self.percent_read = float(100*self.last_read / self.book_length)
except:
self.percent_read = 0
else:
self.percent_read = -1
def record(self, n):
from calibre.ebooks.metadata.mobi import StreamSlicer
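The guarded calculation added above, as a free-standing sketch; -1 signals an unknown book length, which is what the .pdr path reports for now:

def percent_read(last_read, book_length):
    if book_length < 0:
        return -1          # length unknown (e.g. PDF bookmarks)
    try:
        return float(100 * last_read / book_length)
    except ZeroDivisionError:
        return 0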
@@ -280,6 +289,9 @@ class Bookmark():
def get_topaz_highlight(displayed_location):
# Parse My Clippings.txt for a matching highlight
# Search looks for book title match, highlight match, and location match
# Author is not matched
# This will find the first instance of a clipping only
book_fs = self.path.replace('.%s' % self.bookmark_extension,'.%s' % self.book_format)
with open(book_fs,'rb') as f2:
stream = StringIO(f2.read())
@@ -291,7 +303,7 @@ class Bookmark():
with open(my_clippings, 'r') as f2:
marker_found = 0
text = ''
search_str1 = '%s (%s)' % (mi.title, str(mi.author[0]))
search_str1 = '%s' % (mi.title)
search_str2 = '- Highlight Loc. %d' % (displayed_location)
for line in f2:
if marker_found == 0:
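Topaz annotation files do not carry the highlighted text itself, so get_topaz_highlight() scans My Clippings.txt for the first entry whose title line and '- Highlight Loc. N' line both match. A condensed sketch of that scan, assuming the usual clipping layout (title line, location line, blank line, highlight text, '==========' separator):

def find_highlight(clippings_path, title, location):
    text, marker_found = '', 0
    with open(clippings_path, 'r') as f:
        for line in f:
            if marker_found == 0:
                if title in line:
                    marker_found = 1
            elif marker_found == 1:
                if ('- Highlight Loc. %d' % location) in line:
                    marker_found = 2
            elif marker_found == 2:
                if line.startswith('=========='):
                    break
                text += line.strip()
    return text if text else None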
@@ -336,6 +348,47 @@ class Bookmark():
else:
e_type = 'Unknown annotation type'
displayed_location = location/MAGIC_TOPAZ_CONSTANT + 1
user_notes[location] = dict(id=self.id,
displayed_location=displayed_location,
type=e_type,
text=text)
if text_len == 0xFFFFFFFF:
e_base = e_base + 14
else:
e_base = e_base + 14 + 2 + text_len
current_entry += 1
for location in user_notes:
if location == self.last_read:
user_notes.pop(location)
break
elif self.bookmark_extension == 'pdr':
self.timestamp = os.path.getmtime(self.path)
with open(self.path,'rb') as f:
stream = StringIO(f.read())
data = StreamSlicer(stream)
self.last_read = int(unpack('>I', data[5:9])[0])
entries, = unpack('>I', data[9:13])
current_entry = 0
e_base = 0x0d
while current_entry < entries:
'''
location, = unpack('>I', data[e_base+2:e_base+6])
text = None
text_len, = unpack('>I', data[e_base+0xA:e_base+0xE])
e_type, = unpack('>B', data[e_base+1])
if e_type == 0:
e_type = 'Bookmark'
elif e_type == 1:
e_type = 'Highlight'
text = get_topaz_highlight(location/MAGIC_TOPAZ_CONSTANT + 1)
elif e_type == 2:
e_type = 'Note'
text = data[e_base+0x10:e_base+0x10+text_len]
else:
e_type = 'Unknown annotation type'
if self.book_format in ['tpz','azw1']:
displayed_location = location/MAGIC_TOPAZ_CONSTANT + 1
elif self.book_format == 'pdf':
@@ -350,10 +403,24 @@ class Bookmark():
else:
e_base = e_base + 14 + 2 + text_len
current_entry += 1
for location in user_notes:
if location == self.last_read:
user_notes.pop(location)
break
'''
# Use label as page number
pdf_location, = unpack('>I', data[e_base+1:e_base+5])
label_len, = unpack('>H', data[e_base+5:e_base+7])
location = int(data[e_base+7:e_base+7+label_len])
displayed_location = location
e_type = 'Bookmark'
text = None
user_notes[location] = dict(id=self.id,
displayed_location=displayed_location,
type=e_type,
text=text)
self.pdf_page_offset = pdf_location - location
e_base += (7 + label_len)
current_entry += 1
self.last_read_location = self.last_read - self.pdf_page_offset
else:
print "unsupported bookmark_extension: %s" % self.bookmark_extension
self.user_notes = user_notes
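For reference, the .pdr entry layout the loop above assumes; this is inferred from the unpack() calls, not a published format. Each entry is treated as a bookmark whose page label doubles as the displayed page number, and the constant difference between the stored PDF location and that label is kept as pdf_page_offset so last_read can be mapped to a page:

# Assumed .pdr entry layout (offsets relative to the start of an entry):
#   +0x00  1 byte   entry marker (skipped by the parser)
#   +0x01  4 bytes  big-endian PDF location
#   +0x05  2 bytes  big-endian length of the page label
#   +0x07  n bytes  ASCII page label, used as the displayed page number
from struct import unpack

def read_pdr_entry(data, e_base):
    pdf_location, = unpack('>I', data[e_base + 1:e_base + 5])
    label_len, = unpack('>H', data[e_base + 5:e_base + 7])
    page_label = int(data[e_base + 7:e_base + 7 + label_len])
    return pdf_location, page_label, e_base + 7 + label_len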
@@ -390,5 +457,9 @@ class Bookmark():
length = ord(raw[idx+len('bookLength')])
self.book_length = int(raw[idx+len('bookLength')+1:idx+len('bookLength')+1+length])
elif self.bookmark_extension == 'pdr':
# Book length not yet implemented for PDF files
self.book_length = -1
else:
print "unsupported bookmark_extension: %s" % self.bookmark_extension

View File

@@ -23,7 +23,7 @@ class DRMError(ValueError):
pass
BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'htm', 'xhtm',
'html', 'xhtml', 'pdf', 'pdb', 'prc', 'mobi', 'azw', 'doc',
'html', 'xhtml', 'pdf', 'pdb', 'pdr', 'prc', 'mobi', 'azw', 'doc',
'epub', 'fb2', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip',
'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'mbp', 'tan']

View File

@@ -1,9 +1,13 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__copyright__ = '2010, Greg Riker <griker@hotmail.com>'
__docformat__ = 'restructuredtext en'
''' Read metadata from Amazon's topaz format '''
''' Read/write metadata from Amazon's topaz format '''
import copy, StringIO
from struct import pack, unpack
from calibre.ebooks.metadata import MetaInformation
def read_record(raw, name):
idx = raw.find(name)
@@ -32,9 +36,336 @@ def get_metadata(stream):
title = title.decode('utf-8', 'replace')
else:
raise ValueError('No metadata in file')
from calibre.ebooks.metadata import MetaInformation
#from calibre.ebooks.metadata import MetaInformation
return MetaInformation(title, authors)
class StreamSlicer(object):
def __init__(self, stream, start=0, stop=None):
self._stream = stream
self.start = start
if stop is None:
stream.seek(0, 2)
stop = stream.tell()
self.stop = stop
self._len = stop - start
def __len__(self):
return self._len
def __getitem__(self, key):
stream = self._stream
base = self.start
if isinstance(key, (int, long)):
stream.seek(base + key)
return stream.read(1)
if isinstance(key, slice):
start, stop, stride = key.indices(self._len)
if stride < 0:
start, stop = stop, start
size = stop - start
if size <= 0:
return ""
stream.seek(base + start)
data = stream.read(size)
if stride != 1:
data = data[::stride]
return data
raise TypeError("stream indices must be integers")
def __setitem__(self, key, value):
stream = self._stream
base = self.start
if isinstance(key, (int, long)):
if len(value) != 1:
raise ValueError("key and value lengths must match")
stream.seek(base + key)
return stream.write(value)
if isinstance(key, slice):
start, stop, stride = key.indices(self._len)
if stride < 0:
start, stop = stop, start
size = stop - start
if stride != 1:
value = value[::stride]
if len(value) != size:
raise ValueError("key and value lengths must match")
stream.seek(base + start)
return stream.write(value)
raise TypeError("stream indices must be integers")
def update(self, data_blocks):
# Rewrite the stream
stream = self._stream
base = self.start
stream.seek(base)
self._stream.truncate(base)
for block in data_blocks:
stream.write(block)
def truncate(self, value):
self._stream.truncate(value)
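StreamSlicer wraps any seekable stream so that single bytes and slices can be read or written with subscript syntax, seeking in the underlying stream on each access. A toy usage sketch (buffer contents are invented):

import StringIO

stream = StringIO.StringIO()
stream.write('TPZ0' + 'x' * 12)
data = StreamSlicer(stream)
print data[0:3]     # 'TPZ' -- slice reads seek and read from the stream
data[3] = '1'       # single-byte write at offset 3
print len(data)     # 16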
class MetadataUpdater(object):
def __init__(self, stream):
self.stream = stream
raw = stream.read(8*1024)
if not raw.startswith('TPZ'):
raise ValueError('Not a Topaz file')
first = raw.find('metadata')
if first < 0:
raise ValueError('Invalid Topaz file')
second = raw.find('metadata', first+10)
if second < 0:
raise ValueError('Invalid Topaz file')
self.md_start = second-1
self.data = StreamSlicer(stream)
self.header_records, = unpack('>B',self.data[4])
offset = self.get_md_header(self.md_start)
self.metadata = {}
self.md_end = self.get_original_metadata(offset)
self.orig_md_len = self.md_end - self.md_start
def decode_vwi(self,bytes):
pos, val = 0, 0
done = False
while pos < len(bytes) and not done:
b = ord(bytes[pos])
pos += 1
if (b & 0x80) == 0:
done = True
b &= 0x7F
val <<= 7
val |= b
if done: break
return val, pos
def encode_vwi(self,value):
bytes = []
multi_byte = (value > 0x7f)
while value:
b = value & 0x7f
value >>= 7
if value == 0:
if multi_byte:
bytes.append(b|0x80)
if len(bytes) == 4:
return pack('>BBBB',bytes[3],bytes[2],bytes[1],bytes[0]).decode('iso-8859-1')
elif len(bytes) == 3:
return pack('>BBB',bytes[2],bytes[1],bytes[0]).decode('iso-8859-1')
elif len(bytes) == 2:
return pack('>BB',bytes[1],bytes[0]).decode('iso-8859-1')
else:
return pack('>B', b).decode('iso-8859-1')
else:
if len(bytes):
bytes.append(b|0x80)
else:
bytes.append(b)
# If value == 0, return 0
return pack('>B', 0x0).decode('iso-8859-1')
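decode_vwi() and encode_vwi() handle the variable-width integers used throughout the Topaz header: seven payload bits per byte, most significant group first, with the high bit set on every byte except the last. A round-trip sketch (the file name is a placeholder for any valid Topaz file; the test values are arbitrary):

with open('book.tpz', 'r+b') as f:       # placeholder path
    mu = MetadataUpdater(f)
    for value in (0, 0x7f, 0x80, 0x3fff, 0x4000):
        encoded = mu.encode_vwi(value).encode('iso-8859-1')
        decoded, consumed = mu.decode_vwi(encoded)
        assert decoded == value and consumed == len(encoded)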
def fixup_topaz_headers(self, size_delta):
# Rewrite Topaz Header. Any offset > md_hdr_offset needs to be adjusted
ths = StringIO.StringIO()
md_header_offset = self.md_header_offset
# Copy the first 5 bytes
ths.write(self.data[:5])
md_record = False
for th in self.topaz_headers:
ths.write('c')
ths.write(self.encode_vwi(len(self.topaz_headers[th]['tag'])))
ths.write(self.topaz_headers[th]['tag'])
ths.write(self.encode_vwi(len(self.topaz_headers[th]['blocks'])))
for block in self.topaz_headers[th]['blocks']:
b = self.topaz_headers[th]['blocks'][block]
if b['hdr_offset'] > md_header_offset:
vwi = self.encode_vwi(b['hdr_offset'] + size_delta)
else:
vwi = self.encode_vwi(b['hdr_offset'])
ths.write(vwi)
if self.topaz_headers[th]['tag'] == 'metadata':
ths.write(self.encode_vwi(b['len_uncomp'] + size_delta))
else:
ths.write(self.encode_vwi(b['len_uncomp']))
ths.write(self.encode_vwi(b['len_comp']))
return ths.getvalue().encode('iso-8859-1')
def generate_dkey(self):
for x in self.topaz_headers:
if self.topaz_headers[x]['tag'] == 'dkey':
offset = self.base + self.topaz_headers[x]['blocks'][0]['hdr_offset']
len_uncomp = self.topaz_headers[x]['blocks'][0]['len_uncomp']
break
dkey = self.topaz_headers[x]
dks = StringIO.StringIO()
dks.write('d@')
dks.write(self.encode_vwi(len(dkey['tag'])))
offset += 1
dks.write(dkey['tag'])
offset += len('dkey')
dks.write(chr(0))
offset += 1
dks.write(self.data[offset:offset + len_uncomp].decode('iso-8859-1'))
return dks.getvalue().encode('iso-8859-1')
def get_topaz_headers(self):
offset = 5
md_header_offset = 0
dkey_len = 0
# Find the offset of the metadata header record
for hr in range(self.header_records):
marker = self.data[offset]
offset += 1
taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
tag = self.data[offset:offset+taglen]
offset += taglen
if not tag == 'metadata':
num_vals, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
for val in range(num_vals):
foo, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
foo, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
foo, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
continue
num_vals, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
md_header_offset, consumed = self.decode_vwi(self.data[offset:offset+4])
break
self.md_header_offset = md_header_offset
offset = 5
topaz_headers = {}
for x in range(self.header_records):
marker = self.data[offset]
offset += 1
taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
tag = self.data[offset:offset+taglen]
offset += taglen
num_vals, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
blocks = {}
for val in range(num_vals):
hdr_offset, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
len_uncomp, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
len_comp, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
blocks[val] = dict(hdr_offset=hdr_offset,len_uncomp=len_uncomp,len_comp=len_comp)
topaz_headers[x] = dict(tag=tag,blocks=blocks)
self.topaz_headers = topaz_headers
eod = self.data[offset]
offset += 1
self.base = offset
return md_header_offset, topaz_headers
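For reference, the header-record layout get_topaz_headers() walks and the shape of the dictionary it builds. The byte layout is an inference from the decode_vwi() calls above, not a published specification, and the example values are invented:

# One header record (records start at offset 5; the record count is the single
# byte at offset 4):
#   1 byte      record marker
#   vwi         tag length, followed by the tag bytes (e.g. 'metadata', 'dkey')
#   vwi         number of blocks
#   per block:  vwi hdr_offset, vwi len_uncomp, vwi len_comp
# A one-byte end-of-directory marker follows the records; block offsets are
# relative to self.base, the byte just past that marker.
example_topaz_headers = {
    0: {'tag': 'metadata',
        'blocks': {0: {'hdr_offset': 0, 'len_uncomp': 181, 'len_comp': 181}}},
    1: {'tag': 'dkey',
        'blocks': {0: {'hdr_offset': 181, 'len_uncomp': 16, 'len_comp': 16}}},
}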
def generate_metadata_stream(self):
ms = StringIO.StringIO()
# Generate the header
ms.write(self.encode_vwi(len(self.md_header['tag'])).encode('iso-8859-1'))
ms.write(self.md_header['tag'])
ms.write(chr(self.md_header['flags']))
ms.write(chr(len(self.metadata)))
# Add the metadata fields.
for item in self.metadata:
ms.write(self.encode_vwi(len(self.metadata[item]['tag'])).encode('iso-8859-1'))
ms.write(self.metadata[item]['tag'])
ms.write(self.encode_vwi(len(self.metadata[item]['metadata'])).encode('iso-8859-1'))
ms.write(self.metadata[item]['metadata'])
return ms.getvalue()
def get_md_header(self,offset):
md_header = {}
taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
md_header['tag'] = self.data[offset:offset+taglen]
offset += taglen
md_header['flags'] = ord(self.data[offset])
offset += 1
md_header['records'] = ord(self.data[offset])
offset += 1
self.md_header = md_header
return offset
def get_original_metadata(self,offset):
for x in range(self.md_header['records']):
md_record = {}
taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
md_record['tag'] = self.data[offset:offset+taglen]
offset += taglen
md_len, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
md_record['metadata'] = self.data[offset:offset + md_len]
offset += md_len
self.metadata[x] = md_record
return offset
def hexdump(self, src, length=16):
# Diagnostic
FILTER=''.join([(len(repr(chr(x)))==3) and chr(x) or '.' for x in range(256)])
N=0; result=''
while src:
s,src = src[:length],src[length:]
hexa = ' '.join(["%02X"%ord(x) for x in s])
s = s.translate(FILTER)
result += "%04X %-*s %s\n" % (N, length*3, hexa, s)
N+=length
print result
def update(self,mi):
def update_metadata(tag,value):
for item in self.metadata:
if self.metadata[item]['tag'] == tag:
self.metadata[item]['metadata'] = value
return
self.get_topaz_headers()
try:
from calibre.ebooks.conversion.config import load_defaults
prefs = load_defaults('mobi_output')
pas = prefs.get('prefer_author_sort', False)
except:
pas = False
if mi.author_sort and pas:
authors = mi.author_sort
update_metadata('Authors',authors.encode('utf-8'))
elif mi.authors:
authors = '; '.join(mi.authors)
update_metadata('Authors',authors)
update_metadata('Title',mi.title.encode('utf-8'))
updated_metadata = self.generate_metadata_stream()
head = self.fixup_topaz_headers(len(updated_metadata) - self.orig_md_len)
dkey = self.generate_dkey()
tail = copy.copy(self.data[self.md_end:])
self.stream.seek(0)
self.stream.truncate(0)
self.stream.write(head)
self.stream.write(dkey)
self.stream.write(updated_metadata)
self.stream.write(tail)
def set_metadata(stream, mi):
mu = MetadataUpdater(stream)
mu.update(mi)
return
if __name__ == '__main__':
import sys
print get_metadata(open(sys.argv[1], 'rb'))
import cStringIO, sys
print get_metadata(open(sys.argv[1], 'rb'))
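A direct round trip through the new module-level writer; the file name is a placeholder, and the Topaz file is rewritten in place by MetadataUpdater.update():

from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.topaz import get_metadata, set_metadata

with open('book.tpz', 'r+b') as stream:          # placeholder path
    set_metadata(stream, MetaInformation('New Title', ['First Last']))
print get_metadata(open('book.tpz', 'rb'))       # read the updated title/authors back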

View File

@@ -326,12 +326,17 @@ class FileIconProvider(QFileIconProvider):
'lrf' : 'lrf',
'lrx' : 'lrx',
'pdf' : 'pdf',
'pdr' : 'zero',
'rar' : 'rar',
'zip' : 'zip',
'txt' : 'txt',
'prc' : 'mobi',
'azw' : 'mobi',
'mobi' : 'mobi',
'mbp' : 'zero',
'azw1' : 'mobi',
'tpz' : 'mobi',
'tan' : 'zero',
'epub' : 'epub',
'fb2' : 'fb2',
}

View File

@@ -1012,9 +1012,15 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
# Add the last-read location
spanTag = Tag(ka_soup, 'span')
spanTag['style'] = 'font-weight:bold'
spanTag.insert(0,NavigableString("%s<br />Last Page Read: Location %d (%d%%)" % \
(strftime(u'%x', timestamp.timetuple()),
last_read_location, percent_read)))
if bookmark.book_format == 'pdf':
spanTag.insert(0,NavigableString("%s<br />Last Page Read: %d" % \
(strftime(u'%x', timestamp.timetuple()),
last_read_location)))
else:
spanTag.insert(0,NavigableString("%s<br />Last Page Read: Location %d (%d%%)" % \
(strftime(u'%x', timestamp.timetuple()),
last_read_location,
percent_read)))
divTag.insert(dtc, spanTag)
dtc += 1
@@ -1036,9 +1042,14 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
user_notes[location]['type'] == 'Note' else \
'<i>%s</i>' % user_notes[location]['text']))
else:
annotations.append('<b>Location %d &bull; %s</b><br />' % \
(user_notes[location]['displayed_location'],
user_notes[location]['type']))
if bookmark.book_format == 'pdf':
annotations.append('<b>Page %d &bull; %s</b><br />' % \
(user_notes[location]['displayed_location'],
user_notes[location]['type']))
else:
annotations.append('<b>Location %d &bull; %s</b><br />' % \
(user_notes[location]['displayed_location'],
user_notes[location]['type']))
for annotation in annotations:
divTag.insert(dtc, annotation)
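The two branches above differ only in the label used for the position; a condensed sketch of the same formatting with invented values:

book_format = 'pdf'
note = {'displayed_location': 12, 'type': 'Bookmark'}
label = 'Page' if book_format == 'pdf' else 'Location'
print '<b>%s %d &bull; %s</b><br />' % (label, note['displayed_location'], note['type'])
# -> <b>Page 12 &bull; Bookmark</b><br />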
@@ -1074,11 +1085,9 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
mi.comments = unicode(user_notes_soup.prettify())
# Update library comments
self.db.set_comment(id, mi.comments)
'''
# Add bookmark file to id
self.db.add_format_with_hooks(id, bm.bookmark.bookmark_extension,
bm.bookmark.path, index_is_id=True)
'''
self.update_progress.emit(i)
self.update_done.emit()
self.done_callback(self.am.keys())
@@ -1522,7 +1531,7 @@ class Main(MainWindow, Ui_MainWindow, DeviceGUI):
if single_format is not None:
opts.formats = single_format
# Special case for Kindle annotation files
if single_format.lower() == 'mbp' or single_format == 'tan':
if single_format.lower() in ['mbp','pdr','tan']:
opts.to_lowercase = False
opts.save_cover = False
opts.write_opf = False