Greatly improved decoding of the metadata block in LRF files. We now handle the incorrectly encoded metadata blocks produced by makelrf 0.3 by assuming a latin1 encoding. Also added additional fields and support for creating a field that does not yet exist. Metadata blocks are now always written correctly encoded in UTF-16, regardless of the encoding of the original metadata block.

This commit is contained in:
Kovid Goyal 2007-01-03 02:22:43 +00:00
parent c25865e76b
commit cead54c959

View File

@ -27,15 +27,13 @@ to get and set meta information. For example:
import struct import struct
import array import array
import zlib import zlib
import StringIO
import xml.dom.minidom as dom import xml.dom.minidom as dom
from xml.dom.ext import Print
from libprs500.prstypes import field from libprs500.prstypes import field
BYTE = "<B" #: Unsigned char little endian encoded in 1 byte BYTE = "<B" #: Unsigned char little endian encoded in 1 byte
WORD = "<H" #: Unsigned short little endian encoded in 2 bytes WORD = "<H" #: Unsigned short little endian encoded in 2 bytes
DWORD = "<I" #: Unsigned integer little endian encoded in 4 bytes DWORD = "<I" #: Unsigned integer little endian encoded in 4 bytes
QWORD = "<Q" #: Unsigned long long little endian encoded in 8 bytes QWORD = "<Q" #: Unsigned long long little endian encoded in 8 bytes
class versioned_field(field): class versioned_field(field):
@ -90,37 +88,69 @@ class xml_field(object):
""" """
Descriptor that gets and sets XML based meta information from an LRF file. Descriptor that gets and sets XML based meta information from an LRF file.
Works for simple XML fields of the form <tagname>data</tagname> Works for simple XML fields of the form <tagname>data</tagname>
""" """
def __init__(self, tag_name): def __init__(self, tag_name, parent="BookInfo"):
""" @param tag_name: The XML tag whoose data we operate on """ """
@param tag_name: The XML tag whose data we operate on
@param parent: The tagname of the parent element of C{tag_name}
"""
self.tag_name = tag_name self.tag_name = tag_name
self.parent = parent
def __get__(self, obj, typ=None): def __get__(self, obj, typ=None):
""" Return the data in this field or '' if the field is empty """
document = dom.parseString(obj.info) document = dom.parseString(obj.info)
elem = document.getElementsByTagName(self.tag_name)[0] elems = document.getElementsByTagName(self.tag_name)
elem.normalize() if len(elems):
if not elem.hasChildNodes(): elem = None
return "" for candidate in elems:
return elem.firstChild.data.strip() if candidate.parentNode.nodeName == self.parent:
elem = candidate
if elem:
elem.normalize()
if elem.hasChildNodes():
return elem.firstChild.data.strip()
return ""
def __set__(self, obj, val): def __set__(self, obj, val):
document = dom.parseString(obj.info) document = dom.parseString(obj.info)
elem = document.getElementsByTagName(self.tag_name)[0] def create_elem():
elem.normalize() elem = document.createElement(self.tag_name)
while elem.hasChildNodes(): elem.appendChild(dom.Text())
elem.removeChild(elem.lastChild) parent = document.getElementsByTagName(self.parent)[0]
elem.appendChild(dom.Text()) parent.appendChild(elem)
return elem
if not val:
val = u''
if type(val).__name__ != 'unicode':
val = unicode(val, 'utf-8')
elems = document.getElementsByTagName(self.tag_name)
elem = None
if len(elems):
for candidate in elems:
if candidate.parentNode.nodeName == self.parent:
elem = candidate
if not elem:
elem = create_elem()
else:
elem.normalize()
while elem.hasChildNodes():
elem.removeChild(elem.lastChild)
elem.appendChild(dom.Text())
else:
elem = create_elem()
elem.firstChild.data = val elem.firstChild.data = val
s = StringIO.StringIO() info = document.toxml(encoding='utf-16')
Print(document, s) obj.info = info
obj.info = s.getvalue()
s.close()
def __str__(self): def __str__(self):
return self.tag_name return self.tag_name
def __repr__(self): def __repr__(self):
return "XML Field: " + self.tag_name return "XML Field: " + self.tag_name + " in " + self.parent
class LRFMetaFile(object): class LRFMetaFile(object):
""" Has properties to read and write all Meta information in a LRF file. """ """ Has properties to read and write all Meta information in a LRF file. """
@ -128,36 +158,39 @@ class LRFMetaFile(object):
LRF_HEADER = u'LRF'.encode('utf-16')[2:]+'\0\0' LRF_HEADER = u'LRF'.encode('utf-16')[2:]+'\0\0'
lrf_header = fixed_stringfield(length=8, start=0) lrf_header = fixed_stringfield(length=8, start=0)
version = field(fmt=WORD, start=8) version = field(fmt=WORD, start=8)
xor_key = field(fmt=WORD, start=10) xor_key = field(fmt=WORD, start=10)
root_object_id = field(fmt=DWORD, start=12) root_object_id = field(fmt=DWORD, start=12)
number_of_objets = field(fmt=QWORD, start=16) number_of_objets = field(fmt=QWORD, start=16)
object_index_offset = field(fmt=QWORD, start=24) object_index_offset = field(fmt=QWORD, start=24)
binding = field(fmt=BYTE, start=36) binding = field(fmt=BYTE, start=36)
dpi = field(fmt=WORD, start=38) dpi = field(fmt=WORD, start=38)
width = field(fmt=WORD, start=42) width = field(fmt=WORD, start=42)
height = field(fmt=WORD, start=44) height = field(fmt=WORD, start=44)
color_depth = field(fmt=BYTE, start=46) color_depth = field(fmt=BYTE, start=46)
toc_object_id = field(fmt=DWORD, start=0x44) toc_object_id = field(fmt=DWORD, start=0x44)
toc_object_offset = field(fmt=DWORD, start=0x48) toc_object_offset = field(fmt=DWORD, start=0x48)
compressed_info_size = field(fmt=WORD, start=0x4c) compressed_info_size = field(fmt=WORD, start=0x4c)
thumbnail_type = versioned_field(version, 800, fmt=WORD, start=0x4e) thumbnail_type = versioned_field(version, 800, fmt=WORD, start=0x4e)
thumbnail_size = versioned_field(version, 800, fmt=DWORD, start=0x50) thumbnail_size = versioned_field(version, 800, fmt=DWORD, start=0x50)
uncompressed_info_size = versioned_field(compressed_info_size, 0, \ uncompressed_info_size = versioned_field(compressed_info_size, 0, \
fmt=DWORD, start=0x54) fmt=DWORD, start=0x54)
title = xml_field("Title") title = xml_field("Title", parent="BookInfo")
author = xml_field("Author") author = xml_field("Author", parent="BookInfo")
book_id = xml_field("BookID") book_id = xml_field("BookID", parent="BookInfo")
publisher = xml_field("Publisher") publisher = xml_field("Publisher", parent="BookInfo")
label = xml_field("Label") label = xml_field("Label", parent="BookInfo")
category = xml_field("Category") category = xml_field("Category", parent="BookInfo")
classification = xml_field("Classification", parent="BookInfo")
free_text = xml_field("FreeText", parent="BookInfo")
language = xml_field("Language") language = xml_field("Language", parent="DocInfo")
creator = xml_field("Creator") creator = xml_field("Creator", parent="DocInfo")
creation_date = xml_field("CreationDate") #: Format is %Y-%m-%d # Format is %Y-%m-%d
producer = xml_field("Producer") creation_date = xml_field("CreationDate", parent="DocInfo")
page = xml_field("Page") producer = xml_field("Producer", parent="DocInfo")
page = xml_field("Page", parent="DocInfo")
def safe(func): def safe(func):
""" """
@ -198,20 +231,36 @@ class LRFMetaFile(object):
@safe_property @safe_property
def info(): def info():
doc = """ Document meta information in raw XML format """ doc = \
"""
Document meta information in raw XML format as a byte string encoded in
utf-16.
To set use raw XML in a byte string encoded in utf-16.
"""
def fget(self): def fget(self):
if self.compressed_info_size == 0: if self.compressed_info_size == 0:
raise LRFException("This document has no meta info") raise LRFException("This document has no meta info")
size = self.compressed_info_size - 4 size = self.compressed_info_size - 4
self._file.seek(self.info_start) self._file.seek(self.info_start)
try: try:
stream = zlib.decompress(self._file.read(size)) src = zlib.decompress(self._file.read(size))
if len(stream) != self.uncompressed_info_size: if len(src) != self.uncompressed_info_size:
raise LRFException("Decompression of document meta info\ raise LRFException("Decompression of document meta info\
yielded unexpected results") yielded unexpected results")
# Remove null characters from string as in some LRF files candidate = unicode(src, 'utf-16')
# the stream is null-terminated # LRF files produced with makelrf don't have a correctly
return stream.strip().replace('\0', '') # encoded metadata block.
# Decoding using latin1 is the most useful for me since I
# occasionally read French books.
if not u"Info" in candidate:
candidate = unicode(src, 'latin1', errors='ignore')
if candidate[-1:] == '\0':
candidate = candidate[:-1]
candidate = dom.parseString(candidate.encode('utf-8')).\
toxml(encoding='utf-16')
else:
candidate = candidate.encode('utf-16')
return candidate.strip()
except zlib.error, e: except zlib.error, e:
raise LRFException("Unable to decompress document meta information") raise LRFException("Unable to decompress document meta information")
@ -222,6 +271,7 @@ class LRFMetaFile(object):
self._file.seek(self.info_start) self._file.seek(self.info_start)
self._file.write(stream) self._file.write(stream)
self._file.flush() self._file.flush()
return { "fget":fget, "fset":fset, "doc":doc } return { "fget":fget, "fset":fset, "doc":doc }
@safe_property @safe_property
@ -273,7 +323,7 @@ class LRFMetaFile(object):
ttype = 0x11 ttype = 0x11
self.thumbnail_type = ttype self.thumbnail_type = ttype
# Needed as new thumbnail may have different size than old thumbnail # Needed as new thumbnail may have different size than old thumbnail
self.update_object_offsets(self.toc_object_offset - orig_offset) self.update_object_offsets(self.toc_object_offset - orig_offset)
return { "fget":fget, "fset":fset, "doc":doc } return { "fget":fget, "fset":fset, "doc":doc }
def __init__(self, file): def __init__(self, file):