Greatly improved decoding of the metadata block in LRF files. We now handle the incorrectly encoded metadata blocks produced by makelrf 0.3 by falling back to a latin1 decoding. Also added additional fields and support for creating a field that does not yet exist in the metadata. Metadata blocks are now always written correctly encoded in utf-16, regardless of the encoding of the original metadata block.

This commit is contained in:
Kovid Goyal 2007-01-03 02:22:43 +00:00
parent c25865e76b
commit cead54c959

View File

@ -27,9 +27,7 @@ to get and set meta information. For example:
import struct
import array
import zlib
import StringIO
import xml.dom.minidom as dom
from xml.dom.ext import Print
from libprs500.prstypes import field
@ -91,36 +89,68 @@ class xml_field(object):
Descriptor that gets and sets XML based meta information from an LRF file.
Works for simple XML fields of the form <tagname>data</tagname>
"""
def __init__(self, tag_name):
""" @param tag_name: The XML tag whoose data we operate on """
def __init__(self, tag_name, parent="BookInfo"):
"""
@param tag_name: The XML tag whose data we operate on
@param parent: The tagname of the parent element of C{tag_name}
"""
self.tag_name = tag_name
self.parent = parent
def __get__(self, obj, typ=None):
""" Return the data in this field or '' if the field is empty """
document = dom.parseString(obj.info)
elem = document.getElementsByTagName(self.tag_name)[0]
elems = document.getElementsByTagName(self.tag_name)
if len(elems):
elem = None
for candidate in elems:
if candidate.parentNode.nodeName == self.parent:
elem = candidate
if elem:
elem.normalize()
if not elem.hasChildNodes():
return ""
if elem.hasChildNodes():
return elem.firstChild.data.strip()
return ""
def __set__(self, obj, val):
document = dom.parseString(obj.info)
elem = document.getElementsByTagName(self.tag_name)[0]
def create_elem():
elem = document.createElement(self.tag_name)
elem.appendChild(dom.Text())
parent = document.getElementsByTagName(self.parent)[0]
parent.appendChild(elem)
return elem
if not val:
val = u''
if type(val).__name__ != 'unicode':
val = unicode(val, 'utf-8')
elems = document.getElementsByTagName(self.tag_name)
elem = None
if len(elems):
for candidate in elems:
if candidate.parentNode.nodeName == self.parent:
elem = candidate
if not elem:
elem = create_elem()
else:
elem.normalize()
while elem.hasChildNodes():
elem.removeChild(elem.lastChild)
elem.appendChild(dom.Text())
else:
elem = create_elem()
elem.firstChild.data = val
s = StringIO.StringIO()
Print(document, s)
obj.info = s.getvalue()
s.close()
info = document.toxml(encoding='utf-16')
obj.info = info
def __str__(self):
return self.tag_name
def __repr__(self):
return "XML Field: " + self.tag_name
return "XML Field: " + self.tag_name + " in " + self.parent
class LRFMetaFile(object):
""" Has properties to read and write all Meta information in a LRF file. """
@ -146,18 +176,21 @@ class LRFMetaFile(object):
uncompressed_info_size = versioned_field(compressed_info_size, 0, \
fmt=DWORD, start=0x54)
title = xml_field("Title")
author = xml_field("Author")
book_id = xml_field("BookID")
publisher = xml_field("Publisher")
label = xml_field("Label")
category = xml_field("Category")
title = xml_field("Title", parent="BookInfo")
author = xml_field("Author", parent="BookInfo")
book_id = xml_field("BookID", parent="BookInfo")
publisher = xml_field("Publisher", parent="BookInfo")
label = xml_field("Label", parent="BookInfo")
category = xml_field("Category", parent="BookInfo")
classification = xml_field("Classification", parent="BookInfo")
free_text = xml_field("FreeText", parent="BookInfo")
language = xml_field("Language")
creator = xml_field("Creator")
creation_date = xml_field("CreationDate") #: Format is %Y-%m-%d
producer = xml_field("Producer")
page = xml_field("Page")
language = xml_field("Language", parent="DocInfo")
creator = xml_field("Creator", parent="DocInfo")
# Format is %Y-%m-%d
creation_date = xml_field("CreationDate", parent="DocInfo")
producer = xml_field("Producer", parent="DocInfo")
page = xml_field("Page", parent="DocInfo")
def safe(func):
"""
@ -198,20 +231,36 @@ class LRFMetaFile(object):
@safe_property
def info():
doc = """ Document meta information in raw XML format """
doc = \
"""
Document meta information in raw XML format as a byte string encoded in
utf-16.
To set use raw XML in a byte string encoded in utf-16.
"""
def fget(self):
if self.compressed_info_size == 0:
raise LRFException("This document has no meta info")
size = self.compressed_info_size - 4
self._file.seek(self.info_start)
try:
stream = zlib.decompress(self._file.read(size))
if len(stream) != self.uncompressed_info_size:
src = zlib.decompress(self._file.read(size))
if len(src) != self.uncompressed_info_size:
raise LRFException("Decompression of document meta info\
yielded unexpected results")
# Remove null characters from string as in some LRF files
# the stream is null-terminated
return stream.strip().replace('\0', '')
candidate = unicode(src, 'utf-16')
# LRF files produced with makelrf don't have a correctly
# encoded metadata block.
# Decoding using latin1 is the most useful for me since I
# occasionally read French books.
if not u"Info" in candidate:
candidate = unicode(src, 'latin1', errors='ignore')
if candidate[-1:] == '\0':
candidate = candidate[:-1]
candidate = dom.parseString(candidate.encode('utf-8')).\
toxml(encoding='utf-16')
else:
candidate = candidate.encode('utf-16')
return candidate.strip()
except zlib.error, e:
raise LRFException("Unable to decompress document meta information")
@ -222,6 +271,7 @@ class LRFMetaFile(object):
self._file.seek(self.info_start)
self._file.write(stream)
self._file.flush()
return { "fget":fget, "fset":fset, "doc":doc }
@safe_property