Encode the metadata block in UTF-8, thereby working around endianness issues and hopefully fixing #321

This commit is contained in:
Kovid Goyal 2007-12-01 00:13:53 +00:00
parent e5ce6adad4
commit 5cbe331c07

View File

@ -29,9 +29,12 @@ from shutil import copyfileobj
from cStringIO import StringIO from cStringIO import StringIO
import xml.dom.minidom as dom import xml.dom.minidom as dom
from functools import wraps from functools import wraps
from xml.parsers.expat import ParserCreate
from libprs500.devices.prs500.prstypes import field from libprs500.devices.prs500.prstypes import field
from libprs500.ebooks.metadata import MetaInformation from libprs500.ebooks.metadata import MetaInformation
from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
BYTE = "<B" #: Unsigned char little endian encoded in 1 byte BYTE = "<B" #: Unsigned char little endian encoded in 1 byte
WORD = "<H" #: Unsigned short little endian encoded in 2 bytes WORD = "<H" #: Unsigned short little endian encoded in 2 bytes
@ -94,12 +97,7 @@ class xml_attr_field(object):
def __get__(self, obj, typ=None): def __get__(self, obj, typ=None):
""" Return the data in this field or '' if the field is empty """ """ Return the data in this field or '' if the field is empty """
try: document = obj.info
document = dom.parseString(obj.info)
except Exception, err:
print >>sys.stderr, "Could not parse XML:", err
print obj.info
raise
elems = document.getElementsByTagName(self.tag_name) elems = document.getElementsByTagName(self.tag_name)
if len(elems): if len(elems):
elem = None elem = None
@ -113,12 +111,7 @@ class xml_attr_field(object):
def __set__(self, obj, val): def __set__(self, obj, val):
if val == None: if val == None:
val = "" val = ""
try: document = obj.info
document = dom.parseString(obj.info)
except Exception, err:
print >>sys.stderr, "Could not parse XML:", err
print obj.info
raise
elems = document.getElementsByTagName(self.tag_name) elems = document.getElementsByTagName(self.tag_name)
if len(elems): if len(elems):
elem = None elem = None
@ -127,8 +120,7 @@ class xml_attr_field(object):
elem = candidate elem = candidate
if elem: if elem:
elem.setAttribute(self.attr, val) elem.setAttribute(self.attr, val)
info = document.toxml(encoding='utf-16') obj.info = document
obj.info = info
def __repr__(self): def __repr__(self):
@ -152,12 +144,7 @@ class xml_field(object):
def __get__(self, obj, typ=None): def __get__(self, obj, typ=None):
""" Return the data in this field or '' if the field is empty """ """ Return the data in this field or '' if the field is empty """
try: document = obj.info
document = dom.parseString(obj.info)
except Exception, err:
print >>sys.stderr, "Could not parse XML:", err
print obj.info
raise
elems = document.getElementsByTagName(self.tag_name) elems = document.getElementsByTagName(self.tag_name)
if len(elems): if len(elems):
@ -174,12 +161,7 @@ class xml_field(object):
def __set__(self, obj, val): def __set__(self, obj, val):
if not val: if not val:
val = '' val = ''
try: document = obj.info
document = dom.parseString(obj.info)
except Exception, err:
print >>sys.stderr, "Could not parse XML:", err
print obj.info
raise
def create_elem(): def create_elem():
elem = document.createElement(self.tag_name) elem = document.createElement(self.tag_name)
parent = document.getElementsByTagName(self.parent)[0] parent = document.getElementsByTagName(self.parent)[0]
@ -206,8 +188,7 @@ class xml_field(object):
else: else:
elem = create_elem() elem = create_elem()
elem.appendChild(document.createTextNode(val)) elem.appendChild(document.createTextNode(val))
info = document.toxml(encoding='utf-16') obj.info = document
obj.info = info
def __str__(self): def __str__(self):
@ -371,9 +352,8 @@ class LRFMetaFile(object):
def info(): def info():
doc = \ doc = \
""" """
Document meta information in raw XML format as a byte string encoded in Document meta information as a minidom Document object.
utf-16. To set use a minidom document object.
To set use raw XML in a byte string encoded in utf-16.
""" """
def fget(self): def fget(self):
if self.compressed_info_size == 0: if self.compressed_info_size == 0:
@ -381,30 +361,16 @@ class LRFMetaFile(object):
size = self.compressed_info_size - 4 size = self.compressed_info_size - 4
self._file.seek(self.info_start) self._file.seek(self.info_start)
try: try:
src = zlib.decompress(self._file.read(size)) src = zlib.decompress(self._file.read(size))
if len(src) != self.uncompressed_info_size: if len(src) != self.uncompressed_info_size:
raise LRFException("Decompression of document meta info\ raise LRFException("Decompression of document meta info\
yielded unexpected results") yielded unexpected results")
candidate = unicode(src, 'utf-16', 'replace') return dom.parseString(src)
# LRF files produced with makelrf dont have a correctly
# encoded metadata block.
# Decoding using latin1 is the most useful for me since I
# occassionally read french books.
# pdflrf creates invalif metadata blocks
candidate = re.compile(u'</Info>.*', re.DOTALL).sub(u'</Info>', candidate)
if not u"Info" in candidate:
candidate = unicode(src, 'latin1', errors='ignore')
if candidate[-1:] == '\0':
candidate = candidate[:-1]
candidate = dom.parseString(candidate.encode('utf-8')).\
toxml(encoding='utf-16').strip()
else:
candidate = candidate.strip().encode('utf-16')
return candidate
except zlib.error: except zlib.error:
raise LRFException("Unable to decompress document meta information") raise LRFException("Unable to decompress document meta information")
def fset(self, info): def fset(self, document):
info = document.toxml('utf-8')
self.uncompressed_info_size = len(info) self.uncompressed_info_size = len(info)
stream = zlib.compress(info) stream = zlib.compress(info)
orig_size = self.compressed_info_size orig_size = self.compressed_info_size