Encode metadata block in UTF-8 thereby working around endianess issues and hopefully fixing #321

2025-07-09 03:04:10 -04:00 · 2007-12-01 00:13:53 +00:00 · 2007-12-01 00:13:53 +00:00 · 5cbe331c07
commit 5cbe331c07
parent e5ce6adad4
1 changed files with 16 additions and 50 deletions
--- a/src/libprs500/ebooks/lrf/meta.py
+++ b/src/libprs500/ebooks/lrf/meta.py
@ -29,9 +29,12 @@ from shutil import copyfileobj
 from cStringIO import StringIO
 import xml.dom.minidom as dom
 from functools import wraps
 from xml.parsers.expat import ParserCreate
 from libprs500.devices.prs500.prstypes import field
 from libprs500.ebooks.metadata import MetaInformation
 from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup
 BYTE      = "<B"  #: Unsigned char little endian encoded in 1 byte 
 WORD      = "<H"  #: Unsigned short little endian encoded in 2 bytes 
@ -94,12 +97,7 @@ class xml_attr_field(object):
    def __get__(self, obj, typ=None):
        """ Return the data in this field or '' if the field is empty """
-        try:
+        document = obj.info
            document = dom.parseString(obj.info)
        except Exception, err:
            print >>sys.stderr, "Could not parse XML:", err
            print obj.info
            raise
        elems = document.getElementsByTagName(self.tag_name)
        if len(elems):
            elem = None
@ -113,12 +111,7 @@ class xml_attr_field(object):
    def __set__(self, obj, val):
        if val == None:
            val = ""
-        try:
+        document = obj.info
            document = dom.parseString(obj.info)
        except Exception, err:
            print >>sys.stderr, "Could not parse XML:", err
            print obj.info
            raise
        elems = document.getElementsByTagName(self.tag_name)
        if len(elems):
            elem = None
@ -127,8 +120,7 @@ class xml_attr_field(object):
                    elem = candidate
        if elem:
            elem.setAttribute(self.attr, val)
-        info = document.toxml(encoding='utf-16')
+        obj.info = document
        obj.info = info
    def __repr__(self):
@ -152,12 +144,7 @@ class xml_field(object):
    def __get__(self, obj, typ=None): 
        """ Return the data in this field or '' if the field is empty """
-        try:
+        document = obj.info
            document = dom.parseString(obj.info)
        except Exception, err:
            print >>sys.stderr, "Could not parse XML:", err
            print obj.info
            raise
        elems = document.getElementsByTagName(self.tag_name)
        if len(elems):
@ -174,12 +161,7 @@ class xml_field(object):
    def __set__(self, obj, val):
        if not val:
            val = ''
-        try:
+        document = obj.info
            document = dom.parseString(obj.info)
        except Exception, err:
            print >>sys.stderr, "Could not parse XML:", err
            print obj.info
            raise
        def create_elem():
            elem = document.createElement(self.tag_name)
            parent = document.getElementsByTagName(self.parent)[0]
@ -206,8 +188,7 @@ class xml_field(object):
        else:
            elem = create_elem()  
        elem.appendChild(document.createTextNode(val))
-        info = document.toxml(encoding='utf-16')
+        obj.info = document
        obj.info = info
    def __str__(self):
@ -371,9 +352,8 @@ class LRFMetaFile(object):
    def info():
        doc = \
        """ 
-        Document meta information in raw XML format as a byte string encoded in
+        Document meta information as a minidom Document object.
-        utf-16.
+        To set use a minidom document object.
        To set use raw XML in a byte string encoded in utf-16.
        """
        def fget(self):
            if self.compressed_info_size == 0:
@ -381,30 +361,16 @@ class LRFMetaFile(object):
            size = self.compressed_info_size - 4
            self._file.seek(self.info_start)      
            try:
-                src =  zlib.decompress(self._file.read(size))        
+                src =  zlib.decompress(self._file.read(size))
                if len(src) != self.uncompressed_info_size:          
                    raise LRFException("Decompression of document meta info\
-                                        yielded unexpected results")                
+                                        yielded unexpected results")
-                candidate = unicode(src, 'utf-16', 'replace')
+                return dom.parseString(src)
                # LRF files produced with makelrf dont have a correctly
                # encoded metadata block. 
                # Decoding using latin1 is the most useful for me since I
                # occassionally read french books.
                # pdflrf creates invalif metadata blocks
                candidate = re.compile(u'</Info>.*', re.DOTALL).sub(u'</Info>', candidate)
                if not u"Info" in candidate: 
                    candidate = unicode(src, 'latin1', errors='ignore')
                    if candidate[-1:] == '\0':
                        candidate = candidate[:-1]
                    candidate = dom.parseString(candidate.encode('utf-8')).\
                            toxml(encoding='utf-16').strip()
                else:
                    candidate = candidate.strip().encode('utf-16')
                return candidate
            except zlib.error:
                raise LRFException("Unable to decompress document meta information")
-        def fset(self, info):
+        def fset(self, document):
            info = document.toxml('utf-8')
            self.uncompressed_info_size = len(info)
            stream = zlib.compress(info)
            orig_size = self.compressed_info_size