Greatly improved decoding of metadata block in LRF files. We now handle the incorrectly encoded metadata blocks produced by makelrf 0.3 by assuming a latin1 encoding. Also added additional fields and support for creating a field that does not exist. Metadata blocks are now always written correctly encoded in utf-16 independent of the encoding of the original metadata block.

2025-07-09 03:04:10 -04:00 · 2007-01-03 02:22:43 +00:00 · 2007-01-03 02:22:43 +00:00 · cead54c959
commit cead54c959
parent c25865e76b
1 changed files with 110 additions and 60 deletions
--- a/libprs500/lrf/meta.py
+++ b/libprs500/lrf/meta.py
@@ -27,9 +27,7 @@ to get and set meta information. For example:
 import struct
 import array
 import zlib
-import StringIO
 import xml.dom.minidom as dom
-from xml.dom.ext import Print

 from libprs500.prstypes import field

@@ -91,36 +89,68 @@ class xml_field(object):
    Descriptor that gets and sets XML based meta information from an LRF file. 
    Works for simple XML fields of the form <tagname>data</tagname>
    """    
-    def __init__(self, tag_name):
-        """ @param tag_name: The XML tag whoose data we operate on """
+    def __init__(self, tag_name, parent="BookInfo"):
+        """ 
+        @param tag_name: The XML tag whose data we operate on 
+        @param parent: The tagname of the parent element of C{tag_name}
+        """
        self.tag_name = tag_name
+        self.parent = parent
        
    def __get__(self, obj, typ=None): 
+        """ Return the data in this field or '' if the field is empty """
        document = dom.parseString(obj.info)
-        elem = document.getElementsByTagName(self.tag_name)[0]
+        elems = document.getElementsByTagName(self.tag_name)
+        if len(elems):
+            elem = None
+            for candidate in elems:
+                if candidate.parentNode.nodeName == self.parent:
+                    elem = candidate
+            if elem:
                elem.normalize() 
-        if not elem.hasChildNodes(): 
-            return ""      
+                if elem.hasChildNodes(): 
                    return elem.firstChild.data.strip()
+        return ""
        
    def __set__(self, obj, val):
        document = dom.parseString(obj.info)
-        elem = document.getElementsByTagName(self.tag_name)[0]      
+        def create_elem():
+            elem = document.createElement(self.tag_name)
+            elem.appendChild(dom.Text())
+            parent = document.getElementsByTagName(self.parent)[0]
+            parent.appendChild(elem)
+            return elem
+            
+        if not val:
+            val = u''
+        if type(val).__name__ != 'unicode':
+            val = unicode(val, 'utf-8')
+        
+        elems = document.getElementsByTagName(self.tag_name)
+        elem = None
+        if len(elems):
+            for candidate in elems:
+                if candidate.parentNode.nodeName == self.parent:
+                    elem = candidate
+            if not elem:
+                elem = create_elem()
+            else:
                elem.normalize()
                while elem.hasChildNodes(): 
                    elem.removeChild(elem.lastChild)
                elem.appendChild(dom.Text())            
+        else:
+            elem = create_elem()            
        elem.firstChild.data = val
-        s = StringIO.StringIO()
-        Print(document, s)
-        obj.info = s.getvalue()
-        s.close()
+        info = document.toxml(encoding='utf-16')
+        obj.info = info
+            
    
    def __str__(self):
        return self.tag_name
    
    def __repr__(self):
-        return "XML Field: " + self.tag_name
+        return "XML Field: " + self.tag_name + " in " + self.parent

 class LRFMetaFile(object):
    """ Has properties to read and write all Meta information in a LRF file. """
@@ -146,18 +176,21 @@ class LRFMetaFile(object):
    uncompressed_info_size   = versioned_field(compressed_info_size, 0, \
                                             fmt=DWORD, start=0x54)
    
-    title                          = xml_field("Title")
-    author                     = xml_field("Author")
-    book_id                   = xml_field("BookID")
-    publisher                 = xml_field("Publisher")
-    label                        = xml_field("Label")
-    category                 = xml_field("Category")
+    title                 = xml_field("Title", parent="BookInfo")
+    author                = xml_field("Author", parent="BookInfo")
+    book_id               = xml_field("BookID", parent="BookInfo")
+    publisher             = xml_field("Publisher", parent="BookInfo")
+    label                 = xml_field("Label", parent="BookInfo")
+    category              = xml_field("Category", parent="BookInfo")
+    classification        = xml_field("Classification", parent="BookInfo")
+    free_text             = xml_field("FreeText", parent="BookInfo")
    
-    language                 = xml_field("Language")
-    creator                    = xml_field("Creator")
-    creation_date          = xml_field("CreationDate") #: Format is %Y-%m-%d
-    producer                  = xml_field("Producer")
-    page                        = xml_field("Page")
+    language              = xml_field("Language", parent="DocInfo")
+    creator               = xml_field("Creator", parent="DocInfo")
+    # Format is %Y-%m-%d
+    creation_date         = xml_field("CreationDate", parent="DocInfo") 
+    producer              = xml_field("Producer", parent="DocInfo")
+    page                  = xml_field("Page", parent="DocInfo")
    
    def safe(func):
        """ 
@@ -198,20 +231,36 @@ class LRFMetaFile(object):
    
    @safe_property
    def info():
-        doc = """ Document meta information in raw XML format """
+        doc = \
+        """ 
+        Document meta information in raw XML format as a byte string encoded in
+        utf-16.
+        To set use raw XML in a byte string encoded in utf-16.
+        """
        def fget(self):
            if self.compressed_info_size == 0:
                raise LRFException("This document has no meta info")      
            size = self.compressed_info_size - 4
            self._file.seek(self.info_start)      
            try:
-                stream =  zlib.decompress(self._file.read(size))        
-                if len(stream) != self.uncompressed_info_size:          
+                src =  zlib.decompress(self._file.read(size))        
+                if len(src) != self.uncompressed_info_size:          
                    raise LRFException("Decompression of document meta info\
                                        yielded unexpected results")                
-                # Remove null characters from string as in some LRF files 
-                # the stream is null-terminated
-                return stream.strip().replace('\0', '')
+                candidate = unicode(src, 'utf-16')
+                # LRF files produced with makelrf don't have a correctly
+                # encoded metadata block. 
+                # Decoding using latin1 is the most useful for me since I
+                # occasionally read French books.
+                if not u"Info" in candidate: 
+                    candidate = unicode(src, 'latin1', errors='ignore')
+                    if candidate[-1:] == '\0':
+                        candidate = candidate[:-1]
+                    candidate = dom.parseString(candidate.encode('utf-8')).\
+                            toxml(encoding='utf-16')
+                else:
+                    candidate = candidate.encode('utf-16')
+                return candidate.strip()
            except zlib.error, e:
                raise LRFException("Unable to decompress document meta information")
        
@@ -222,6 +271,7 @@ class LRFMetaFile(object):
            self._file.seek(self.info_start)
            self._file.write(stream)
            self._file.flush()
+        
        return { "fget":fget, "fset":fset, "doc":doc }
    
    @safe_property