Fix bug in handling non-ASCII characters in the LIT meta file

2025-08-30 23:00:21 -04:00 · 2007-12-05 19:31:04 +00:00 · 2007-12-05 19:31:04 +00:00 · 494c50dd96
commit 494c50dd96
parent cee1d202cf
1 changed files with 20 additions and 17 deletions
--- a/src/libprs500/ebooks/metadata/lit.py
+++ b/src/libprs500/ebooks/metadata/lit.py
@ -178,13 +178,16 @@ class UnBinary(object):
    def write_spaces(self, depth):
        self.buf.write(u' '.join(u'' for i in range(depth)))
    def item_path(self, internal_id):
        for i in self.manifest:
            if i == internal_id:
                return i.path
        raise LitReadError('Could not find item %s'%(internal_id,))
    def __unicode__(self):
        raw = self.buf.getvalue().lstrip()
        return raw.decode('utf-8')
    def binary_to_text(self, base=0, depth=0):
        space_enabled, saved_space_enabled = 1, 0
        was_indented, is_goingdown = 0, 0
@ -210,7 +213,7 @@ class UnBinary(object):
                if c == '\v': 
                    c = '\n'
                pending_indent = 0
-                self.buf.write(c)
+                self.buf.write(c.encode('utf-8') if isinstance(c, unicode) else c)
            elif state == 'get flags':
                if ord(c) == 0:
                    state = 'text'
@ -249,7 +252,7 @@ class UnBinary(object):
                        current_map = self.tag_to_attr_map[tag]
                        print 'WARNING: tag %s unknown'%(unichr(tag),)
-                    self.buf.write(unicode(tag_name))
+                    self.buf.write(unicode(tag_name).encode('utf-8'))
                elif flags & FLAG_CLOSING:
                    if depth == 0:
                        raise LitReadError('Extra closing tag')
@ -261,9 +264,9 @@ class UnBinary(object):
                    if not is_goingdown:
                        tag_name = None
                        dynamic_tag = 0
-                        self.buf.write(u' />')
+                        self.buf.write(' />')
                    else:
-                        self.buf.write(u'>')
+                        self.buf.write('>')
                        if not self.opf and (flags & (FLAG_BLOCK|FLAG_HEAD)):
                            pending_indent += 1
                        index = self.binary_to_text(base=index, depth=depth+1)
@ -273,9 +276,9 @@ class UnBinary(object):
                        saved_space_enabled = space_enabled
                        space_enabled = self.lingering_space
                        if space_enabled and was_indented and not self.was_in_text:
-                            self.buf.write(u'\n')
+                            self.buf.write('\n')
                            self.write_spaces(depth)
-                        self.buf.write(u'</'+tag_name+u'>')
+                        self.buf.write('</'+tag_name+'>')
                        if (space_enabled and self.opf) or (flags & (FLAG_BLOCK|FLAG_HEAD)):
                            self.pending_indent += 1
                        dynamic_tag = 0
@ -302,18 +305,18 @@ class UnBinary(object):
                        state = 'get value length'
                        continue
-                    self.buf.write(u' ' + unicode(attr) + u'=')
+                    self.buf.write(' ' + unicode(attr).encode('utf-8') + '=')
                    if attr in ['href', 'src']:
                        state = 'get href'
                    else:
                        state = 'get value length'
            elif state == 'get value length':
                if not in_censorship:
-                    self.buf.write(u'"')
+                    self.buf.write('"')
                char_count = ord(c) - 1
                if not char_count:
                    if not in_censorship:
-                        self.buf.write(u'"')
+                        self.buf.write('"')
                    in_censorship = 0
                    state = 'get attr'
                state = 'get value'
@ -324,7 +327,7 @@ class UnBinary(object):
            elif state == 'get value':
                if char_count == 0xfffe:
                    if not in_censorship:
-                        self.buf.write(unicode(ord(c)-1))
+                        self.buf.write(str(ord(c)-1))
                    in_censorship = 0
                    state = 'get attr'
                elif char_count:
@ -353,13 +356,13 @@ class UnBinary(object):
                char_count = ord(c) - 1
                if char_count <= 0 or char_count > len(self.bin)-index:
                    raise LitReadError('Invalid character count %d'%(char_count,))
-                self.buf.write(u' ')
+                self.buf.write(' ')
                state = 'get custom attr'
            elif state == 'get custom attr':
                self.buf.write(c)
                char_count -= 1
                if not char_count:
-                    self.buf.write(u'=')
+                    self.buf.write('=')
                    state = 'get value length'
            elif state == 'get href':
                char_count = ord(c) - 1
@ -371,7 +374,7 @@ class UnBinary(object):
                path = self.item_path(doc)
                if m and frag:
                    path += m+frag
-                self.buf.write(u'"%s"'%(path,))
+                self.buf.write((u'"%s"'%(path,)).encode('utf-8'))
                state = 'get attr'
        self.lingering_space = space_enabled
@ -682,7 +685,7 @@ class LitFile(object):
  PUBLIC "+//ISBN 0-9673008-1-9//DTD OEB 1.0.1 Package//EN"
  "http://openebook.org/dtds/oeb-1.0.1/oebpkg101.dtd">
 '''+\
-                UnBinary(raw, self.manifest).buf.getvalue().lstrip()
+                unicode(UnBinary(raw, self.manifest))
            self.meta = xml
        finally:
            self._stream.seek(opos)
@ -690,7 +693,7 @@ class LitFile(object):
 def get_metadata(stream):
    try:
        litfile = LitFile(stream)
-        mi = OPFReader(cStringIO.StringIO(litfile.meta))
+        mi = OPFReader(cStringIO.StringIO(litfile.meta.encode('utf-8')))
    except:
        title = stream.name if hasattr(stream, 'name') and stream.name else 'Unknown'
        mi = MetaInformation(title, ['Unknown'])
@ -702,7 +705,7 @@ def main(args=sys.argv):
    if len(args) != 2:
        print >>sys.stderr, 'Usage: %s file.lit'%(args[0],)
        return 1
-    print get_metadata(open(args[1], 'rb'))
+    print unicode(get_metadata(open(args[1], 'rb')))
    return 0
 if __name__ == '__main__':