Improve support in lrfviewer for LRF files created by badly programmed software that puts Unicode strings directly into text streams. BookDesigner, I'm looking at you! lrfviewer now handles a much larger range of such Unicode characters in text streams.

2025-07-09 03:04:10 -04:00 · 2009-01-12 20:57:54 -08:00 · 2009-01-12 20:57:54 -08:00 · 451986a9ce
commit 451986a9ce
parent 87657c6fd4
1 changed files with 29 additions and 12 deletions
--- a/src/calibre/ebooks/lrf/objects.py
+++ b/src/calibre/ebooks/lrf/objects.py
@ -698,10 +698,7 @@ class Text(LRFStream):
    lineposition_map = {1:'before', 2:'after'}
    
    def add_text(self, text):
-        try:
-            s = unicode(text, "utf-16-le")
-        except UnicodeDecodeError: # Work around for Book Designer
-            s = unicode(text+'\x00', 'utf-16-le')
+        s = unicode(text, "utf-16-le")
        if s:
            s = s.translate(self.text_map)
            self.content.append(self.entity_pattern.sub(entity_to_unicode, s))
@ -802,18 +799,39 @@ class Text(LRFStream):
        length = len(self.stream)
        style = self.style.as_dict()
        current_style = style.copy()
+        text_tags = set(list(TextAttr.tag_map.keys()) + \
+                        list(Text.text_tags.keys()) + \
+                        list(ruby_tags.keys()))
+        text_tags -= set([0xf500+i for i in range(10)])
+        text_tags.add(0xf5cc)
        
        while stream.tell() < length:
        
-            # Is there some text beofre a tag?
-            pos = self.stream.find('\xf5', stream.tell()) - 1
-            if pos > 0:
-                self.add_text(self.stream[stream.tell():pos])
-                stream.seek(pos)
-            elif pos == -2: # No tags in this stream
+            # Is there some text before a tag?
+            def find_first_tag(start):
+                pos = self.stream.find('\xf5', start)
+                if pos == -1:
+                    return -1
+                try:
+                    stream.seek(pos-1)
+                    _t = Tag(stream)
+                    if _t.id in text_tags:
+                        return pos-1
+                    return find_first_tag(pos+1)
+                    
+                    
+                except:
+                    return find_first_tag(pos+1)
+                    
+            start_pos = stream.tell()        
+            tag_pos = find_first_tag(start_pos)
+            if tag_pos >= start_pos:
+                if tag_pos > start_pos:
+                    self.add_text(self.stream[start_pos:tag_pos])
+                stream.seek(tag_pos)
+            else: # No tags in this stream
                self.add_text(self.stream)
                stream.seek(0, 2)
-                print repr(self.stream)
                break
            
            tag = Tag(stream)
@ -1170,7 +1188,6 @@ class TOCObject(LRFStream):
            refobj  = struct.unpack("<I", stream.read(4))[0]
            cnt = struct.unpack("<H", stream.read(2))[0]
            raw = stream.read(cnt)
-            print repr(raw) 
            label = raw.decode('utf_16_le')
            self._contents.append(TocLabel(refpage, refobj, label))
            c -= 1