Improve support in lrfviewer for LRF files that are created by badly programmed software that puts unicode strings directly into text streams. BookDesigner, I'm looking at you! lrfviewer now handles a much larger range of such unicode characters in text streams.

This commit is contained in:
Kovid Goyal 2009-01-12 20:57:54 -08:00
parent 87657c6fd4
commit 451986a9ce

View File

@ -698,10 +698,7 @@ class Text(LRFStream):
lineposition_map = {1:'before', 2:'after'}
def add_text(self, text):
try:
s = unicode(text, "utf-16-le")
except UnicodeDecodeError: # Work around for Book Designer
s = unicode(text+'\x00', 'utf-16-le')
s = unicode(text, "utf-16-le")
if s:
s = s.translate(self.text_map)
self.content.append(self.entity_pattern.sub(entity_to_unicode, s))
@ -802,18 +799,39 @@ class Text(LRFStream):
length = len(self.stream)
style = self.style.as_dict()
current_style = style.copy()
text_tags = set(list(TextAttr.tag_map.keys()) + \
list(Text.text_tags.keys()) + \
list(ruby_tags.keys()))
text_tags -= set([0xf500+i for i in range(10)])
text_tags.add(0xf5cc)
while stream.tell() < length:
# Is there some text beofre a tag?
pos = self.stream.find('\xf5', stream.tell()) - 1
if pos > 0:
self.add_text(self.stream[stream.tell():pos])
stream.seek(pos)
elif pos == -2: # No tags in this stream
# Is there some text before a tag?
def find_first_tag(start):
pos = self.stream.find('\xf5', start)
if pos == -1:
return -1
try:
stream.seek(pos-1)
_t = Tag(stream)
if _t.id in text_tags:
return pos-1
return find_first_tag(pos+1)
except:
return find_first_tag(pos+1)
start_pos = stream.tell()
tag_pos = find_first_tag(start_pos)
if tag_pos >= start_pos:
if tag_pos > start_pos:
self.add_text(self.stream[start_pos:tag_pos])
stream.seek(tag_pos)
else: # No tags in this stream
self.add_text(self.stream)
stream.seek(0, 2)
print repr(self.stream)
break
tag = Tag(stream)
@ -1170,7 +1188,6 @@ class TOCObject(LRFStream):
refobj = struct.unpack("<I", stream.read(4))[0]
cnt = struct.unpack("<H", stream.read(2))[0]
raw = stream.read(cnt)
print repr(raw)
label = raw.decode('utf_16_le')
self._contents.append(TocLabel(refpage, refobj, label))
c -= 1