Refactor parsing of text streams

This commit is contained in:
Kovid Goyal 2007-09-22 05:03:56 +00:00
parent b1aa2f9abb
commit 04ccd5d30a

View File

@ -12,7 +12,7 @@
## You should have received a copy of the GNU General Public License along ## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc., ## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import struct, array, zlib, cStringIO import struct, array, zlib, cStringIO, collections
from libprs500.ebooks.lrf import LRFParseError from libprs500.ebooks.lrf import LRFParseError
from libprs500.ebooks.lrf.tags import Tag from libprs500.ebooks.lrf.tags import Tag
@ -512,13 +512,6 @@ class Block(LRFStream):
if hasattr(self, attr): if hasattr(self, attr):
self.attrs[attr] = getattr(self, attr) self.attrs[attr] = getattr(self, attr)
def __iter__(self):
try:
for i in iter(self.content):
yield i
except TypeError:
yield self.content
def __unicode__(self): def __unicode__(self):
s = u'\n<%s objid="%d" blockstyle="%d" '%(self.name, self.id, self.style_id) s = u'\n<%s objid="%d" blockstyle="%d" '%(self.name, self.id, self.style_id)
if hasattr(self, 'textstyle_id'): if hasattr(self, 'textstyle_id'):
@ -526,9 +519,7 @@ class Block(LRFStream):
for attr in self.attrs: for attr in self.attrs:
s += '%s="%s" '%(attr, self.attrs[attr]) s += '%s="%s" '%(attr, self.attrs[attr])
s = s.rstrip()+'>\n' s = s.rstrip()+'>\n'
if self.name != 'ImageBlock': s += unicode(self.content)
for i in self:
s += unicode(i)
s += '</%s>\n'%(self.name,) s += '</%s>\n'%(self.name,)
return s return s
@ -541,6 +532,8 @@ class MiniPage(LRFStream):
tag_map.update(LRFStream.tag_map) tag_map.update(LRFStream.tag_map)
tag_map.update(BlockAttr.tag_map) tag_map.update(BlockAttr.tag_map)
class Text(LRFStream): class Text(LRFStream):
tag_map = { tag_map = {
0xF503: ['style_id', 'D'], 0xF503: ['style_id', 'D'],
@ -550,8 +543,9 @@ class Text(LRFStream):
style = property(fget=lambda self : self._document.objects[self.style_id]) style = property(fget=lambda self : self._document.objects[self.style_id])
class Content(LRFContentObject): text_map = { 0x22: u'&quot;', 0x26: u'&amp;', 0x27: u'&squot;', 0x3c: u'&lt;', 0x3e: u'&gt;' }
tag_map = {
text_tags = {
0xF581: ['simple_container', 'Italic'], 0xF581: ['simple_container', 'Italic'],
0xF582: 'end_container', 0xF582: 'end_container',
0xF5B1: ['simple_container', 'Yoko'], 0xF5B1: ['simple_container', 'Yoko'],
@ -585,112 +579,75 @@ class Text(LRFStream):
0xF5C6: 'box', 0xF5C6: 'box',
0xF5C7: 'end_container', 0xF5C7: 'end_container',
0xF5CA: 'space', 0xF5CA: 'space',
0xF5CC: 'string',
0xF5D1: 'plot', 0xF5D1: 'plot',
0xF5D2: 'cr', 0xF5D2: 'cr',
} }
text_map = { 0x22: u'&quot;', 0x26: u'&amp;', 0x27: u'&squot;', 0x3c: u'&lt;', 0x3e: u'&gt;' } class TextTag(object):
def __init__(self, name, attrs={}, self_closing=False):
self.name = name
self.attrs = attrs
self.self_closing = self_closing
def __unicode__(self):
s = u'<%s '%(self.name,)
for name, val in self.attrs.items():
s += '%s="%s" '%(name, val)
return s.rstrip() + (u' />' if self.self_closing else u'>') + (u'\n' if self.name in ('P', 'CR') else u'')
linetype_map = {0: 'none', 0x10: 'solid', 0x20: 'dashed', 0x30: 'double', 0x40: 'dotted'} linetype_map = {0: 'none', 0x10: 'solid', 0x20: 'dashed', 0x30: 'double', 0x40: 'dotted'}
adjustment_map = {1: 'top', 2: 'center', 3: 'baseline', 4: 'bottom'} adjustment_map = {1: 'top', 2: 'center', 3: 'baseline', 4: 'bottom'}
lineposition_map = {1:'before', 2:'after'} lineposition_map = {1:'before', 2:'after'}
def __init__(self, bytes, objects, parent=None, name=None, attrs={}):
self.parent = parent
self.name = name
self.attrs = attrs
LRFContentObject.__init__(self, bytes, objects)
def parse_stream(self, length): def add_text(self, text):
offset = self.stream.tell() s = unicode(text, "utf-16-le")
while self.in_container and offset < length: if s:
buf = self.stream.getvalue()[offset:] self.containers.append(s.translate(self.text_map))
pos = buf.find('\xf5') - 1
if pos > 0:
self.stream.seek(offset+pos)
self.add_text(buf[:pos])
self.handle_tag(Tag(self.stream))
offset = self.stream.tell()
def handle_tag(self, tag): def empty_containers(self):
if tag.id in self.tag_map: open_containers = 0
action = self.tag_map[tag.id] while len(self.containers) > 0:
if isinstance(action, basestring): c = self.containers.popleft()
func, args = action, tuple([]) self.content.append(c)
else: if c is None:
func, args = action[0], (action[1],) open_containers -= 1
getattr(self, func)(tag, *args) elif isinstance(c, self.__class__.TextTag) and not c.self_closing:
elif tag.id in TextAttr.tag_map: open_containers += 1
h = TextAttr.tag_map[tag.id] while open_containers > 0:
val = LRFObject.tag_to_val(h, None, tag, self.stream) self.content.append(None)
if self.name == 'Span': open_containers -= 1
if h[0] not in self.attrs:
self.attrs[h[0]] = val
elif val != self.attrs[h[0]]:
if self._contents:
self.parent._contents.append(self)
Text.Content(self.stream, self.objects, self.parent,
'Span', {h[0]: val})
def end_container(self, tag, stream):
self.containers.append(None)
else: def start_para(self, tag, stream):
Text.Content(self.stream, self.objects, self, self.empty_containers()
'Span', {h[0]: val}) self.containers.append(self.__class__.TextTag('P'))
else: def end_para(self, tag, stream):
raise LRFParseError('Unknown tag in text stream %s'&(tag,)) self.empty_containers()
def cr(self, tag, stream):
self.containers.append(self.__class__.TextTag('CR', self_closing=True))
def char_button(self, tag, stream):
self.containers.append(self.__class__.TextTag(
'CharButton', attrs={'refobj':tag.dword}))
def simple_container(self, tag, name): def simple_container(self, tag, name):
cont = Text.Content(self.stream, self.objects, parent=self, name=name) self.containers.append(self.__class__.TextTag(name))
self._contents.append(cont)
def end_container(self, *args): def empline(self, tag, stream):
self.in_container = False
if self.name == 'Span' and self._contents and self not in self.parent._contents:
self.parent._contents.append(self)
def end_to_root(self):
parent = self
while parent:
parent.end_container()
parent = parent.parent
def root(self):
root = self
while root.parent:
root = root.parent
return root
def start_para(self, tag):
self.end_to_root()
root = self.root()
root.in_container = True
p = Text.Content(self.stream, self.objects, parent=root, name='P')
root._contents.append(p)
def end_para(self, tag):
self.end_to_root()
root = self.root()
root.in_container = True
def cr(self, tag):
self._contents.append(Text.Content('', self.objects, parent=self, name='CR'))
def char_button(self, tag):
self._contents.append(Text.Content(self.stream, self.objects, parent=self,
name='CharButton', attrs={'refobj':tag.dword}))
def empline(self, tag):
def invalid(op): def invalid(op):
self.stream.seek(op) stream.seek(op)
self.simple_container('EmpLine') self.simple_container('EmpLine')
oldpos = self.stream.tell() oldpos = stream.tell()
try: try:
t = Tag(self.stream) t = Tag(stream)
if t.id not in [0xF579, 0xF57A]: if t.id not in [0xF579, 0xF57A]:
raise LRFParseError raise LRFParseError
except LRFParseError: except LRFParseError:
@ -699,80 +656,106 @@ class Text(LRFStream):
h = TextAttr.tag_map[t.id] h = TextAttr.tag_map[t.id]
attrs = {} attrs = {}
attrs[h[0]] = TextAttr.tag_to_val(h, None, t, None) attrs[h[0]] = TextAttr.tag_to_val(h, None, t, None)
oldpos = self.stream.tell() oldpos = stream.tell()
try: try:
t = Tag(self.stream) t = Tag(stream)
if t.id not in [0xF579, 0xF57A]: if t.id not in [0xF579, 0xF57A]:
raise LRFParseError raise LRFParseError
h = TextAttr.tag_map[t.id] h = TextAttr.tag_map[t.id]
attrs[h[0]] = TextAttr.tag_to_val(h, None, t, None) attrs[h[0]] = TextAttr.tag_to_val(h, None, t, None)
except LRFParseError: except LRFParseError:
self.stream.seek(oldpos) stream.seek(oldpos)
cont = Text.Content(self.stream, self.objects, parent=self, self.containers.append(self.__class__.TextTag(
name='EmpLine', attrs=attrs) 'EmpLine', attrs=attrs))
self._contents.append(cont)
def space(self, tag): def space(self, tag, stream):
self._contents.append(Text.Content('', self.objects, parent=self, self.containers.append(self.__class__.TextTag('Space',
name='Space', attrs={'xsize':tag.sword})) attrs={'xsize':tag.sword},
self_closing=True))
def string(self, tag): def plot(self, tag, stream):
strlen = tag.word
self.add_text(self.stream.read(strlen))
def add_text(self, text):
s = unicode(text, "utf-16-le")
self._contents.append(s.translate(self.text_map))
def plot(self, tag):
xsize, ysize, refobj, adjustment = struct.unpack("<HHII", tag.contents) xsize, ysize, refobj, adjustment = struct.unpack("<HHII", tag.contents)
plot = Text.Content('', self.objects, self, 'Plot', plot = self.__class__.TextTag('Plot',
{'xsize': xsize, 'ysize': ysize, 'refobj':refobj, {'xsize': xsize, 'ysize': ysize, 'refobj':refobj,
'adjustment':self.adjustment_map[adjustment]}) 'adjustment':self.adjustment_map[adjustment]}, self_closing=True)
plot.refobj = self.objects[refobj] plot.refobj = self._document.objects[refobj]
self._contents.append(plot) self.containers.append(plot)
def draw_char(self, tag): def draw_char(self, tag, stream):
self._contents.append(Text.Content(self.stream, self.objects, self, self.containers.append(self.__class__.TextTag('DrawChar', {'line':tag.word}))
'DrawChar', {'line':tag.word}))
def box(self, tag): def box(self, tag, stream):
self._contents.append(Text.Content(self.stream, self.objects, self, self.containers.append(self.__class__.TextTag('Box',
'Box', {'linetype':self.linetype_map[tag.word]})) {'linetype':self.linetype_map[tag.word]}))
def __iter__(self): def initialize(self):
for i in self._contents: self.content = collections.deque()
yield i self.containers = collections.deque()
stream = cStringIO.StringIO(self.stream)
length = len(self.stream)
previous_span = None
while stream.tell() < length:
# Is there some text before a tag?
pos = self.stream.find('\xf5', stream.tell()) - 1
if pos > 0:
self.add_text(self.stream[stream.tell():pos])
stream.seek(pos)
tag = Tag(stream)
if tag.id == 0xF5CC:
self.add_text(stream.read(tag.word))
elif tag.id in self.__class__.text_tags: # A Text tag
action = self.__class__.text_tags[tag.id]
if isinstance(action, basestring):
getattr(self, action)(tag, stream)
else:
getattr(self, action[0])(tag, action[1])
elif tag.id in TextAttr.tag_map: # A Span attribute
action = TextAttr.tag_map[tag.id]
if len(self.containers) == 0:
previous_span = None
name, val = action[0], LRFObject.tag_to_val(action, None, tag, None)
if previous_span is None:
# No existing Span so start a new one
previous_span = self.__class__.TextTag('Span', {name:val})
self.containers.append(previous_span)
else:
# Already in a Span
if name in previous_span.attrs:
# Start new Span
if hasattr(self.containers[-1], 'name') and self.containers[-1].name == 'Span':
self.containers.pop()
else:
self.empty_containers()
previous_span = self.__class__.TextTag('Span', {name:val})
self.containers.append(previous_span)
else:
# Add attribute to current span
previous_span.attrs[name] = val
self.stream = None
def __unicode__(self): def __unicode__(self):
s = u'' s = u''
if self.name is not None: open_containers = collections.deque()
s += u'<'+self.name+u' ' for c in self.content:
for attr in self.attrs: if isinstance(c, basestring):
s += u'%s="%s" '%(attr, self.attrs[attr]) s += c
s = s.rstrip() elif c is None:
children = u'' p = open_containers.pop()
for i in self: s += u'</%s>'%(p.name,)
children += unicode(i) else:
if len(children) == 0: s += unicode(c)
return s + u' />' if not c.self_closing:
if self.name is None: open_containers.append(c)
return children
return s + u'>' + children + '</%s>'%(self.name,) + ('\n' if self.name == 'P' else '')
def __str__(self): if len(open_containers) > 0:
return unicode(self).encode('utf-8') raise LRFParseError('Malformed text stream')
return s
def initialize(self):
self.content = Text.Content(self.stream, self._document.objects)
def __iter__(self):
for i in self.content:
yield i
def __str__(self):
return unicode(self.content)
class Image(LRFObject): class Image(LRFObject):