From da826622b066b7502985ac3dcd9e536f5278d95e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 28 Jan 2008 02:10:18 +0000 Subject: [PATCH] A rewritten and much improved lrs2lrf --- src/libprs500/ebooks/lrf/lrs/convert_from.py | 1250 ++++------------- src/libprs500/ebooks/lrf/objects.py | 5 +- src/libprs500/ebooks/lrf/pylrs/pylrf.py | 2 +- src/libprs500/ebooks/lrf/pylrs/pylrs.py | 80 +- .../ebooks/lrf/web/profiles/__init__.py | 35 +- src/libprs500/ebooks/lrf/web/profiles/wsj.py | 4 +- 6 files changed, 331 insertions(+), 1045 deletions(-) diff --git a/src/libprs500/ebooks/lrf/lrs/convert_from.py b/src/libprs500/ebooks/lrf/lrs/convert_from.py index 3ef82dc433..6fec9b508e 100644 --- a/src/libprs500/ebooks/lrf/lrs/convert_from.py +++ b/src/libprs500/ebooks/lrf/lrs/convert_from.py @@ -1,4 +1,4 @@ -## Copyright (C) 2008 Roger Critchlow +## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 2 of the License, or @@ -12,1004 +12,259 @@ ## You should have received a copy of the GNU General Public License along ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +''' +Compile a LRS file into a LRF file. +''' -# -# major mismatch in Button/ButtonBlock/JumpButton -# major mismatch in providing referenced object instead of object reference -# +import sys, os, logging -import sys, os -from xml.etree.ElementTree import ElementTree - -from libprs500.ebooks.lrf.pylrs.pylrs import \ - Book, StyleDefault, BookSetting, \ - ImageBlock, Header, Footer, PutObj, \ - Paragraph, CR, Italic, Bold, ImageStream, \ - CharButton, Button, PushButton, JumpTo, \ - Plot, Image, RuledLine, Canvas, DropCaps, \ - Sup, Sub, Span, Text, EmpLine, Font, \ - LrsError, Space, Box, ButtonBlock, NoBR - -from libprs500 import __appname__, __version__ +from libprs500 import __author__, __appname__, __version__, setup_cli_handlers +from libprs500.ebooks.BeautifulSoup import BeautifulStoneSoup, NavigableString, \ + CData, Tag +from libprs500.ebooks.lrf.pylrs.pylrs import Book, PageStyle, TextStyle, \ + BlockStyle, ImageStream, Font, StyleDefault, BookSetting, Header, \ + Image, ImageBlock, Page, TextBlock, Canvas, Paragraph, CR, Span, \ + Italic, Sup, Sub, Bold, EmpLine, JumpButton, CharButton, Plot, \ + DropCaps, Footer, RuledLine class LrsParser(object): - filterAttrib = ['objid', 'refobj', 'objlabel', 'pagestyle', 'blockstyle', 'textstyle', 'stylelabel', - 'evenheaderid', 'oddheaderid', 'evenfooterid', 'oddfooterid', 'page_tree_id', 'refstream'] - def __init__(self, file): - self.file = file - self.book = Book() - self.objects = dict() - self.dobjects = dict() - self.tocs = list() - self.charbuttons = list() - self.jumptos = list() - self.pagestyles = list() - self.blockstyles = list() - self.textstyles = list() - self.footers = list() - self.headers = list() - self.putobjs = list() - self.plots = list() - self.images = list() - self.imageblocks = list() - self.root = ElementTree(file=file) - - # - # find an element by objid - # - def get_element_by_objid(self, objid): - if objid not in self.objects: - for element in self.root.getiterator(): - if 'objid' in element.attrib: - id = element.attrib['objid'] - if id not in self.objects: - self.objects[id] = element - elif self.equal_element(self.objects[id], element): - continue - elif self.objects[id] != element: - raise LrsError, "multiple objects with same objid=%s, %s and %s"%(id, element.tag, self.objects[id].tag) - if objid in self.objects: - return self.objects[objid] - return None - # - # put the tag and attributes into one string - # - def element_dump(self, element): - str = "" - str += "<%s"%element.tag - keys = element.attrib.keys() - keys.sort() - for key in keys: - str += " %s=\"%s\""%(key,element.attrib[key]) - # should do something about sub elements and sub text - str += "/>" - return str + SELF_CLOSING_TAGS = [i.lower() for i in ['CR', 'Plot', 'NoBR', 'Space', + 'PutObj', 'RuledLine', + 'Plot', 'SetDefault', 'BookSetting', 'RegistFont', + 'PageStyle', 'TextStyle', 'BlockStyle', 'JumpTo', + 'ImageStream', 'Image']] - # - # compare two elements for identical tags and attributes - # - def equal_element(self, e1, e2): - return e1.tag == e2.tag and self._equal_attrib(e1, e2, ignore=[]) - - # - # compare two element attrib dictionaries for equivalence - # ignoring some attributes - # - def equal_attrib(self, e1, e2): - return self._equal_attrib(e1, e2, ignore=LrsParser.filterAttrib) - - def _equal_attrib(self, e1, e2, ignore): - #print "comparing %s to %s in equal_attrib"%(e1.tag,e2.tag) - a1 = e1.attrib - a2 = e2.attrib - for name in a1.keys(): - if name in ignore: + def __init__(self, stream, logger): + self.logger = logger + src = stream.read() + self.soup = BeautifulStoneSoup(src, selfClosingTags=self.SELF_CLOSING_TAGS) + self.objects = {} + for obj in self.soup.findAll(objid=True): + self.objects[obj['objid']] = obj + + self.parsed_objects = {} + self.first_pass() + self.second_pass() + self.third_pass() + self.fourth_pass() + self.fifth_pass() + + def fifth_pass(self): + for tag in self.soup.findAll(['canvas', 'header', 'footer']): + canvas = self.parsed_objects[tag.get('objid')] + for po in tag.findAll('putobj'): + canvas.put_object(self.parsed_objects[po.get('refobj')], + po.get('x1'), po.get('y1')) + + + @classmethod + def attrs_to_dict(cls, tag, exclude=('objid',)): + result = {} + for key, val in tag.attrs: + if key in exclude: continue - if name not in a2: - #print "compare: %s in e1 not in e2"%name - return False - if a1[name] != a2[name]: - #print "compare: %s e1=%s != e2=%s"%(name, a1[name], a2[name]) - return False - for name in a2.keys(): - if name in ignore: - continue - if name not in a1: - #print "compare: %s in e1 not in e2"%name - return False - if a1[name] != a2[name]: - #print "compare: %s e1=%s != e2=%s"%(name, a1[name], a2[name]) - return False - return True - - # - # process an attrib dictionary for passing into a pylrs create - # - def process_attrib(self, element): - attrib = element.attrib.copy() - for name in LrsParser.filterAttrib: - if name in attrib: - id = attrib[name] - if name == 'objid': - if id not in self.objects: - self.objects[id] = element - elif self.objects[id] != element and not self.equal_element(self.objects[id], element): - raise LrsError, "multiple objects with same objid=%s, %s and %s"%(id, element.tag, self.objects[id].tag) - del attrib[name] - return attrib - - # - # get and parse a style element - # - def fetch_style(self, element, stylename): - """get the style element referenced by stylename in element.attrib""" - - if stylename not in element.attrib: - return None - id = element.attrib[stylename] - if id in self.dobjects: - return self.dobjects[id] - style = self.get_element_by_objid(id) - if style == None: - raise LrsError, "no %s style element found for objid=%s"%(stylename, id) - #print "found style type %s with objid = %s after getting %s"%(style.tag, style.attrib['objid'], id) - newstyle = None - # - # yuck - headers and footers really mess this up - # until then, there were no objid pointers in any - # style object. - # hmm, so maybe we push them always into the page - if stylename == 'pagestyle': - for e in self.pagestyles: - if self.equal_attrib(e, style): - #print "making pagestyle %s alias to %s"%(id, e.attrib['objid']) - newstyle = self.dobjects[e.attrib['objid']] - break - if newstyle == None: - #print "making pagestyle %s"%id - self.pagestyles.append(style) - attrib = self.process_attrib(style) - for name in ['evenfooter', 'evenheader', 'footer', 'header', 'oddfooter', 'oddheader' ]: - if name+'id' in style.attrib: - attrib[name] = self.fetch_header_footer(style, name+'id') - newstyle = self.book.create_page_style(**attrib) - elif stylename == 'blockstyle': - for e in self.blockstyles: - if self.equal_attrib(e, style): - #print "making blockstyle %s alias to %s"%(id, e.attrib['objid']) - newstyle = self.dobjects[e.attrib['objid']] - break - if newstyle == None: - #print "making blockstyle %s"%id - self.blockstyles.append(style) - newstyle = self.book.create_block_style(**self.process_attrib(style)) - elif stylename == 'textstyle': - for e in self.textstyles: - if self.equal_attrib(e, style): - #print "making textstyle %s alias to %s"%(id, e.attrib['objid']) - newstyle = self.dobjects[e.attrib['objid']] - break - if newstyle == None: - #print "making textstyle %s"%id - self.textstyles.append(style) - #if 'textlinewidth' in style.attrib: - # print "creating new TextStyle with textlinewidth='%s'"%style.attrib['textlinewidth'] - newstyle = self.book.create_text_style(**self.process_attrib(style)) + result[str(key)] = val + return result + + def text_tag_to_element(self, tag): + map = { + 'span' : Span, + 'italic' : Italic, + 'bold' : Bold, + 'empline' : EmpLine, + 'sup' : Sup, + 'sub' : Sub, + 'cr' : CR, + 'drawchar': DropCaps, + } + if tag.name == 'charbutton': + return CharButton(self.parsed_objects[tag.get('refobj')], None) + if tag.name == 'plot': + return Plot(self.parsed_objects[tag.get('refobj')], **self.attrs_to_dict(tag, ['refobj'])) + return map[tag.name](**self.attrs_to_dict(tag)) + + def process_text_element(self, tag, elem): + for item in tag.contents: + if isinstance(item, NavigableString): + elem.append(item.string) + else: + subelem = self.text_tag_to_element(item) + elem.append(subelem) + self.process_text_element(item, subelem) + + + def process_paragraph(self, tag): + p = Paragraph() + contents = [i for i in tag.contents] + if contents: + if isinstance(contents[0], NavigableString): + contents[0] = contents[0].string.lstrip() + for item in contents: + if isinstance(item, basestring): + p.append(item) + elif isinstance(item, NavigableString): + p.append(item.string) + else: + elem = self.text_tag_to_element(item) + p.append(elem) + self.process_text_element(item, elem) + return p + + def process_text_block(self, tag): + tb = self.parsed_objects[tag.get('objid')] + for item in tag.contents: + if hasattr(item, 'name'): + if item.name == 'p': + tb.append(self.process_paragraph(item)) + elif item.name == 'cr': + tb.append(CR()) + + def fourth_pass(self): + for tag in self.soup.findAll('page'): + page = self.parsed_objects[tag.get('objid')] + self.book.append(page) + for block_tag in tag.findAll(['canvas', 'imageblock', 'textblock', 'ruledline']): + if block_tag.name == 'ruledline': + page.append(RuledLine(**self.attrs_to_dict(block_tag))) + else: + page.append(self.parsed_objects[block_tag.get('objid')]) + + for tag in self.soup.find('objects').findAll('button'): + jt = tag.find('jumpto') + tb = self.parsed_objects[jt.get('refobj')] + jb = JumpButton(tb) + self.book.append(jb) + self.parsed_objects[tag.get('objid')] = jb + + for tag in self.soup.findAll('textblock'): + self.process_text_block(tag) + + toc = self.soup.find('toc') + if toc: + for tag in toc.findAll('toclabel'): + label = self.tag_to_string(tag).encode('ascii', 'ignore') # Bug in SONY reader software cant handle non ascii toc labels + self.book.addTocEntry(label, self.parsed_objects[tag.get('refobj')]) + + + def third_pass(self): + map = { + 'page' : (Page, ['pagestyle', 'evenfooterid', 'oddfooterid', 'evenheaderid', 'oddheaderid']), + 'textblock' : (TextBlock, ['textstyle', 'blockstyle']), + 'imageblock' : (ImageBlock, ['blockstyle', 'refstream']), + 'image' : (Image, ['refstream']), + 'canvas' : (Canvas, ['canvaswidth', 'canvasheight']), + } + attrmap = { + 'pagestyle' : 'pageStyle', + 'blockstyle' : 'blockStyle', + 'textstyle' : 'textStyle', + } + for id, tag in self.objects.items(): + if tag.name in map.keys(): + settings = self.attrs_to_dict(tag, map[tag.name][1]+['objid', 'objlabel']) + for a in ('pagestyle', 'blockstyle', 'textstyle'): + if tag.has_key(a): + settings[attrmap[a]] = self.parsed_objects[tag.get(a)] + for a in ('evenfooterid', 'oddfooterid', 'evenheaderid', 'oddheaderid'): + if tag.has_key(a): + settings[a.replace('id', '')] = self.parsed_objects[tag.get(a)] + args = [] + if tag.has_key('refstream'): + args.append(self.parsed_objects[tag.get('refstream')]) + if tag.has_key('canvaswidth'): + args += [tag.get('canvaswidth'), tag.get('canvasheight')] + self.parsed_objects[id] = map[tag.name][0](*args, **settings) + + + + def second_pass(self): + map = { + 'pagestyle' : (PageStyle, ['stylelabel', 'evenheaderid', 'oddheaderid', 'evenfooterid', 'oddfooterid']), + 'textstyle' : (TextStyle, ['stylelabel', 'rubyalignandadjust']), + 'blockstyle' : (BlockStyle, ['stylelabel']), + 'imagestream': (ImageStream, ['imagestreamlabel']), + 'registfont' : (Font, []) + } + for id, tag in self.objects.items(): + if tag.name in map.keys(): + settings = self.attrs_to_dict(tag, map[tag.name][1]+['objid']) + if tag.name == 'pagestyle': + for a in ('evenheaderid', 'oddheaderid', 'evenfooterid', 'oddfooterid'): + if tag.has_key(a): + settings[a.replace('id', '')] = self.parsed_objects[tag.get(a)] + self.parsed_objects[id] = map[tag.name][0](**settings) + if tag.name == 'registfont': + self.book.append(self.parsed_objects[id]) + + + @classmethod + def tag_to_string(cls, tag): + ''' + Convenience method to take a BeautifulSoup Tag and extract the text from it + recursively. + @return: A unicode (possibly empty) object + ''' + if not tag: + return '' + strings = [] + for item in tag.contents: + if isinstance(item, (NavigableString, CData)): + strings.append(item.string) + elif isinstance(item, Tag): + res = cls.tag_to_string(item) + if res: + strings.append(res) + return u''.join(strings) + + def first_pass(self): + info = self.soup.find('bbebxylog').find('bookinformation').find('info') + bookinfo = info.find('bookinfo') + docinfo = info.find('docinfo') + + def me(base, tagname): + tag = base.find(tagname.lower()) + tag = (self.tag_to_string(tag), tag.get('reading') if tag.has_key('reading') else '') + return tag + + title = me(bookinfo, 'Title') + author = me(bookinfo, 'Author') + publisher = me(bookinfo, 'Publisher') + category = me(bookinfo, 'Category')[0] + classification = me(bookinfo, 'Classification')[0] + freetext = me(bookinfo, 'FreeText')[0] + language = me(docinfo, 'Language')[0] + creator = me(docinfo, 'Creator')[0] + producer = me(docinfo, 'Producer')[0] + bookid = me(bookinfo, 'BookID')[0] + + sd = self.soup.find('setdefault') + sd = StyleDefault(**self.attrs_to_dict(sd, ['page_tree_id', 'rubyalignandadjust'])) + bs = self.soup.find('booksetting') + bs = BookSetting(**self.attrs_to_dict(bs, [])) + + self.book = Book(title=title, author=author, publisher=publisher, + category=category, classification=classification, + freetext=freetext, language=language, creator=creator, + producer=producer, bookid=bookid, setdefault=sd, + booksetting=bs) + + for hdr in self.soup.findAll(['header', 'footer']): + elem = Header if hdr.name == 'header' else Footer + self.parsed_objects[hdr.get('objid')] = elem(**self.attrs_to_dict(hdr)) + + def render(self, file, to_lrs=False): + if to_lrs: + self.book.renderLrs(file, 'utf-8') else: - raise LrsError, "no handler for %s style name" - self.dobjects[id] = newstyle - return newstyle + self.book.renderLrf(file) - # - # get and parse a header or footer element - # - def fetch_header_footer(self, element, hfname): - """get the header/footer element referenced by hfname in element.attrib""" - if hfname not in element.attrib: - return None - id = element.attrib[hfname] - if id in self.dobjects: - return self.dobjects[id] - hf = self.get_element_by_objid(id) - if hf == None: - raise LrsError, "no %s element found for objid=%s"%(hfname, id) - #print "found header/footer type %s with objid = %s after getting %s"%(hf.tag, hf.attrib['objid'], id) - newhf = None - if hfname == 'evenheaderid' or hfname == 'oddheaderid': - for e in self.headers: - if self.equal_header_footer(e, hf): - #print "making header/footer %s alias to %s"%(id, e.attrib['objid']) - newhf = self.dobjects[e.attrib['objid']] - break - if newhf == None: - #print "making header %s"%id - self.headers.append(hf) - newhf = self.process_Header(hf) - elif hfname == 'evenfooterid' or hfname == 'oddfooterid': - for e in self.footers: - if self.equal_header_footer(e, hf): - #print "making footer %s alias to %s"%(id, e.attrib['objid']) - newhf = self.dobjects[e.attrib['objid']] - break - if newhf == None: - #print "making footer %s"%id - self.footers.append(hf) - newhf = self.process_Footer(hf) - else: - raise LrsError, "no handler for %s header/footer name" - self.dobjects[id] = newhf - return newhf - - # - # these mostly ignore the terminal elements, should be errors in the end - # - def process_leaf(self, element): - raise LrsError, "process leaf element %s???"%element.tag - - def process_empty(self, element): - if element.text or element.getchildren(): - raise LrsError, "element %s is not empty???"%element.tag - - # - # elements referenced by sets of text elements - # - # def process_Rubi(rubi): - # """Process element""" - # for element in rubi: - # if element.tag == "Oyamoji": - # process_simple_char0(element) - # elif element.tag == "Rubimoji": - # process_simple_char0(element) - # else: - # print "No processor for ", element.tag - # - # def process_AltString(altString): - # """Process element""" - # for element in altString: - # if element.tag == "Org": - # process_text(element) - # elif element.tag == "Alt": - # process_text(element) - # else: - # print "No processor for ", element.tag - - # - # sets of text elements - # - def process_text(self, text, obj): - """process an element as text""" - - if text.text != None: - obj.append(Text(text.text)) - - for element in text: - print "No text processor for %s", self.element_dump(element) - if element.tail != None: - obj.append(Text(element.tail)) - - return obj - - # - # occurs in draw_char, simple_char2, ... - # - def process_Plot(self, plot): - self.plots.append(plot) - plot.lrsplot = Plot(None, **self.process_attrib(plot)) - return plot.lrsplot - - def process_draw_char(self, draw_char, obj): - """Process an element in the DrawChar set""" - - if draw_char.text != None: - obj.append(Text(draw_char.text)) - - for element in draw_char: - if element.tag == "Span": - span = self.process_draw_char(element, Span(**self.process_attrib(element))) - if not span.isEmpty(): - obj.append(span) - elif element.tag == "Plot": - obj.append(self.process_text(element, self.process_Plot(element))) - elif element.tag == "CR": - obj.append(CR()) - elif element.tag == "Space": - obj.append(Space(**self.process_attrib(element))) - elif element.tag == "CharButton": - self.charbuttons.append(element) - element.lrscharbutton = CharButton(None, **self.process_attrib(element)) - obj.append(self.process_simple_char1(element, element.lrscharbutton)) - elif element.tag == "Sup": - obj.append(self.process_simple_char0(element, Sup())) - elif element.tag == "Sub": - obj.append(self.process_simple_char0(element, Sub())) - elif element.tag == "NoBR": - obj.append(self.process_simple_char1(element, NoBR())) - elif element.tag == "DrawChar": - obj.append(self.process_simple_char0(element, DropCaps(**self.process_attrib(element)))) - elif element.tag == "Box": - obj.append(self.process_simple_char0(element, Box(**self.process_attrib(element)))) - elif element.tag == "Italic": - obj.append(self.process_draw_char(element, Italic())) - elif element.tag == "Bold": - obj.append(self.process_draw_char(element, Bold())) - # elif element.tag == "Fill": - # obj.append(Fill(**self.process_attrib(element))) - # elif element.tag == "Rubi": - # obj.append(self.process_Rubi(element)) - # elif element.tag == "Yoko": - # obj.append(self.process_simple_char0(element, Yoko(**self.process_attrib(element)))) - # elif element.tag == "Tate": - # obj.append(self.process_simple_char2(element, Tate(**self.process_attrib(element)))) - # elif element.tag == "Nekase": - # obj.append(self.process_simple_char2(element, Nekase(**self.process_attrib(element)))) - elif element.tag == "EmpLine": - obj.append(self.process_simple_char0(element, EmpLine(**self.process_attrib(element)))) - # elif element.tag == "EmpDots": - # obj.append(self.process_simple_char0(element, EmpDots(**self.process_attrib(element)))) - # elif element.tag == "Gaiji": - # obj.append(self.process_text(element, Gaiji(**self.process_attrib(element)))) - # elif element.tag == "AltString": - # obj.append(self.process_AltString(element)) - else: - print "No DrawChar set processor for ", element.tag - if element.tail != None: - obj.append(Text(element.tail)) - - return obj - - def process_simple_char0(self, simple_char0, obj): - """Process an element in the SimpleChar0 set""" - - if simple_char0.text != None: - obj.append(Text(simple_char0.text)) - for element in simple_char0: - if element.tag == "Plot": - obj.append(self.process_text(element, self.process_Plot(element))) - # elif element.tag == "Gaiji": - # obj.append(process_text(element, Gaiji(**self.process_attrib(element)))) - # elif element.tag == "AltString": - # obj.append(process_AltString(element)) - else: - print "No SimpleChar0 set processor for ", element.tag - if element.tail != None: - obj.append(Text(element.tail)) - - return obj - - - def process_simple_char1(self, simple_char1, obj): - """Process an element in the SimpleChar1 set""" - if simple_char1.text != None: - obj.append(Text(simple_char1.text)) - - for element in simple_char1: - if element.tag == "Box": - obj.append(self.process_simple_char0(element), Box(**self.process_attrib(element))) - elif element.tag == "Sub": - obj.append(self.process_simple_char0(element, Sub(**self.process_attrib(element)))) - elif element.tag == "Sup": - obj.append(self.process_simple_char0(element, Sup(**self.process_attrib(element)))) - elif element.tag == "Space": - obj.append(Space(**self.process_attrib(element))) - # elif element.tag == "Rubi": - # obj.append(process_Rubi(element)) - # elif element.tag == "Gaiji": - # obj.append(process_text(element, Gaiji(**self.process_attrib(element)))) - # elif element.tag == "EmpDots": - # obj.append(process_simple_char0(element, EmpDots(**self.process_attrib(element)))) - # elif element.tag == "EmpLine": - # obj.append(process_simple_char0(element, EmpLine(**self.process_attrib(element)))) - # elif element.tag == "AltString": - # obj.append(process_AltString(element)) - else: - print "No SimpleChar1 set processor for ", element.tag - if element.tail != None: - obj.append(Text(element.tail)) - - return obj - - def process_simple_char2(self, simple_char2, obj): - """Process an element in the SimpleChar2 set""" - - if simple_char2.text != None: - obj.append(Text(simple_char2.text)) - - for element in simple_char2: - if element.tag == "Plot": - obj.append(self.process_text(element, self.process_Plot(element))) - # elif element.tag == "Gaiji": - # obj.append(process_text(element, Gaiji(**self.process_attrib(element)))) - # elif element.tag == "AltString": - # obj.append(process_AltString(element)) - else: - print "No SimpleChar2 set processor for ", element.tag - if element.tail != None: - obj.append(Text(element.tail)) - - return obj - - # - # occurs in ,
,