Added support for conversion of HTML tables.

Added support for common encodings to txt2lrf.
2025-07-09 03:04:10 -04:00 · 2007-05-21 00:40:06 +00:00 · 2007-05-21 00:40:06 +00:00 · 806aba6f80
commit 806aba6f80
parent b26adb541e
9 changed files with 492 additions and 39 deletions
--- a/src/libprs500/init.py
+++ b/src/libprs500/init.py
@ -33,7 +33,7 @@ You may have to adjust the GROUP and the location of the rules file to
 suit your distribution.
 """
-__version__   = "0.3.32"
+__version__   = "0.3.33"
 __docformat__ = "epytext"
 __author__    = "Kovid Goyal <kovid@kovidgoyal.net>"
--- a/src/libprs500/devices/prs500/prstypes.py
+++ b/src/libprs500/devices/prs500/prstypes.py
@ -352,7 +352,7 @@ class SetTime(Command):
        self.day = t[2]
        self.hour = t[3]
        self.minute = t[4]
-        # Hack you should actually update the entire time tree is 
+        # Hack you should actually update the entire time tree if 
        # second is > 59
        self.second = t[5] if t[5] < 60 else 59 
--- a/src/libprs500/ebooks/lrf/fonts/init.py
+++ b/src/libprs500/ebooks/lrf/fonts/init.py
@ -13,7 +13,11 @@
 ##    with this program; if not, write to the Free Software Foundation, Inc.,
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 import pkg_resources
-from PIL import ImageFont
+try:
    from PIL import ImageFont
 except ImportError:
    import ImageFont
 '''
 Default fonts used in the PRS500
 '''
@ -26,7 +30,8 @@ FONT_MAP = {
 def get_font(name, size, encoding='unic'):
    '''
    Get an ImageFont object by name. 
-    @param size: Size in pts
+    @param size: Font height in pixels. To convert from pts:
                 sz in pixels = (dpi/72) * size in pts
    @param encoding: Font encoding to use. E.g. 'unic', 'symbol', 'ADOB', 'ADBE', 'aprm'
    '''
    if name in FONT_MAP.keys():
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@ -39,6 +39,7 @@ from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream,
                Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas
 from libprs500.ebooks.lrf.pylrs.pylrs import Span as _Span
 from libprs500.ebooks.lrf import ConversionError, option_parser, Book, PRS500_PROFILE
 from libprs500.ebooks.lrf.html.table import Table 
 from libprs500 import extract, filename_to_utf8
 from libprs500.ptempfile import PersistentTemporaryFile
@ -303,6 +304,7 @@ class HTMLConverter(object):
        self.chapter_regex = chapter_regex #: Regex used to search for chapter titles
        self.link_exclude = link_exclude #: Ignore matching hrefs
        self.scaled_images = {}   #: Temporary files with scaled version of images        
        self.rotated_images = {}  #: Temporary files with rotated version of images        
        self.max_link_levels = max_link_levels #: Number of link levels to process recursively
        self.link_level  = link_level  #: Current link level
        self.blockquote_style = book.create_block_style(sidemargin=60, 
@ -317,6 +319,9 @@ class HTMLConverter(object):
        self.files   = {}         #: links that point to other files
        self.links_processed = False #: Whether links_processed has been called on this object
        self.font_delta = font_delta
        # Set by table processing code so that any <a name> within the table 
        # point to the previous element
        self.anchor_to_previous = None 
        self.cover = cover
        self.memory = []          #: Used to ensure that duplicate CSS unhandled erros are not reported
        self.in_ol = False #: Flag indicating we're in an <ol> element
@ -478,6 +483,15 @@ class HTMLConverter(object):
            return text
    def process_links(self):
        def add_toc_entry(text, target):
            # TextBlocks in Canvases have a None parent or an Objects Parent
            if target.parent != None and \
               hasattr(target.parent, 'objId'): 
                self.book.addTocEntry(ascii_text, tb)
            elif self.verbose:
                print "Cannot add link", ascii_text, "to TOC"
        def get_target_block(fragment, targets):
            '''Return the correct block for the <a name> element'''
            bs = targets[fragment]
@ -535,7 +549,7 @@ class HTMLConverter(object):
                if fragment in self.targets.keys():
                    tb = get_target_block(fragment, self.targets)
                    if self.is_root:
-                        self.book.addTocEntry(ascii_text, tb)                 
+                        add_toc_entry(ascii_text, tb)                        
                    sys.stdout.flush()
                    jb = JumpButton(tb)
                    self.book.append(jb)
@ -580,7 +594,7 @@ class HTMLConverter(object):
                else:
                    tb = conv.top
                if self.is_root:
-                    self.book.addTocEntry(ascii_text, tb)      
+                    add_toc_entry(ascii_text, tb)  
                jb = JumpButton(tb)                
                self.book.append(jb)
                cb = CharButton(jb, text=text)
@ -727,22 +741,32 @@ class HTMLConverter(object):
                                                         blockStyle=self.current_block.blockStyle)
    def process_image(self, path, tag_css, width=None, height=None):
        if self.rotated_images.has_key(path):
            path = self.rotated_images[path].name
        if self.scaled_images.has_key(path):
            path = self.scaled_images[path].name            
        im = PILImage.open(path)
        if width == None or height == None:            
            width, height = im.size
        def scale_image(width, height):
            pt = PersistentTemporaryFile(suffix='.jpeg')
            im.resize((int(width), int(height)), PILImage.ANTIALIAS).convert('RGB').save(pt, 'JPEG')
            pt.close()
            self.scaled_images[path] = pt
            return pt.name
-                
+        
-        if self.scaled_images.has_key(path):
+        if width > self.profile.page_width and width > height:
-            path = self.scaled_images[path].name
+            pt = PersistentTemporaryFile(suffix='.jpeg')
        im = PILImage.open(path)
        if width == None or height == None:            
            width, height = im.size
        if width > height:
            im = im.rotate(-90)
            im.convert('RGB').save(pt, 'JPEG')
            path = pt.name
            pt.close()            
            self.rotated_images[path] = pt
            width, height = im.size
        if height > self.profile.page_height:
            corrf = self.profile.page_height/(1.*height)
            width, height = floor(corrf*width), self.profile.page_height-1                        
@ -788,7 +812,7 @@ class HTMLConverter(object):
            self.end_page()
            self.current_page.append(Canvas(width=self.profile.page_width,
                                            height=height))
-            left = int(floor((self.profile.page_width - width)/2.))            
+            left = int(floor((self.profile.page_width - width)/2.))
            self.current_page.contents[0].put_object(ImageBlock(self.images[path]),
                                                  left, 0)
@ -824,6 +848,18 @@ class HTMLConverter(object):
            pass
        elif tagname == 'a' and self.max_link_levels >= 0:
            if tag.has_key('name'):
                if self.anchor_to_previous:
                    self.process_children(tag, tag_css)
                    return
                    for c in self.anchor_to_previous.contents:
                        if isinstance(c, (TextBlock, ImageBlock)):
                            self.targets[tag['name']] = c
                            return
                    tb = self.book.create_text_block()
                    tb.Paragraph(" ")
                    self.anchor_to_previous.append(tb)
                    self.targets[tag['name']] = tb                    
                    return
                previous = self.current_block
                self.process_children(tag, tag_css)
                target = None
@ -867,7 +903,7 @@ class HTMLConverter(object):
                    ['png', 'jpg', 'bmp', 'jpeg']:
                    self.process_image(path, tag_css)
                else:
-                    self.add_text('Link: ' + tag['href'], tag_css)
+                    self.add_text(self.get_text(tag), tag_css)
                    self.links.append(HTMLConverter.Link(self.current_para.contents[-1], tag))
        elif tagname == 'img':
            if tag.has_key('src') and os.access(unquote(tag['src']), os.R_OK):
@ -1010,30 +1046,45 @@ class HTMLConverter(object):
            if tag.has_key('face'):
                tag_css['font-family'] = tag['face']
            self.process_children(tag, tag_css)
-        elif tagname in ['br', 'tr']:
+        elif tagname in ['br']:
            self.current_para.append(CR())
            self.process_children(tag, tag_css)
        elif tagname in ['td']:
            self.current_para.append(' ')
            self.process_children(tag, tag_css)
        elif tagname == 'hr':
            self.end_current_para()            
            self.current_block.append(CR())
            self.end_current_block()
            self.current_page.RuledLine(linelength=self.profile.page_width)
        elif tagname == 'table':
            tag_css = self.tag_css(tag) # Table should not inherit CSS
            self.process_table(tag, tag_css)
        else:            
-            self.process_children(tag, tag_css)
+            self.process_children(tag, tag_css)        
        if end_page:
                self.end_page()
    def process_table(self, tag, tag_css):
        '''
        Lay out the <table> C{tag} and append it to the current page as a
        sequence of canvases, one canvas per table row.
        @param tag: The table tag
        @param tag_css: CSS to apply to the table
        '''
        self.end_current_block()
        rowpad, colpad = 10, 10
        table = Table(self, tag, tag_css, rowpad=rowpad, colpad=colpad)
        canvases = []
        for block, xpos, ypos, delta in table.blocks(self.profile.page_width):
            if block is None:
                # Start of a new row: ypos carries the height of the row
                canvases.append(Canvas(self.profile.page_width, ypos+rowpad,
                        blockrule='block-fixed'))
            else:
                # ypos is the vertical offset of this block inside its cell,
                # it must be honoured or the text blocks of a cell that has
                # more than one block are drawn on top of each other.
                # delta/2 centers the table horizontally.
                canvases[-1].put_object(block, xpos + int(delta/2.), ypos)
        for canvas in canvases:
            self.current_page.append(canvas)
        self.end_current_block()
    def writeto(self, path, lrs=False):
        '''
        Render the book to C{path}.
        @param lrs: If true C{renderLrs} is used, otherwise C{renderLrf}
        '''
        if lrs:
            self.book.renderLrs(path)
        else:
            self.book.renderLrf(path)
    def cleanup(self):
-        for _file in self.scaled_images.values():   
+        for _file in self.scaled_images.values() + self.rotated_images.values():   
            _file.__del__()
-        
+
 def process_file(path, options):
    cwd = os.getcwd()
    dirpath = None
@ -1070,7 +1121,7 @@ def process_file(path, options):
                tim.save(tf.name)
                tpath = tf.name
            else:
-                raise ConversionError, 'Cannot read from: %s', (options.cover,)
+                raise ConversionError, 'Cannot read from: %s'% (options.cover,)
        if not options.title:
--- a/src/libprs500/ebooks/lrf/html/demo/demo.html
+++ b/src/libprs500/ebooks/lrf/html/demo/demo.html
@ -2,20 +2,23 @@
 <head>
 <style type='text/css'>
 .toc { page-break-after: always; text-indent: 0em; }
 .tocpn            {text-align: right; }
 .tocchr           {text-align: right; font-variant: small-caps;}
 </style>
 </head>
  <h1>Demo of <span style='font-family:monospace'>html2lrf</span></h1>  
  <p>
-  This file contains a demonstration of the capabilities of   <span style='font-family:monospace'>html2lrf,</span>   the HTML to LRF converter   from <em>libprs500.</em> To obtain libprs500 visit  <span style='font:sans-serif'>https://libprs500.kovidgoyal.net</span>
+  This file contains a demonstration of the capabilities of   <span style='font-family:monospace'>html2lrf,</span>   the HTML to LRF converter   from <em>libprs500.</em> To obtain libprs500 visit<br/><span style='font:sans-serif'>https://libprs500.kovidgoyal.net</span>
  </p>
  <br/>
  <h2><a name='toc'>Table of Contents</a></h2>
  <ul style='page-break-after:always'>
-    <li><a href='#lists'>Demonstration of Lists</a></li>
+    <li><a href='#lists'>Lists</a></li>
    <li><a href='#tables'>Tables</a></li>
    <li><a href='#text'>Text formatting and ruled lines</a></li>
    <li><a href='#images'>Inline images</a></li>
    <li><a href='#recursive'>Recursive link following</a></li>
-    <li><a href='demo_ext.html'>The HTML used to create this file</a>
+    <!--<li><a href='demo_ext.html'>The HTML used to create this file</a>-->
 </ul>
 <h2><a name='lists'>Lists</a></h2>
@ -40,6 +43,53 @@
 <a href='#toc'>Table of Contents</a>
 </p>
 <h2><a name='tables'>Tables</a></h2>
 <p>
 Because I can!
 </p>
 <br/>
 <table>
 <tr><td colspan=4><h3 style="text-align:center">A matrix</h3></td></tr>
 <tr><td></td><td style="text-align:center"><b>Column 1</b></td><td style="text-align:center"><b>Column 2</b></td><td style="text-align:center"><b>Column 3</b></td></tr>
 <tr><td><b>Row 1</b></td><td><p style="text-align:center">(1, 1)</p></tr>
 <tr><td><b>Row 2</b></td><td></td><td style="text-align:center"><p>(2, 2)</p></td><td></td></tr>
 <tr><td><b>Row 3</b></td><td></td><td></td><td><p style="text-align:center">(3, 3)</p></td></tr>
 </table>
 <br/>
 <p>
 html2lrf supports both rowspan and colspan, but no other HTML table attributes, as it uses its own algorithm to determine optimal placement of cells. 
 </p>
 <br/>
 <p>
 The table conversion code is very new and likely to be swarming with bugs, so please report them at <br/><font face="monospace">https://libprs500.kovidgoyal.net/newticket</font>
 </p>
 <br/>
 <p style="page-break-after:always">
 On the next page you'll see a real life example taken from a Project Gutenberg text with no modifications. It shows off html2lrf's handling of rowspan and colspan.
 </p>
 <h3 align="center">Sample Complex Table of Contents</h3>
 <table summary="TOC">
 <tr><td colspan="3">&nbsp;</td><td align="right">PAGE</td></tr>
 <tr><td class="tocch" colspan="3"><a href="#PREFACE">Preface</a></td><td class="tocpn">v</td></tr>
 <tr><td class="tocch" colspan="3"><a href="#REFERENCE_WORKS">List of Works of Reference</a></td><td class="tocpn">vii</td></tr>
 <tr><td class="tocch" colspan="3"><a href="#LIST_OF_ILLUSTRATIONS">List of Illustrations</a></td><td class="tocpn">xi</td></tr>
 <tr><td class="tocch">Chapter</td><td class="tocchr">I.</td><td class="tocch"><a href="#CHAPTER_I">History of the Foundation</a></td><td class="tocpn">3</td></tr>
 <tr><td class="tocchr" colspan="2">II.</td><td class="tocch"><a href="#CHAPTER_II">Exterior of the Church</a></td><td class="tocpn">25</td></tr>
 <tr><td class="tocchr" colspan="2">III.</td><td class="tocch"><a href="#CHAPTER_III">Interior of the Church</a></td><td class="tocpn">33</td></tr>
 <tr><td class="tocchr" colspan="2">IV.</td><td class="tocch"><a href="#CHAPTER_IV">St. Bartholomew-the-Less and the Hospital</a></td><td class="tocpn">63</td></tr>
 <tr><td class="tocch">Appendix</td><td class="tocchr">I.</td><td class="tocch"><a href="#APPENDIX_I">The Priory Seals</a></td><td class="tocpn">73</td></tr>
 <tr><td class="tocchr" colspan="2">II.</td><td class="tocch"><a href="#APPENDIX_II">The Priors and Rectors</a></td><td class="tocpn">77</td></tr>
 <tr><td class="tocchr" colspan="2">III.</td><td class="tocch"><a href="#APPENDIX_III">Inventory of Vestments, etc.</a></td><td class="tocpn">79</td></tr>
 <tr><td class="tocchr" colspan="2">IV.</td><td class="tocch"><a href="#APPENDIX_IV">The Organ</a></td><td class="tocpn">80</td></tr>
 <tr><td class="tocch" colspan="3"><a href="#INDEX">Index</a></td><td class="tocpn">83</td></tr>
 </table>
 <p class='toc'>
 <hr />
 <a href='#toc'>Table of Contents</a>
 </p>
 <h2><a name='text'>Text formatting</a></h2>
 <p>
 A simple <i>paragraph</i> of <b>formatted 
--- a/src/libprs500/ebooks/lrf/html/table.py
+++ b/src/libprs500/ebooks/lrf/html/table.py
@ -0,0 +1,306 @@
 ##    Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
 ##    This program is free software; you can redistribute it and/or modify
 ##    it under the terms of the GNU General Public License as published by
 ##    the Free Software Foundation; either version 2 of the License, or
 ##    (at your option) any later version.
 ##
 ##    This program is distributed in the hope that it will be useful,
 ##    but WITHOUT ANY WARRANTY; without even the implied warranty of
 ##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 ##    GNU General Public License for more details.
 ##
 ##    You should have received a copy of the GNU General Public License along
 ##    with this program; if not, write to the Free Software Foundation, Inc.,
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 import math, sys
 from libprs500.ebooks.lrf.fonts import get_font
 from libprs500.ebooks.lrf.pylrs.pylrs import TextBlock, Text, CR, Span, \
                                             CharButton, Plot, Paragraph, \
                                             LrsTextTag
 def ceil(num):
    '''Return the smallest integer that is not less than C{num}, as an C{int}.'''
    rounded_up = math.ceil(num)
    return int(rounded_up)
 def print_xml(elem):
    '''
    Debugging aid: convert C{elem} to an element (utf8 encoded) and write
    it to stdout, followed by a newline.
    '''
    from libprs500.ebooks.lrf.pylrs.pylrs import ElementWriter
    element = elem.toElement('utf8')
    writer = ElementWriter(element, sourceEncoding='utf8')
    writer.write(sys.stdout)
    print
 def cattrs(base, extra):
    '''
    Return a copy of C{base} updated with the entries of C{extra}.
    C{base} itself is not modified.
    '''
    merged = base.copy()
    merged.update(extra)
    return merged
 def tokens(tb):
    '''
    Generate C{(token, attrs)} pairs for the contents of the text block C{tb}.
    Only the C{CR} and C{Paragraph} children of C{tb} are examined.
    A token is one of:
      1. The integer 1: a C{CR} that is a direct child of C{tb}. C{attrs} is None.
      2. The integer 2: a C{CR} found inside a paragraph. C{attrs} is None.
      3. A string: a run of text that has the same style. C{attrs} is a copy
         of the attributes that apply to it.
      4. A C{Plot} object. C{attrs} is None.
    '''        
    def process_element(x, attrs):
        # Recursively yield the tokens of a single paragraph element
        if isinstance(x, CR):
            yield 2, None
        elif isinstance(x, Text):
            yield x.text, cattrs(attrs, {})
        elif isinstance(x, basestring):
            yield x, cattrs(attrs, {})
        elif isinstance(x, (CharButton, LrsTextTag)):
            # Only the first child is looked at
            if x.contents:
                yield x.contents[0].text, cattrs(attrs, {})
        elif isinstance(x, Plot):
            yield x, None
        elif isinstance(x, Span):
            # Attributes of the span override the inherited ones
            attrs = cattrs(attrs, x.attrs)
            for y in x.contents:
                for z in process_element(y, attrs):
                    yield z
    for i in tb.contents:
        if isinstance(i, CR):
            yield 1, None
        elif isinstance(i, Paragraph):
            for j in i.contents: 
                attrs = {}
                if hasattr(j, 'attrs'):
                    attrs = j.attrs
                for k in process_element(j, attrs):                    
                    yield k
 class Cell(object):
    '''
    A table cell. The cell's tag is rendered by the converter onto a
    temporary page and the C{TextBlock}s that result are kept in
    C{self.text_blocks}, along with the cell's C{rowspan} and C{colspan}.
    '''
    def __init__(self, conv, cell, css):
        '''
        @param conv: The converter, its C{parse_tag} renders the cell
        @param cell: The tag of the cell
        @param css:  CSS to apply to the cell
        '''
        self.conv = conv
        self.cell = cell
        self.css  = css
        self.text_blocks = []
        self.rowspan = self.colspan = 1
        try:
            self.colspan = int(cell['colspan']) if cell.has_key('colspan') else 1
            self.rowspan = int(cell['rowspan']) if cell.has_key('rowspan') else 1
        except Exception:
            # Unparseable span values fall back to whatever was set so far
            if conv.verbose:
                print >>sys.stderr, "Error reading row/colspan for ", cell
        # Render the cell onto a scratch page. The converter state is saved
        # and restored in a finally clause so that a failure while parsing
        # the cell (or a cell nested in this one) does not leave the
        # converter pointing at the scratch page.
        pp = conv.current_page
        allow_new_page = conv.book.allow_new_page
        anchor_to_previous = conv.anchor_to_previous
        conv.book.allow_new_page = False
        conv.anchor_to_previous = pp
        try:
            conv.current_page = conv.book.create_page()
            conv.parse_tag(cell, css)
            conv.end_current_block()
            for item in conv.current_page.contents:
                if isinstance(item, TextBlock):
                    self.text_blocks.append(item)
        finally:
            conv.current_page = pp
            conv.book.allow_new_page = allow_new_page
            conv.anchor_to_previous = anchor_to_previous
        if not self.text_blocks:
            # An empty cell still needs a block so that it occupies space
            tb = conv.book.create_text_block()
            tb.Paragraph(' ')
            self.text_blocks.append(tb)
        for tb in self.text_blocks:
            tb.parent = None
            tb.objId  = 0
            # Needed as we have to eventually change this BlockStyle's width and 
            # height attributes. This blockstyle may be shared with other
            # elements, so doing that causes havoc.
            tb.blockStyle = conv.book.create_block_style()
            ts = conv.book.create_text_style(**tb.textStyle.attrs)
            ts.attrs['parindent'] = 0
            tb.textStyle = ts
            if ts.attrs['align'] == 'foot':
                if tb.contents and isinstance(tb.contents[-1], Paragraph):
                    tb.contents[-1].append(' ')
    def pts_to_pixels(self, pts):
        '''
        Convert C{pts} to pixels using the dpi of the converter's profile.
        C{pts} is divided by 10 before the conversion, i.e. it is taken to
        be in tenths of a point. The result is rounded up.
        '''
        pts = int(pts)
        return ceil((float(self.conv.profile.dpi)/72)*(pts/10.))
    def text_block_size(self, tb, maxwidth=sys.maxint, debug=False):
        '''
        Estimate the size of the text block C{tb} when its words are wrapped
        at C{maxwidth}. Words are measured with the fonts returned by L{get_font}.
        @param tb: The text block to measure
        @param maxwidth: Width at which a line is wrapped
        @param debug: Unused
        @return: The tuple C{(width, height)}. 3 is added to the measured width.
        '''
        ts = tb.textStyle.attrs
        default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
        parindent = self.pts_to_pixels(ts['parindent'])
        ls, ws = self.pts_to_pixels(ts['linespace']), self.pts_to_pixels(ts['wordspace'])
        top, bottom, left, right = 0, 0, parindent, parindent
        def add_word(width, height, left, right, top, bottom):
            # Place a word (or plot) of the given size and return the
            # updated cursor position and extents
            if left + width > maxwidth:
                # Does not fit: wrap to the start of the next line
                left = width + ws
                top += height + ls
                bottom = top+height if top+height > bottom else bottom
            else:
                left += (width + ws)
                right = left if left > right else right                    
                bottom = top+height if top+height > bottom else bottom
            return left, right, top, bottom
        for token, attrs in tokens(tb):
            font = default_font
            if isinstance(token, int): # Handle para and line breaks
                top = bottom
                # Token 1 starts a new paragraph and is indented, token 2 is
                # a line break inside a paragraph. This used to test
                # "int == 1", which is always False.
                left = parindent if token == 1 else 0
                continue
            if isinstance(token, Plot):
                width, height = self.pts_to_pixels(token.xsize), self.pts_to_pixels(token.ysize)
                left, right, top, bottom = add_word(width, height, left, right, top, bottom)
                continue
            ff = attrs.get('fontfacename', ts['fontfacename'])
            fs = attrs.get('fontsize', ts['fontsize'])
            if (ff, fs) != (ts['fontfacename'], ts['fontsize']):
                font = get_font(ff, self.pts_to_pixels(fs))
            for word in token.split():
                width, height = font.getsize(word)
                left, right, top, bottom = add_word(width, height, left, right, top, bottom)
        return right+3, bottom
    def text_block_preferred_width(self, tb, debug=False):
        '''Width of C{tb} when its lines are never wrapped.'''
        return self.text_block_size(tb, sys.maxint, debug=debug)[0]
    def preferred_width(self, debug=False):
        '''The largest preferred width among the text blocks of this cell.'''
        return ceil(max([self.text_block_preferred_width(i, debug=debug) for i in self.text_blocks]))
    def height(self, width):
        '''Sum of the heights of the text blocks of this cell when wrapped at C{width}.'''
        return sum([self.text_block_size(i, width)[1] for i in self.text_blocks])
 class Row(object):
    '''
    A table row: the list of L{Cell}s created from the C{td} tags found
    in the row tag.
    '''
    def __init__(self, conv, row, css, colpad):
        '''
        @param conv: The converter, passed on to every L{Cell}
        @param row: The tag of the row
        @param css: CSS of the row, used as the base for the CSS of each cell
        @param colpad: Stored as C{self.colpad}
        '''
        self.cells = []
        self.colpad = colpad
        cells = row.findAll('td')
        for cell in cells:
            ccss = conv.tag_css(cell, css)
            self.cells.append(Cell(conv, cell, ccss))
    def number_of_cells(self):
        '''Number of cells in this row. Respects colspan'''
        return sum([cell.colspan for cell in self.cells])
    def height(self, widths):
        '''
        Height of the tallest cell in this row, 0 if the row has no cells.
        @param widths: List of column widths. A cell is as wide as the
                       columns it spans.
        '''
        i, heights = 0, []
        for cell in self.cells:
            width = sum(widths[i:i+cell.colspan])
            heights.append(cell.height(width))
            i += cell.colspan
        if not heights:
            # A row without any <td> would make max() raise ValueError
            return 0
        return max(heights)
    def preferred_width(self, col):
        '''
        Preferred width of the cell that occupies column C{col}. Cells that
        span more than one column and rows without cells report 0.
        '''
        i = -1
        cell = None
        for cell in self.cells:
            for k in range(0, cell.colspan):
                if i == col:
                    break
                i += 1
            if i == col:
                break
        if cell is None:
            # Row without any <td>
            return 0
        return 0 if cell.colspan > 1 else cell.preferred_width()
    def cell_iterator(self):
        '''Iterate over the cells of this row, in order.'''
        for c in self.cells:
            yield c
 class Table(object):
    '''
    A table: the list of L{Row}s created from the C{tr} tags found in the
    table tag. Computes column widths and yields the positioned text blocks.
    '''
    def __init__(self, conv, table, css, rowpad=10, colpad=10):
        '''
        @param conv: The converter, passed on to every L{Row}
        @param table: The tag of the table
        @param css: CSS of the table, used as the base for the CSS of each row
        @param rowpad: Padding between rows
        @param colpad: Padding added to every column
        '''
        self.rows = []
        self.conv = conv
        self.rowpad = rowpad
        self.colpad = colpad
        rows = table.findAll('tr')
        for row in rows:            
            rcss = conv.tag_css(row, css)
            self.rows.append(Row(conv, row, rcss, colpad))
    def number_of_columns(self):
        '''The largest number of cells (respecting colspan) found in any row.'''
        max = 0
        for row in self.rows:
            max = row.number_of_cells() if row.number_of_cells() > max else max
        return max
    def number_or_rows(self):
        '''Number of rows in this table.'''
        return len(self.rows)
    def height(self, maxwidth):
        ''' Return the sum of the row heights with self.rowpad between consecutive rows'''
        widths = self.get_widths(maxwidth)
        return sum([row.height(widths) + self.rowpad for row in self.rows]) - self.rowpad
    def get_widths(self, maxwidth):
        '''
        Return widths of columns + self.colpad

        Every column starts out as wide as the largest preferred width 
        reported for it by any row. All columns are then shrunk by 5% at a time
        (at most 100 times) until they fit into C{maxwidth}.
        '''
        rows, cols = self.number_or_rows(), self.number_of_columns()
        widths = range(cols)
        for c in range(cols):
            cellwidths = [ 0 for i in range(rows)]
            for r in range(rows):
                try:
                    cellwidths[r] = self.rows[r].preferred_width(c)
                except IndexError:
                    continue                 
            widths[c] = max(cellwidths)
        itercount = 0
        # The padding between columns is not available for content
        while sum(widths) > maxwidth-((len(widths)-1)*self.colpad) and itercount < 100:
            widths = [ceil((95./100.)*w) for w in widths]
            itercount += 1
        return [i+self.colpad for i in widths]
    def blocks(self, maxwidth):       
        '''
        Generate the positioned contents of the table, row by row. For every
        row the tuple C{(None, 0, row height, 0)} is yielded first, followed 
        by a tuple C{(text block, x, y, delta)} for each text block of each 
        cell in that row. C{x} is the position of the cell's first column, 
        C{y} the offset of the block inside its cell and C{delta} the part of 
        C{maxwidth} not used by the columns (never negative).
        The blockStyle of every yielded text block is replaced by a new one
        that carries the block's width and height.
        '''
        rows, cols = self.number_or_rows(), self.number_of_columns()
        cellmatrix = [[None for c in range(cols)] for r in range(rows)]        
        rowpos = [0 for i in range(rows)]
        # Place every cell at the next free column of its row
        for r in range(rows):
            nc = self.rows[r].cell_iterator()
            try:
                while True:
                    cell = nc.next()
                    cellmatrix[r][rowpos[r]] = cell
                    rowpos[r] += cell.colspan
                    # A cell spanning several rows moves the insertion point
                    # of each row it extends into by one
                    for k in range(1, cell.rowspan):
                        try:
                            rowpos[r+k] += 1
                        except IndexError:
                            break
            except StopIteration: # No more cells in this row
                continue
        widths = self.get_widths(maxwidth)
        heights = [row.height(widths) for row in self.rows]
        xpos = [sum(widths[:i]) for i in range(cols)]
        delta = maxwidth - sum(widths)
        if delta < 0: 
            delta = 0
        for r in range(len(cellmatrix)):
            yield None, 0, heights[r], 0
            for c in range(len(cellmatrix[r])):
                cell = cellmatrix[r][c]
                if not cell:
                    continue
                width = sum(widths[c:c+cell.colspan])
                sypos = 0
                for tb in cell.text_blocks:
                    tb.blockStyle = self.conv.book.create_block_style(
                                    blockwidth=width, 
                                    blockheight=cell.text_block_size(tb, width)[1])
                    yield tb, xpos[c], sypos, delta
                    sypos += tb.blockStyle.attrs['blockheight']
--- a/src/libprs500/ebooks/lrf/meta.py
+++ b/src/libprs500/ebooks/lrf/meta.py
@ -94,7 +94,12 @@ class xml_attr_field(object):
    def __get__(self, obj, typ=None):
        """ Return the data in this field or '' if the field is empty """
-        document = dom.parseString(obj.info)
+        try:
            document = dom.parseString(obj.info)
        except Exception, err:
            print >>sys.stderr, "Could not parse XML:", err
            print obj.info
            raise
        elems = document.getElementsByTagName(self.tag_name)
        if len(elems):
            elem = None
@ -108,7 +113,12 @@ class xml_attr_field(object):
    def __set__(self, obj, val):
        if val == None:
            val = ""
-        document = dom.parseString(obj.info)
+        try:
            document = dom.parseString(obj.info)
        except Exception, err:
            print >>sys.stderr, "Could not parse XML:", err
            print obj.info
            raise
        elems = document.getElementsByTagName(self.tag_name)
        if len(elems):
            elem = None
@ -142,7 +152,13 @@ class xml_field(object):
    def __get__(self, obj, typ=None): 
        """ Return the data in this field or '' if the field is empty """
-        document = dom.parseString(obj.info)
+        try:
            document = dom.parseString(obj.info)
        except Exception, err:
            print >>sys.stderr, "Could not parse XML:", err
            print obj.info
            raise
        elems = document.getElementsByTagName(self.tag_name)
        if len(elems):
            elem = None
@ -158,7 +174,12 @@ class xml_field(object):
    def __set__(self, obj, val):
        if val == None:
            val = ""
-        document = dom.parseString(obj.info)
+        try:
            document = dom.parseString(obj.info)
        except Exception, err:
            print >>sys.stderr, "Could not parse XML:", err
            print obj.info
            raise
        def create_elem():
            elem = document.createElement(self.tag_name)
            elem.appendChild(dom.Text())
--- a/src/libprs500/ebooks/lrf/pylrs/pylrs.py
+++ b/src/libprs500/ebooks/lrf/pylrs/pylrs.py
@ -56,6 +56,8 @@ DEFAULT_GENREADING      = "fs"          # default is yes to both lrf and lrs
 class LrsError(Exception):
    # Raised when an invalid LRS structure is requested, e.g. a TOC
    # destination that has not been appended to a page or book yet
    pass
 class ContentError(Exception):
    # Raised by Book.create_page_style while Book.allow_new_page is False
    pass
 def _checkExists(filename):
    if not os.path.exists(filename):
@ -435,6 +437,8 @@ class Book(Delegator):
        self.applySetting("sourceencoding", DEFAULT_SOURCE_ENCODING)
        self.applySettings(settings, testValid=True)
        self.allow_new_page = True #: If False L{create_page} raises an exception
    def create_text_style(self, **settings):
        ans = TextStyle(**self.defaultTextStyle.attrs.copy())
@ -447,6 +451,8 @@ class Book(Delegator):
        return ans
    def create_page_style(self, **settings):
        '''
        Return a new C{PageStyle}: a copy of the default page style updated
        with C{settings}.
        @raise ContentError: If C{self.allow_new_page} is False
        '''
        if self.allow_new_page:
            style = PageStyle(**self.defaultPageStyle.attrs.copy())
            style.update(settings)
            return style
        raise ContentError
@ -641,12 +647,15 @@ class TableOfContents(object):
            raise LrsError, "TOC destination must be a TextBlock, ImageBlock or RuledLine"+\
                            " not a " + str(type(textBlock))
-        if textBlock.parent is None or not isinstance(textBlock.parent, Page):
+        if textBlock.parent is None:
            raise LrsError, "TOC text block must be already appended to a page"
        if textBlock.parent.parent is None:
            raise LrsError, \
                    "TOC destination page must be already appended to a book"
        if not hasattr(textBlock.parent, 'objId'):
            raise LrsError, "TOC destination must be appended to a container with an objID"
        self.tocEntries.append(TocLabel(tocLabel, textBlock))
        textBlock.tocLabel = tocLabel
@ -1373,7 +1382,6 @@ class TextBlock(LrsObject, LrsContainer):
        self.textSettings = {}
        self.blockSettings = {}
        for name, value in settings.items():
            if name in TextStyle.validSettings:
@ -1428,7 +1436,6 @@ class TextBlock(LrsObject, LrsContainer):
            tb.append(content.toElement(sourceEncoding))
        return tb
    def getReferencedObjIds(self):
        ids = [self.objId, self.extraId, self.blockStyle.objId,
@ -2111,7 +2118,7 @@ class PutObj(LrsContainer):
        self.y1 = int(y)
-    def appendReferencedObjects(self, parent):
+    def appendReferencedObjects(self, parent):        
        if self.content.parent is None:
            parent.append(self.content)
--- a/src/libprs500/ebooks/lrf/txt/convert_from.py
+++ b/src/libprs500/ebooks/lrf/txt/convert_from.py
@ -17,6 +17,7 @@ Convert .txt files to .lrf
 """
 import os, sys
 from libprs500.ebooks import BeautifulSoup
 from libprs500.ebooks.lrf import ConversionError, option_parser
 from libprs500.ebooks.lrf import Book
 from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, Italic, Bold, BookSetting
@ -63,7 +64,7 @@ def convert_txt(path, options):
                    C{author}, C{title}, C{encoding} (the assumed encoding of 
                    the text in C{path}.)
    """
-    import fileinput
+    import codecs
    header = None
    if options.header:
        header = Paragraph()
@ -84,7 +85,19 @@ def convert_txt(path, options):
    block = book.create_text_block()
    pg.append(block)
    book.append(pg)
-    for line in fileinput.input(path):
+    lines = ""
    try:
        lines = codecs.open(path, 'rb', 'ascii').readlines()
        print 'huh'
    except UnicodeDecodeError:
            try:
                lines = codecs.open(path, 'rb', 'cp1252').readlines()
            except UnicodeDecodeError:
                try:
                    lines = codecs.open(path, 'rb', 'iso-8859-1').readlines()
                except UnicodeDecodeError:
                    lines = codecs.open(path, 'rb', 'utf8').readlines()
    for line in lines:
        line = line.strip()
        if line:
            buffer = buffer.rstrip() + ' ' + line