Added support for conversion of HTML tables.

Added support for common encodings to txt2lrf.
This commit is contained in:
Kovid Goyal 2007-05-21 00:40:06 +00:00
parent b26adb541e
commit 806aba6f80
9 changed files with 492 additions and 39 deletions

View File

@ -33,7 +33,7 @@ You may have to adjust the GROUP and the location of the rules file to
suit your distribution.
"""
__version__ = "0.3.32"
__version__ = "0.3.33"
__docformat__ = "epytext"
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"

View File

@ -352,7 +352,7 @@ class SetTime(Command):
self.day = t[2]
self.hour = t[3]
self.minute = t[4]
# Hack you should actually update the entire time tree is
# Hack you should actually update the entire time tree if
# second is > 59
self.second = t[5] if t[5] < 60 else 59

View File

@ -13,7 +13,11 @@
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import pkg_resources
from PIL import ImageFont
try:
from PIL import ImageFont
except ImportError:
import ImageFont
'''
Default fonts used in the PRS500
'''
@ -26,7 +30,8 @@ FONT_MAP = {
def get_font(name, size, encoding='unic'):
'''
Get an ImageFont object by name.
@param size: Size in pts
@param size: Font height in pixels. To convert from pts:
sz in pixels = (dpi/72) * size in pts
@param encoding: Font encoding to use. E.g. 'unic', 'symbol', 'ADOB', 'ADBE', 'aprm'
'''
if name in FONT_MAP.keys():

View File

@ -39,6 +39,7 @@ from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream,
Plot, Image, BlockSpace, RuledLine, BookSetting, Canvas
from libprs500.ebooks.lrf.pylrs.pylrs import Span as _Span
from libprs500.ebooks.lrf import ConversionError, option_parser, Book, PRS500_PROFILE
from libprs500.ebooks.lrf.html.table import Table
from libprs500 import extract, filename_to_utf8
from libprs500.ptempfile import PersistentTemporaryFile
@ -303,6 +304,7 @@ class HTMLConverter(object):
self.chapter_regex = chapter_regex #: Regex used to search for chapter titles
self.link_exclude = link_exclude #: Ignore matching hrefs
self.scaled_images = {} #: Temporary files with scaled version of images
self.rotated_images = {} #: Temporary files with rotated version of images
self.max_link_levels = max_link_levels #: Number of link levels to process recursively
self.link_level = link_level #: Current link level
self.blockquote_style = book.create_block_style(sidemargin=60,
@ -317,6 +319,9 @@ class HTMLConverter(object):
self.files = {} #: links that point to other files
self.links_processed = False #: Whether links_processed has been called on this object
self.font_delta = font_delta
# Set by the table processing code so that any <a name> within the table
# points to the previous element
self.anchor_to_previous = None
self.cover = cover
self.memory = [] #: Used to ensure that duplicate CSS unhandled errors are not reported
self.in_ol = False #: Flag indicating we're in an <ol> element
@ -478,6 +483,15 @@ class HTMLConverter(object):
return text
def process_links(self):
def add_toc_entry(text, target):
# TextBlocks in Canvases have a None parent or an Objects Parent
if target.parent != None and \
hasattr(target.parent, 'objId'):
self.book.addTocEntry(ascii_text, tb)
elif self.verbose:
print "Cannot add link", ascii_text, "to TOC"
def get_target_block(fragment, targets):
'''Return the correct block for the <a name> element'''
bs = targets[fragment]
@ -535,7 +549,7 @@ class HTMLConverter(object):
if fragment in self.targets.keys():
tb = get_target_block(fragment, self.targets)
if self.is_root:
self.book.addTocEntry(ascii_text, tb)
add_toc_entry(ascii_text, tb)
sys.stdout.flush()
jb = JumpButton(tb)
self.book.append(jb)
@ -580,7 +594,7 @@ class HTMLConverter(object):
else:
tb = conv.top
if self.is_root:
self.book.addTocEntry(ascii_text, tb)
add_toc_entry(ascii_text, tb)
jb = JumpButton(tb)
self.book.append(jb)
cb = CharButton(jb, text=text)
@ -727,22 +741,32 @@ class HTMLConverter(object):
blockStyle=self.current_block.blockStyle)
def process_image(self, path, tag_css, width=None, height=None):
if self.rotated_images.has_key(path):
path = self.rotated_images[path].name
if self.scaled_images.has_key(path):
path = self.scaled_images[path].name
im = PILImage.open(path)
if width == None or height == None:
width, height = im.size
def scale_image(width, height):
pt = PersistentTemporaryFile(suffix='.jpeg')
im.resize((int(width), int(height)), PILImage.ANTIALIAS).convert('RGB').save(pt, 'JPEG')
pt.close()
self.scaled_images[path] = pt
return pt.name
if self.scaled_images.has_key(path):
path = self.scaled_images[path].name
im = PILImage.open(path)
if width == None or height == None:
width, height = im.size
if width > height:
if width > self.profile.page_width and width > height:
pt = PersistentTemporaryFile(suffix='.jpeg')
im = im.rotate(-90)
im.convert('RGB').save(pt, 'JPEG')
path = pt.name
pt.close()
self.rotated_images[path] = pt
width, height = im.size
if height > self.profile.page_height:
corrf = self.profile.page_height/(1.*height)
width, height = floor(corrf*width), self.profile.page_height-1
@ -788,7 +812,7 @@ class HTMLConverter(object):
self.end_page()
self.current_page.append(Canvas(width=self.profile.page_width,
height=height))
left = int(floor((self.profile.page_width - width)/2.))
left = int(floor((self.profile.page_width - width)/2.))
self.current_page.contents[0].put_object(ImageBlock(self.images[path]),
left, 0)
@ -824,6 +848,18 @@ class HTMLConverter(object):
pass
elif tagname == 'a' and self.max_link_levels >= 0:
if tag.has_key('name'):
if self.anchor_to_previous:
self.process_children(tag, tag_css)
return
for c in self.anchor_to_previous.contents:
if isinstance(c, (TextBlock, ImageBlock)):
self.targets[tag['name']] = c
return
tb = self.book.create_text_block()
tb.Paragraph(" ")
self.anchor_to_previous.append(tb)
self.targets[tag['name']] = tb
return
previous = self.current_block
self.process_children(tag, tag_css)
target = None
@ -867,7 +903,7 @@ class HTMLConverter(object):
['png', 'jpg', 'bmp', 'jpeg']:
self.process_image(path, tag_css)
else:
self.add_text('Link: ' + tag['href'], tag_css)
self.add_text(self.get_text(tag), tag_css)
self.links.append(HTMLConverter.Link(self.current_para.contents[-1], tag))
elif tagname == 'img':
if tag.has_key('src') and os.access(unquote(tag['src']), os.R_OK):
@ -1010,30 +1046,45 @@ class HTMLConverter(object):
if tag.has_key('face'):
tag_css['font-family'] = tag['face']
self.process_children(tag, tag_css)
elif tagname in ['br', 'tr']:
elif tagname in ['br']:
self.current_para.append(CR())
self.process_children(tag, tag_css)
elif tagname in ['td']:
self.current_para.append(' ')
self.process_children(tag, tag_css)
elif tagname == 'hr':
self.end_current_para()
self.current_block.append(CR())
self.end_current_block()
self.current_page.RuledLine(linelength=self.profile.page_width)
elif tagname == 'table':
tag_css = self.tag_css(tag) # Table should not inherit CSS
self.process_table(tag, tag_css)
else:
self.process_children(tag, tag_css)
self.process_children(tag, tag_css)
if end_page:
self.end_page()
def process_table(self, tag, tag_css):
    '''
    Lay out the <table> element C{tag} and append it to the current page.

    Each table row becomes one fixed Canvas that is as wide as the page.
    The text blocks of the cells in that row are placed on the canvas at
    the positions computed by L{Table.blocks}.

    @param tag: The <table> tag
    @param tag_css: CSS that applies to the table
    '''
    self.end_current_block()
    rowpad = colpad = 10
    table = Table(self, tag, tag_css, rowpad=rowpad, colpad=colpad)
    canvases = []
    for block, xpos, ypos, delta in table.blocks(self.profile.page_width):
        if not block:
            # Start of a new row: ypos is the height of the row, to which
            # the vertical (row) padding is added.
            canvases.append(Canvas(self.profile.page_width, ypos+rowpad,
                                   blockrule='block-fixed'))
        else:
            # ypos is the vertical offset of this block within its cell.
            # It must be honoured, otherwise multiple blocks in the same
            # cell are drawn on top of each other. delta is the unused
            # horizontal space, half of which is used to center the table.
            canvases[-1].put_object(block, xpos + int(delta/2.), ypos)
    for canvas in canvases:
        self.current_page.append(canvas)
    self.end_current_block()
def writeto(self, path, lrs=False):
    '''
    Render the book to C{path}.

    @param path: Destination passed to the renderer
    @param lrs: If True render with C{renderLrs}, otherwise with C{renderLrf}
    '''
    if lrs:
        self.book.renderLrs(path)
    else:
        self.book.renderLrf(path)
def cleanup(self):
for _file in self.scaled_images.values():
for _file in self.scaled_images.values() + self.rotated_images.values():
_file.__del__()
def process_file(path, options):
cwd = os.getcwd()
dirpath = None
@ -1070,7 +1121,7 @@ def process_file(path, options):
tim.save(tf.name)
tpath = tf.name
else:
raise ConversionError, 'Cannot read from: %s', (options.cover,)
raise ConversionError, 'Cannot read from: %s'% (options.cover,)
if not options.title:

View File

@ -2,20 +2,23 @@
<head>
<style type='text/css'>
.toc { page-break-after: always; text-indent: 0em; }
.tocpn {text-align: right; }
.tocchr {text-align: right; font-variant: small-caps;}
</style>
</head>
<h1>Demo of <span style='font-family:monospace'>html2lrf</span></h1>
<p>
This file contains a demonstration of the capabilities of <span style='font-family:monospace'>html2lrf,</span> the HTML to LRF converter from <em>libprs500.</em> To obtain libprs500 visit <span style='font:sans-serif'>https://libprs500.kovidgoyal.net</span>
This file contains a demonstration of the capabilities of <span style='font-family:monospace'>html2lrf,</span> the HTML to LRF converter from <em>libprs500.</em> To obtain libprs500 visit<br/><span style='font:sans-serif'>https://libprs500.kovidgoyal.net</span>
</p>
<br/>
<h2><a name='toc'>Table of Contents</a></h2>
<ul style='page-break-after:always'>
<li><a href='#lists'>Demonstration of Lists</a></li>
<li><a href='#lists'>Lists</a></li>
<li><a href='#tables'>Tables</a></li>
<li><a href='#text'>Text formatting and ruled lines</a></li>
<li><a href='#images'>Inline images</a></li>
<li><a href='#recursive'>Recursive link following</a></li>
<li><a href='demo_ext.html'>The HTML used to create this file</a>
<!--<li><a href='demo_ext.html'>The HTML used to create this file</a>-->
</ul>
<h2><a name='lists'>Lists</a></h2>
@ -40,6 +43,53 @@
<a href='#toc'>Table of Contents</a>
</p>
<h2><a name='tables'>Tables</a></h2>
<p>
Because I can!
</p>
<br/>
<table>
<tr><td colspan=4><h3 style="text-align:center">A matrix</h3></td></tr>
<tr><td></td><td style="text-align:center"><b>Column 1</b></td><td style="text-align:center"><b>Column 2</b></td><td style="text-align:center"><b>Column 3</b></td></tr>
<tr><td><b>Row 1</b></td><td><p style="text-align:center">(1, 1)</p></td><td></td><td></td></tr>
<tr><td><b>Row 2</b></td><td></td><td style="text-align:center"><p>(2, 2)</p></td><td></td></tr>
<tr><td><b>Row 3</b></td><td></td><td></td><td><p style="text-align:center">(3, 3)</p></td></tr>
</table>
<br/>
<p>
html2lrf supports both rowspan and colspan, but no other HTML table attributes, as it uses its own algorithm to determine optimal placement of cells.
</p>
<br/>
<p>
The table conversion code is very new and likely to be swarming with bugs, so please report them at <br/><font face="monospace">https://libprs500.kovidgoyal.net/newticket</font>
</p>
<br/>
<p style="page-break-after:always">
On the next page you'll see a real life example taken from a Project Gutenberg text with no modifications. It shows off html2lrf's handling of rowspan and colspan.
</p>
<h3 align="center">Sample Complex Table of Contents</h3>
<table summary="TOC">
<tr><td colspan="3">&nbsp;</td><td align="right">PAGE</td></tr>
<tr><td class="tocch" colspan="3"><a href="#PREFACE">Preface</a></td><td class="tocpn">v</td></tr>
<tr><td class="tocch" colspan="3"><a href="#REFERENCE_WORKS">List of Works of Reference</a></td><td class="tocpn">vii</td></tr>
<tr><td class="tocch" colspan="3"><a href="#LIST_OF_ILLUSTRATIONS">List of Illustrations</a></td><td class="tocpn">xi</td></tr>
<tr><td class="tocch">Chapter</td><td class="tocchr">I.</td><td class="tocch"><a href="#CHAPTER_I">History of the Foundation</a></td><td class="tocpn">3</td></tr>
<tr><td class="tocchr" colspan="2">II.</td><td class="tocch"><a href="#CHAPTER_II">Exterior of the Church</a></td><td class="tocpn">25</td></tr>
<tr><td class="tocchr" colspan="2">III.</td><td class="tocch"><a href="#CHAPTER_III">Interior of the Church</a></td><td class="tocpn">33</td></tr>
<tr><td class="tocchr" colspan="2">IV.</td><td class="tocch"><a href="#CHAPTER_IV">St. Bartholomew-the-Less and the Hospital</a></td><td class="tocpn">63</td></tr>
<tr><td class="tocch">Appendix</td><td class="tocchr">I.</td><td class="tocch"><a href="#APPENDIX_I">The Priory Seals</a></td><td class="tocpn">73</td></tr>
<tr><td class="tocchr" colspan="2">II.</td><td class="tocch"><a href="#APPENDIX_II">The Priors and Rectors</a></td><td class="tocpn">77</td></tr>
<tr><td class="tocchr" colspan="2">III.</td><td class="tocch"><a href="#APPENDIX_III">Inventory of Vestments, etc.</a></td><td class="tocpn">79</td></tr>
<tr><td class="tocchr" colspan="2">IV.</td><td class="tocch"><a href="#APPENDIX_IV">The Organ</a></td><td class="tocpn">80</td></tr>
<tr><td class="tocch" colspan="3"><a href="#INDEX">Index</a></td><td class="tocpn">83</td></tr>
</table>
<p class='toc'>
<hr />
<a href='#toc'>Table of Contents</a>
</p>
<h2><a name='text'>Text formatting</a></h2>
<p>
A simple <i>paragraph</i> of <b>formatted

View File

@ -0,0 +1,306 @@
## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import math, sys
from libprs500.ebooks.lrf.fonts import get_font
from libprs500.ebooks.lrf.pylrs.pylrs import TextBlock, Text, CR, Span, \
CharButton, Plot, Paragraph, \
LrsTextTag
def ceil(num):
    '''Return C{num} rounded up to the nearest whole number, as an C{int}.'''
    rounded_up = math.ceil(num)
    return int(rounded_up)
def print_xml(elem):
    '''
    Debugging aid: write the XML representation of C{elem} to stdout.

    @param elem: An object with a C{toElement(encoding)} method
    '''
    from libprs500.ebooks.lrf.pylrs.pylrs import ElementWriter
    writer = ElementWriter(elem.toElement('utf8'), sourceEncoding='utf8')
    writer.write(sys.stdout)
    print
def cattrs(base, extra):
    '''
    Return a copy of C{base} updated with the entries of C{extra}.
    Entries in C{extra} take precedence. C{base} itself is not modified.
    '''
    merged = base.copy()
    merged.update(extra)
    return merged
def tokens(tb):
    '''
    Generate the tokens of the text block C{tb} as C{(token, attrs)} pairs.

    A token is one of:
      1. The integer 1, for a CR that is a direct child of the text block.
         C{attrs} is None.
      2. The integer 2, for a CR inside a paragraph. C{attrs} is None.
      3. A string, i.e. a block of text that has the same style. C{attrs}
         is a copy of the attributes that apply to the text.
      4. A Plot object. C{attrs} is None.

    Direct children of the text block that are neither a CR nor a
    Paragraph, and paragraph content of any other type, yield nothing.
    '''
    def process_element(elem, attrs):
        if isinstance(elem, CR):
            yield 2, None
        elif isinstance(elem, Text):
            yield elem.text, cattrs(attrs, {})
        elif isinstance(elem, basestring):
            yield elem, cattrs(attrs, {})
        elif isinstance(elem, (CharButton, LrsTextTag)):
            # Only the text of the first child is used
            if elem.contents:
                yield elem.contents[0].text, cattrs(attrs, {})
        elif isinstance(elem, Plot):
            yield elem, None
        elif isinstance(elem, Span):
            # The attributes of the span override the inherited ones
            merged = cattrs(attrs, elem.attrs)
            for child in elem.contents:
                for tok in process_element(child, merged):
                    yield tok

    for item in tb.contents:
        if isinstance(item, CR):
            yield 1, None
        elif isinstance(item, Paragraph):
            for child in item.contents:
                style = child.attrs if hasattr(child, 'attrs') else {}
                for tok in process_element(child, style):
                    yield tok
class Cell(object):
    '''
    A table cell (<td>).

    The contents of the cell are converted into TextBlocks by running the
    converter on the cell with a temporary page as the current page. The
    resulting blocks are stored in C{self.text_blocks} and their sizes can
    be estimated via L{text_block_size}.
    '''

    def __init__(self, conv, cell, css):
        '''
        @param conv: The converter used to process the contents of the cell
        @param cell: The <td> tag
        @param css: CSS that applies to the cell
        '''
        self.conv = conv
        self.cell = cell
        self.css = css
        self.text_blocks = []
        self.rowspan = self.colspan = 1
        try:
            self.colspan = int(cell['colspan']) if cell.has_key('colspan') else 1
            self.rowspan = int(cell['rowspan']) if cell.has_key('rowspan') else 1
        except (TypeError, ValueError):
            # Unparseable span: keep whatever was set so far
            if conv.verbose:
                print >>sys.stderr, "Error reading row/colspan for ", cell

        # Render the cell into a temporary page, then collect its TextBlocks.
        pp = conv.current_page
        try:
            conv.book.allow_new_page = False
            conv.anchor_to_previous = pp
            conv.current_page = conv.book.create_page()
            conv.parse_tag(cell, css)
            conv.end_current_block()
            for item in conv.current_page.contents:
                if isinstance(item, TextBlock):
                    self.text_blocks.append(item)
        finally:
            # Always restore the converter, even if the cell fails to parse
            conv.current_page = pp
            conv.book.allow_new_page = True
            conv.anchor_to_previous = None

        if not self.text_blocks:
            # Empty cell: use a placeholder block containing a single space
            tb = conv.book.create_text_block()
            tb.Paragraph(' ')
            self.text_blocks.append(tb)
        for tb in self.text_blocks:
            tb.parent = None
            tb.objId = 0
            # Needed as we have to eventually change this BlockStyle's width and
            # height attributes. This blockstyle may be shared with other
            # elements, so doing that causes havoc.
            tb.blockStyle = conv.book.create_block_style()
            ts = conv.book.create_text_style(**tb.textStyle.attrs)
            ts.attrs['parindent'] = 0
            tb.textStyle = ts
            if ts.attrs['align'] == 'foot':
                if tb.contents and isinstance(tb.contents[-1], Paragraph):
                    tb.contents[-1].append(' ')

    def pts_to_pixels(self, pts):
        '''
        Convert C{pts} to pixels using the dpi of the converter's profile.
        C{pts} is converted to an int and divided by 10 before the
        conversion, i.e. it is treated as being in tenths of a point.
        The result is rounded up.
        '''
        pts = int(pts)
        return ceil((float(self.conv.profile.dpi)/72)*(pts/10.))

    def text_block_size(self, tb, maxwidth=sys.maxint, debug=False):
        '''
        Estimate the size in pixels of the text block C{tb}.

        @param tb: The text block
        @param maxwidth: Width at which words wrap onto a new line
        @param debug: Unused
        @return: The tuple C{(width, height)}
        '''
        ts = tb.textStyle.attrs
        default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
        parindent = self.pts_to_pixels(ts['parindent'])
        ls, ws = self.pts_to_pixels(ts['linespace']), self.pts_to_pixels(ts['wordspace'])
        top, bottom, left, right = 0, 0, parindent, parindent

        def add_word(width, height, left, right, top, bottom):
            if left + width > maxwidth:
                # The word does not fit, start a new line with it
                left = width + ws
                top += height + ls
            else:
                left += (width + ws)
                right = max(left, right)
            bottom = max(bottom, top+height)
            return left, right, top, bottom

        for token, attrs in tokens(tb):
            font = default_font
            if isinstance(token, int): # Handle para and line breaks
                top = bottom
                # Token 1 is a CR between paragraphs, so the new line is
                # indented. Any other int is a line break inside a paragraph.
                left = parindent if token == 1 else 0
                continue
            if isinstance(token, Plot):
                width, height = self.pts_to_pixels(token.xsize), self.pts_to_pixels(token.ysize)
                left, right, top, bottom = add_word(width, height, left, right, top, bottom)
                continue
            ff = attrs.get('fontfacename', ts['fontfacename'])
            fs = attrs.get('fontsize', ts['fontsize'])
            if (ff, fs) != (ts['fontfacename'], ts['fontsize']):
                font = get_font(ff, self.pts_to_pixels(fs))
            for word in token.split():
                width, height = font.getsize(word)
                left, right, top, bottom = add_word(width, height, left, right, top, bottom)
        return right+3, bottom

    def text_block_preferred_width(self, tb, debug=False):
        '''Return the width of C{tb} when no wrapping takes place.'''
        return self.text_block_size(tb, sys.maxint, debug=debug)[0]

    def preferred_width(self, debug=False):
        '''Return the largest preferred width of the text blocks in this cell.'''
        return ceil(max([self.text_block_preferred_width(i, debug=debug) for i in self.text_blocks]))

    def height(self, width):
        '''Return the sum of the heights of the text blocks when wrapped at C{width}.'''
        return sum([self.text_block_size(i, width)[1] for i in self.text_blocks])
class Row(object):
    '''
    A table row (<tr>). Holds one L{Cell} for every <td> found in the row.
    '''

    def __init__(self, conv, row, css, colpad):
        '''
        @param conv: The converter used to process the cells
        @param row: The <tr> tag
        @param css: CSS that applies to the row
        @param colpad: Padding between columns
        '''
        self.cells = []
        self.colpad = colpad
        for cell in row.findAll('td'):
            ccss = conv.tag_css(cell, css)
            self.cells.append(Cell(conv, cell, ccss))

    def number_of_cells(self):
        '''Number of cells in this row. Respects colspan'''
        return sum(cell.colspan for cell in self.cells)

    def height(self, widths):
        '''
        Return the height of the tallest cell in this row, or 0 if the row
        has no cells.

        @param widths: The widths of the columns of the table
        '''
        start, heights = 0, []
        for cell in self.cells:
            end = start + cell.colspan
            heights.append(cell.height(sum(widths[start:end])))
            start = end
        return max(heights) if heights else 0

    def preferred_width(self, col):
        '''
        Return the preferred width of the cell that occupies column C{col}.
        Cells that span more than one column report a width of 0.

        @raise IndexError: If no cell in this row occupies column C{col}
        '''
        start = 0
        for cell in self.cells:
            if start <= col < start + cell.colspan:
                return 0 if cell.colspan > 1 else cell.preferred_width()
            start += cell.colspan
        # Do not fall back to the last cell: rows that are shorter than the
        # table must not contribute a width to the columns they lack.
        raise IndexError('No cell in column %d' % col)

    def cell_iterator(self):
        '''Iterate over the cells in this row, in order.'''
        for cell in self.cells:
            yield cell
class Table(object):
    '''
    A table (<table>). Holds one L{Row} for every <tr> found in the table
    and computes the placement of the text blocks of the cells.
    '''

    def __init__(self, conv, table, css, rowpad=10, colpad=10):
        '''
        @param conv: The converter used to process the cells
        @param table: The <table> tag
        @param css: CSS that applies to the table
        @param rowpad: Padding between rows
        @param colpad: Padding between columns
        '''
        self.rows = []
        self.conv = conv
        self.rowpad = rowpad
        self.colpad = colpad
        for tr in table.findAll('tr'):
            rcss = conv.tag_css(tr, css)
            self.rows.append(Row(conv, tr, rcss, colpad))

    def number_of_columns(self):
        '''Return the largest number of cells found in any row (0 if there are no rows).'''
        widest = 0
        for row in self.rows:
            ncells = row.number_of_cells()
            if ncells > widest:
                widest = ncells
        return widest

    def number_or_rows(self):
        '''Return the number of rows in this table.'''
        return len(self.rows)

    def height(self, maxwidth):
        ''' Return row heights + self.rowpad'''
        widths = self.get_widths(maxwidth)
        padded = [row.height(widths) + self.rowpad for row in self.rows]
        # There is no padding after the last row
        return sum(padded) - self.rowpad

    def get_widths(self, maxwidth):
        '''
        Return widths of columns + self.colpad

        If the columns do not fit into C{maxwidth}, all of them are shrunk
        by 5% at a time, at most 100 times.
        '''
        def column_width(col):
            # Rows that have no cell in this column contribute a width of 0
            candidates = []
            for row in self.rows:
                try:
                    candidates.append(row.preferred_width(col))
                except IndexError:
                    candidates.append(0)
            return max(candidates)

        widths = [column_width(c) for c in range(self.number_of_columns())]
        attempts = 0
        while attempts < 100 and \
                sum(widths) > maxwidth-((len(widths)-1)*self.colpad):
            widths = [ceil((95./100.)*w) for w in widths]
            attempts += 1
        return [w+self.colpad for w in widths]

    def blocks(self, maxwidth):
        '''
        Generate the placement of the contents of this table as tuples of
        the form C{(block, xpos, ypos, delta)}.

        For every row the tuple C{(None, 0, row height, 0)} is generated
        first. It is followed by one tuple per text block in the row, where
        C{xpos} is the left edge of the column of the cell, C{ypos} the
        vertical offset of the block within its cell and C{delta} the
        horizontal space left over after laying out all columns (never
        negative). The block style of every text block is replaced by a new
        one that has the computed width and height.
        '''
        rows, cols = self.number_or_rows(), self.number_of_columns()
        cellmatrix = [[None for _c in range(cols)] for _r in range(rows)]
        rowpos = [0 for _r in range(rows)]
        for r in range(rows):
            for cell in self.rows[r].cell_iterator():
                cellmatrix[r][rowpos[r]] = cell
                rowpos[r] += cell.colspan
                # Cells that span rows shift the cells of the rows below
                for k in range(1, cell.rowspan):
                    try:
                        rowpos[r+k] += 1
                    except IndexError:
                        break

        widths = self.get_widths(maxwidth)
        heights = [row.height(widths) for row in self.rows]
        xpos = [sum(widths[:i]) for i in range(cols)]
        delta = max(maxwidth - sum(widths), 0)
        for r, matrix_row in enumerate(cellmatrix):
            yield None, 0, heights[r], 0
            for c, cell in enumerate(matrix_row):
                if not cell:
                    continue
                width = sum(widths[c:c+cell.colspan])
                sypos = 0
                for tb in cell.text_blocks:
                    block_height = cell.text_block_size(tb, width)[1]
                    tb.blockStyle = self.conv.book.create_block_style(
                                        blockwidth=width,
                                        blockheight=block_height)
                    yield tb, xpos[c], sypos, delta
                    sypos += tb.blockStyle.attrs['blockheight']

View File

@ -94,7 +94,12 @@ class xml_attr_field(object):
def __get__(self, obj, typ=None):
""" Return the data in this field or '' if the field is empty """
document = dom.parseString(obj.info)
try:
document = dom.parseString(obj.info)
except Exception, err:
print >>sys.stderr, "Could not parse XML:", err
print obj.info
raise
elems = document.getElementsByTagName(self.tag_name)
if len(elems):
elem = None
@ -108,7 +113,12 @@ class xml_attr_field(object):
def __set__(self, obj, val):
if val == None:
val = ""
document = dom.parseString(obj.info)
try:
document = dom.parseString(obj.info)
except Exception, err:
print >>sys.stderr, "Could not parse XML:", err
print obj.info
raise
elems = document.getElementsByTagName(self.tag_name)
if len(elems):
elem = None
@ -142,7 +152,13 @@ class xml_field(object):
def __get__(self, obj, typ=None):
""" Return the data in this field or '' if the field is empty """
document = dom.parseString(obj.info)
try:
document = dom.parseString(obj.info)
except Exception, err:
print >>sys.stderr, "Could not parse XML:", err
print obj.info
raise
elems = document.getElementsByTagName(self.tag_name)
if len(elems):
elem = None
@ -158,7 +174,12 @@ class xml_field(object):
def __set__(self, obj, val):
if val == None:
val = ""
document = dom.parseString(obj.info)
try:
document = dom.parseString(obj.info)
except Exception, err:
print >>sys.stderr, "Could not parse XML:", err
print obj.info
raise
def create_elem():
elem = document.createElement(self.tag_name)
elem.appendChild(dom.Text())

View File

@ -56,6 +56,8 @@ DEFAULT_GENREADING = "fs" # default is yes to both lrf and lrs
class LrsError(Exception):
    """ Error raised by this module, e.g. for an invalid TOC destination. """
class ContentError(Exception):
    """ Raised when a page style is requested while new pages are not allowed. """
def _checkExists(filename):
if not os.path.exists(filename):
@ -435,6 +437,8 @@ class Book(Delegator):
self.applySetting("sourceencoding", DEFAULT_SOURCE_ENCODING)
self.applySettings(settings, testValid=True)
self.allow_new_page = True #: If False L{create_page} raises an exception
def create_text_style(self, **settings):
ans = TextStyle(**self.defaultTextStyle.attrs.copy())
@ -447,6 +451,8 @@ class Book(Delegator):
return ans
def create_page_style(self, **settings):
    """
    Return a new PageStyle based on the default page style of this book,
    updated with C{settings}.

    @raise ContentError: If C{self.allow_new_page} is False
    """
    if not self.allow_new_page:
        raise ContentError
    defaults = self.defaultPageStyle.attrs.copy()
    style = PageStyle(**defaults)
    style.update(settings)
    return style
@ -641,12 +647,15 @@ class TableOfContents(object):
raise LrsError, "TOC destination must be a TextBlock, ImageBlock or RuledLine"+\
" not a " + str(type(textBlock))
if textBlock.parent is None or not isinstance(textBlock.parent, Page):
if textBlock.parent is None:
raise LrsError, "TOC text block must be already appended to a page"
if textBlock.parent.parent is None:
raise LrsError, \
"TOC destination page must be already appended to a book"
if not hasattr(textBlock.parent, 'objId'):
raise LrsError, "TOC destination must be appended to a container with an objID"
self.tocEntries.append(TocLabel(tocLabel, textBlock))
textBlock.tocLabel = tocLabel
@ -1373,7 +1382,6 @@ class TextBlock(LrsObject, LrsContainer):
self.textSettings = {}
self.blockSettings = {}
for name, value in settings.items():
if name in TextStyle.validSettings:
@ -1428,7 +1436,6 @@ class TextBlock(LrsObject, LrsContainer):
tb.append(content.toElement(sourceEncoding))
return tb
def getReferencedObjIds(self):
ids = [self.objId, self.extraId, self.blockStyle.objId,
@ -2111,7 +2118,7 @@ class PutObj(LrsContainer):
self.y1 = int(y)
def appendReferencedObjects(self, parent):
def appendReferencedObjects(self, parent):
if self.content.parent is None:
parent.append(self.content)

View File

@ -17,6 +17,7 @@ Convert .txt files to .lrf
"""
import os, sys
from libprs500.ebooks import BeautifulSoup
from libprs500.ebooks.lrf import ConversionError, option_parser
from libprs500.ebooks.lrf import Book
from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, Italic, Bold, BookSetting
@ -63,7 +64,7 @@ def convert_txt(path, options):
C{author}, C{title}, C{encoding} (the assumed encoding of
the text in C{path}.)
"""
import fileinput
import codecs
header = None
if options.header:
header = Paragraph()
@ -84,7 +85,19 @@ def convert_txt(path, options):
block = book.create_text_block()
pg.append(block)
book.append(pg)
for line in fileinput.input(path):
lines = ""
try:
lines = codecs.open(path, 'rb', 'ascii').readlines()
print 'huh'
except UnicodeDecodeError:
try:
lines = codecs.open(path, 'rb', 'cp1252').readlines()
except UnicodeDecodeError:
try:
lines = codecs.open(path, 'rb', 'iso-8859-1').readlines()
except UnicodeDecodeError:
lines = codecs.open(path, 'rb', 'utf8').readlines()
for line in lines:
line = line.strip()
if line:
buffer = buffer.rstrip() + ' ' + line