diff --git a/src/libprs500/__init__.py b/src/libprs500/__init__.py
index f2968b840e..f59403d9f0 100644
--- a/src/libprs500/__init__.py
+++ b/src/libprs500/__init__.py
@@ -33,7 +33,7 @@ You may have to adjust the GROUP and the location of the rules file to
suit your distribution.
"""
-__version__ = "0.3.32"
+__version__ = "0.3.33"
__docformat__ = "epytext"
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
- This file contains a demonstration of the capabilities of html2lrf, the HTML to LRF converter from libprs500. To obtain libprs500 visit https://libprs500.kovidgoyal.net
+ This file contains a demonstration of the capabilities of html2lrf, the HTML to LRF converter from libprs500. To obtain libprs500 visit https://libprs500.kovidgoyal.net
@@ -478,6 +483,15 @@ class HTMLConverter(object):
return text
def process_links(self):
+        def add_toc_entry(text, target):
+            '''
+            Add a Table of Contents entry labelled C{text} pointing at
+            C{target}. If C{target} has no parent, or its parent has no
+            C{objId}, nothing is added; a message is printed instead when
+            C{self.verbose} is set.
+            '''
+            # TextBlocks in Canvases have a None parent or an Objects Parent
+            if target.parent is not None and \
+               hasattr(target.parent, 'objId'):
+                # Use the arguments, not whatever the caller happened to name
+                # its own local variables.
+                self.book.addTocEntry(text, target)
+            elif self.verbose:
+                print "Cannot add link", text, "to TOC"
+
+
def get_target_block(fragment, targets):
'''Return the correct block for the element'''
bs = targets[fragment]
@@ -535,7 +549,7 @@ class HTMLConverter(object):
if fragment in self.targets.keys():
tb = get_target_block(fragment, self.targets)
if self.is_root:
- self.book.addTocEntry(ascii_text, tb)
+ add_toc_entry(ascii_text, tb)
sys.stdout.flush()
jb = JumpButton(tb)
self.book.append(jb)
@@ -580,7 +594,7 @@ class HTMLConverter(object):
else:
tb = conv.top
if self.is_root:
- self.book.addTocEntry(ascii_text, tb)
+ add_toc_entry(ascii_text, tb)
jb = JumpButton(tb)
self.book.append(jb)
cb = CharButton(jb, text=text)
@@ -727,22 +741,32 @@ class HTMLConverter(object):
blockStyle=self.current_block.blockStyle)
def process_image(self, path, tag_css, width=None, height=None):
+ if self.rotated_images.has_key(path):
+ path = self.rotated_images[path].name
+ if self.scaled_images.has_key(path):
+ path = self.scaled_images[path].name
+
+ im = PILImage.open(path)
+
+ if width == None or height == None:
+ width, height = im.size
+
def scale_image(width, height):
pt = PersistentTemporaryFile(suffix='.jpeg')
im.resize((int(width), int(height)), PILImage.ANTIALIAS).convert('RGB').save(pt, 'JPEG')
pt.close()
self.scaled_images[path] = pt
return pt.name
-
- if self.scaled_images.has_key(path):
- path = self.scaled_images[path].name
-
- im = PILImage.open(path)
- if width == None or height == None:
- width, height = im.size
- if width > height:
+
+ if width > self.profile.page_width and width > height:
+ pt = PersistentTemporaryFile(suffix='.jpeg')
im = im.rotate(-90)
+ im.convert('RGB').save(pt, 'JPEG')
+ path = pt.name
+ pt.close()
+ self.rotated_images[path] = pt
width, height = im.size
+
if height > self.profile.page_height:
corrf = self.profile.page_height/(1.*height)
width, height = floor(corrf*width), self.profile.page_height-1
@@ -788,7 +812,7 @@ class HTMLConverter(object):
self.end_page()
self.current_page.append(Canvas(width=self.profile.page_width,
height=height))
- left = int(floor((self.profile.page_width - width)/2.))
+ left = int(floor((self.profile.page_width - width)/2.))
self.current_page.contents[0].put_object(ImageBlock(self.images[path]),
left, 0)
@@ -824,6 +848,18 @@ class HTMLConverter(object):
pass
elif tagname == 'a' and self.max_link_levels >= 0:
if tag.has_key('name'):
+ if self.anchor_to_previous:
+ self.process_children(tag, tag_css)
+ return
+ for c in self.anchor_to_previous.contents:
+ if isinstance(c, (TextBlock, ImageBlock)):
+ self.targets[tag['name']] = c
+ return
+ tb = self.book.create_text_block()
+ tb.Paragraph(" ")
+ self.anchor_to_previous.append(tb)
+ self.targets[tag['name']] = tb
+ return
previous = self.current_block
self.process_children(tag, tag_css)
target = None
@@ -867,7 +903,7 @@ class HTMLConverter(object):
['png', 'jpg', 'bmp', 'jpeg']:
self.process_image(path, tag_css)
else:
- self.add_text('Link: ' + tag['href'], tag_css)
+ self.add_text(self.get_text(tag), tag_css)
self.links.append(HTMLConverter.Link(self.current_para.contents[-1], tag))
elif tagname == 'img':
if tag.has_key('src') and os.access(unquote(tag['src']), os.R_OK):
@@ -1010,30 +1046,45 @@ class HTMLConverter(object):
if tag.has_key('face'):
tag_css['font-family'] = tag['face']
self.process_children(tag, tag_css)
- elif tagname in ['br', 'tr']:
+ elif tagname in ['br']:
self.current_para.append(CR())
- self.process_children(tag, tag_css)
- elif tagname in ['td']:
- self.current_para.append(' ')
- self.process_children(tag, tag_css)
elif tagname == 'hr':
self.end_current_para()
self.current_block.append(CR())
self.end_current_block()
self.current_page.RuledLine(linelength=self.profile.page_width)
+ elif tagname == 'table':
+ tag_css = self.tag_css(tag) # Table should not inherit CSS
+ self.process_table(tag, tag_css)
else:
- self.process_children(tag, tag_css)
-
+ self.process_children(tag, tag_css)
if end_page:
self.end_page()
+    def process_table(self, tag, tag_css):
+        '''
+        Lay out the table C{tag} and append it to the current page as a
+        sequence of fixed Canvases, one Canvas per table row.
+
+        @param tag: The table tag to convert
+        @param tag_css: CSS to apply to the table
+        '''
+        self.end_current_block()
+        rowpad, colpad = 10, 10
+        table = Table(self, tag, tag_css, rowpad=rowpad, colpad=colpad)
+        canvases = []
+        for block, xpos, ypos, delta in table.blocks(self.profile.page_width):
+            if not block:
+                # Start of a new row: ypos is the height of the row, to which
+                # the vertical (row) padding is added.
+                canvases.append(Canvas(self.profile.page_width, ypos+rowpad,
+                                       blockrule='block-fixed'))
+            else:
+                # ypos is the offset of this block inside its cell. It must be
+                # honoured, otherwise several blocks in one cell are drawn on
+                # top of each other. delta is the unused horizontal space and
+                # is split evenly to centre the table.
+                canvases[-1].put_object(block, xpos + int(delta/2.), ypos)
+
+        for canvas in canvases:
+            self.current_page.append(canvas)
+        self.end_current_block()
+
+
def writeto(self, path, lrs=False):
self.book.renderLrs(path) if lrs else self.book.renderLrf(path)
def cleanup(self):
- for _file in self.scaled_images.values():
+ for _file in self.scaled_images.values() + self.rotated_images.values():
_file.__del__()
-
+
def process_file(path, options):
cwd = os.getcwd()
dirpath = None
@@ -1070,7 +1121,7 @@ def process_file(path, options):
tim.save(tf.name)
tpath = tf.name
else:
- raise ConversionError, 'Cannot read from: %s', (options.cover,)
+ raise ConversionError, 'Cannot read from: %s'% (options.cover,)
if not options.title:
diff --git a/src/libprs500/ebooks/lrf/html/demo/demo.html b/src/libprs500/ebooks/lrf/html/demo/demo.html
index 53050dc65e..027c73a962 100644
--- a/src/libprs500/ebooks/lrf/html/demo/demo.html
+++ b/src/libprs500/ebooks/lrf/html/demo/demo.html
@@ -2,20 +2,23 @@
Demo of html2lrf
https://libprs500.kovidgoyal.net
Table of Contents
-
Lists
@@ -40,6 +43,53 @@
Table of Contents
+ Because I can! +
+A matrix | |||
Column 1 | Column 2 | Column 3 | |
Row 1 | (1, 1) | ||
Row 2 | (2, 2) | ||
Row 3 | (3, 3) |
+ html2lrf supports both rowspan and colspan, but no other HTML table attributes, as it uses its own algorithm to determine optimal placement of cells. +
+
+ The table conversion code is very new and likely to be swarming with bugs, so please report them at https://libprs500.kovidgoyal.net
+ On the next page you'll see a real life example taken from a Project Gutenberg text with no modifications. It shows off html2lrf's handling of rowspan and colspan.
+
PAGE | |||
Preface | v | ||
List of Works of Reference | vii | ||
List of Illustrations | xi | ||
Chapter | I. | History of the Foundation | 3 |
II. | Exterior of the Church | 25 | |
III. | Interior of the Church | 33 | |
IV. | St. Bartholomew-the-Less and the Hospital | 63 | |
Appendix | I. | The Priory Seals | 73 |
II. | The Priors and Rectors | 77 | |
III. | Inventory of Vestments, etc. | 79 | |
IV. | The Organ | 80 | |
Index | 83 |
+
A simple paragraph of formatted diff --git a/src/libprs500/ebooks/lrf/html/table.py b/src/libprs500/ebooks/lrf/html/table.py new file mode 100644 index 0000000000..7cfeecb963 --- /dev/null +++ b/src/libprs500/ebooks/lrf/html/table.py @@ -0,0 +1,306 @@ +## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +import math, sys + +from libprs500.ebooks.lrf.fonts import get_font +from libprs500.ebooks.lrf.pylrs.pylrs import TextBlock, Text, CR, Span, \ + CharButton, Plot, Paragraph, \ + LrsTextTag + +def ceil(num): + return int(math.ceil(num)) + +def print_xml(elem): + from libprs500.ebooks.lrf.pylrs.pylrs import ElementWriter + elem = elem.toElement('utf8') + ew = ElementWriter(elem, sourceEncoding='utf8') + ew.write(sys.stdout) + print + +def cattrs(base, extra): + new = base.copy() + new.update(extra) + return new + +def tokens(tb): + ''' + Return the next token. A token is : + 1. 
A string + a block of text that has the same style + ''' + def process_element(x, attrs): + if isinstance(x, CR): + yield 2, None + elif isinstance(x, Text): + yield x.text, cattrs(attrs, {}) + elif isinstance(x, basestring): + yield x, cattrs(attrs, {}) + elif isinstance(x, (CharButton, LrsTextTag)): + if x.contents: + yield x.contents[0].text, cattrs(attrs, {}) + elif isinstance(x, Plot): + yield x, None + elif isinstance(x, Span): + attrs = cattrs(attrs, x.attrs) + for y in x.contents: + for z in process_element(y, attrs): + yield z + + + for i in tb.contents: + if isinstance(i, CR): + yield 1, None + elif isinstance(i, Paragraph): + for j in i.contents: + attrs = {} + if hasattr(j, 'attrs'): + attrs = j.attrs + for k in process_element(j, attrs): + yield k + + +class Cell(object): + + def __init__(self, conv, cell, css): + self.conv = conv + self.cell = cell + self.css = css + self.text_blocks = [] + self.rowspan = self.colspan = 1 + try: + self.colspan = int(cell['colspan']) if cell.has_key('colspan') else 1 + self.rowspan = int(cell['rowspan']) if cell.has_key('rowspan') else 1 + except: + if conv.verbose: + print >>sys.stderr, "Error reading row/colspan for ", cell + + pp = conv.current_page + conv.book.allow_new_page = False + conv.anchor_to_previous = pp + conv.current_page = conv.book.create_page() + conv.parse_tag(cell, css) + conv.end_current_block() + for item in conv.current_page.contents: + if isinstance(item, TextBlock): + self.text_blocks.append(item) + conv.current_page = pp + conv.book.allow_new_page = True + conv.anchor_to_previous = None + if not self.text_blocks: + tb = conv.book.create_text_block() + tb.Paragraph(' ') + self.text_blocks.append(tb) + for tb in self.text_blocks: + tb.parent = None + tb.objId = 0 + # Needed as we have to eventually change this BlockStyle's width and + # height attributes. This blockstyle may be shared with other + # elements, so doing that causes havoc. 
+            tb.blockStyle = conv.book.create_block_style()
+            ts = conv.book.create_text_style(**tb.textStyle.attrs)
+            ts.attrs['parindent'] = 0
+            tb.textStyle = ts
+            if ts.attrs['align'] == 'foot':
+                if isinstance(tb.contents[-1], Paragraph):
+                    tb.contents[-1].append(' ')
+
+
+
+
+    def pts_to_pixels(self, pts):
+        '''
+        Convert C{pts} to a whole number of pixels (rounded up) using the
+        dpi of the conversion profile. C{pts} is divided by 10 before the
+        conversion.
+        '''
+        pts = int(pts)
+        return ceil((float(self.conv.profile.dpi)/72)*(pts/10.))
+
+    def text_block_size(self, tb, maxwidth=sys.maxint, debug=False):
+        '''
+        Estimate the size of the text block C{tb} when its lines are
+        wrapped at C{maxwidth}.
+
+        @param tb: The text block to measure
+        @param maxwidth: Width at which a word is moved to a new line
+        @param debug: Unused by this method
+        @return: The tuple (width, height)
+        '''
+        ts = tb.textStyle.attrs
+        default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
+        parindent = self.pts_to_pixels(ts['parindent'])
+        ls, ws = self.pts_to_pixels(ts['linespace']), self.pts_to_pixels(ts['wordspace'])
+        top, bottom, left, right = 0, 0, parindent, parindent
+
+        def add_word(width, height, left, right, top, bottom):
+            # Place one word (or plot) of the given size and return the
+            # updated cursor (left, top) and extent (right, bottom).
+            if left + width > maxwidth:
+                left = width + ws
+                top += height + ls
+                bottom = top+height if top+height > bottom else bottom
+            else:
+                left += (width + ws)
+                right = left if left > right else right
+                bottom = top+height if top+height > bottom else bottom
+            return left, right, top, bottom
+
+        for token, attrs in tokens(tb):
+            font = default_font
+            if isinstance(token, int): # Handle para and line breaks
+                top = bottom
+                # tokens() yields 1 for a CR directly in the text block and 2
+                # for a CR inside a paragraph. Test the token itself: the
+                # builtin type int never equals 1.
+                left = parindent if token == 1 else 0
+                continue
+            if isinstance(token, Plot):
+                width, height = self.pts_to_pixels(token.xsize), self.pts_to_pixels(token.ysize)
+                left, right, top, bottom = add_word(width, height, left, right, top, bottom)
+                continue
+            ff = attrs.get('fontfacename', ts['fontfacename'])
+            fs = attrs.get('fontsize', ts['fontsize'])
+            if (ff, fs) != (ts['fontfacename'], ts['fontsize']):
+                # Only look up another font when this token's style differs
+                # from the style of the block
+                font = get_font(ff, self.pts_to_pixels(fs))
+            for word in token.split():
+                width, height = font.getsize(word)
+                left, right, top, bottom = add_word(width, height, left, right, top, bottom)
+        return right+3, bottom
+
+    def text_block_preferred_width(self, tb, debug=False):
+        '''Return the width of C{tb} when it is laid out without wrapping.'''
+        return self.text_block_size(tb, sys.maxint, debug=debug)[0]
+
+    def 
preferred_width(self, debug=False): + return ceil(max([self.text_block_preferred_width(i, debug=debug) for i in self.text_blocks])) + + def height(self, width): + return sum([self.text_block_size(i, width)[1] for i in self.text_blocks]) + + + +class Row(object): + def __init__(self, conv, row, css, colpad): + self.cells = [] + self.colpad = colpad + cells = row.findAll('td') + for cell in cells: + ccss = conv.tag_css(cell, css) + self.cells.append(Cell(conv, cell, ccss)) + + def number_of_cells(self): + '''Number of cells in this row. Respects colspan''' + ans = 0 + for cell in self.cells: + ans += cell.colspan + return ans + + def height(self, widths): + i, heights = 0, [] + for cell in self.cells: + width = sum(widths[i:i+cell.colspan]) + heights.append(cell.height(width)) + i += cell.colspan + return max(heights) + + def preferred_width(self, col): + i = -1 + cell = None + for cell in self.cells: + for k in range(0, cell.colspan): + if i == col: + break + i += 1 + if i == col: + break + + return 0 if cell.colspan > 1 else cell.preferred_width() + + def cell_iterator(self): + for c in self.cells: + yield c + + +class Table(object): + def __init__(self, conv, table, css, rowpad=10, colpad=10): + self.rows = [] + self.conv = conv + self.rowpad = rowpad + self.colpad = colpad + rows = table.findAll('tr') + for row in rows: + rcss = conv.tag_css(row, css) + self.rows.append(Row(conv, row, rcss, colpad)) + + def number_of_columns(self): + max = 0 + for row in self.rows: + max = row.number_of_cells() if row.number_of_cells() > max else max + return max + + def number_or_rows(self): + return len(self.rows) + + def height(self, maxwidth): + ''' Return row heights + self.rowpad''' + widths = self.get_widths(maxwidth) + return sum([row.height(widths) + self.rowpad for row in self.rows]) - self.rowpad + + def get_widths(self, maxwidth): + ''' + Return widths of columns + sefl.colpad + ''' + rows, cols = self.number_or_rows(), self.number_of_columns() + widths = range(cols) 
+ for c in range(cols): + cellwidths = [ 0 for i in range(rows)] + for r in range(rows): + try: + cellwidths[r] = self.rows[r].preferred_width(c) + except IndexError: + continue + widths[c] = max(cellwidths) + itercount = 0 + while sum(widths) > maxwidth-((len(widths)-1)*self.colpad) and itercount < 100: + widths = [ceil((95./100.)*w) for w in widths] + itercount += 1 + return [i+self.colpad for i in widths] + + def blocks(self, maxwidth): + rows, cols = self.number_or_rows(), self.number_of_columns() + cellmatrix = [[None for c in range(cols)] for r in range(rows)] + rowpos = [0 for i in range(rows)] + for r in range(rows): + nc = self.rows[r].cell_iterator() + try: + while True: + cell = nc.next() + cellmatrix[r][rowpos[r]] = cell + rowpos[r] += cell.colspan + for k in range(1, cell.rowspan): + try: + rowpos[r+k] += 1 + except IndexError: + break + except StopIteration: # No more cells in this row + continue + + + widths = self.get_widths(maxwidth) + heights = [row.height(widths) for row in self.rows] + + xpos = [sum(widths[:i]) for i in range(cols)] + delta = maxwidth - sum(widths) + if delta < 0: + delta = 0 + for r in range(len(cellmatrix)): + yield None, 0, heights[r], 0 + for c in range(len(cellmatrix[r])): + cell = cellmatrix[r][c] + if not cell: + continue + width = sum(widths[c:c+cell.colspan]) + sypos = 0 + for tb in cell.text_blocks: + tb.blockStyle = self.conv.book.create_block_style( + blockwidth=width, + blockheight=cell.text_block_size(tb, width)[1]) + + yield tb, xpos[c], sypos, delta + sypos += tb.blockStyle.attrs['blockheight'] + + + + \ No newline at end of file diff --git a/src/libprs500/ebooks/lrf/meta.py b/src/libprs500/ebooks/lrf/meta.py index 550ab6d592..d2d5c37445 100644 --- a/src/libprs500/ebooks/lrf/meta.py +++ b/src/libprs500/ebooks/lrf/meta.py @@ -94,7 +94,12 @@ class xml_attr_field(object): def __get__(self, obj, typ=None): """ Return the data in this field or '' if the field is empty """ - document = dom.parseString(obj.info) + try: 
+ document = dom.parseString(obj.info) + except Exception, err: + print >>sys.stderr, "Could not parse XML:", err + print obj.info + raise elems = document.getElementsByTagName(self.tag_name) if len(elems): elem = None @@ -108,7 +113,12 @@ class xml_attr_field(object): def __set__(self, obj, val): if val == None: val = "" - document = dom.parseString(obj.info) + try: + document = dom.parseString(obj.info) + except Exception, err: + print >>sys.stderr, "Could not parse XML:", err + print obj.info + raise elems = document.getElementsByTagName(self.tag_name) if len(elems): elem = None @@ -142,7 +152,13 @@ class xml_field(object): def __get__(self, obj, typ=None): """ Return the data in this field or '' if the field is empty """ - document = dom.parseString(obj.info) + try: + document = dom.parseString(obj.info) + except Exception, err: + print >>sys.stderr, "Could not parse XML:", err + print obj.info + raise + elems = document.getElementsByTagName(self.tag_name) if len(elems): elem = None @@ -158,7 +174,12 @@ class xml_field(object): def __set__(self, obj, val): if val == None: val = "" - document = dom.parseString(obj.info) + try: + document = dom.parseString(obj.info) + except Exception, err: + print >>sys.stderr, "Could not parse XML:", err + print obj.info + raise def create_elem(): elem = document.createElement(self.tag_name) elem.appendChild(dom.Text()) diff --git a/src/libprs500/ebooks/lrf/pylrs/pylrs.py b/src/libprs500/ebooks/lrf/pylrs/pylrs.py index 63f252cefd..4a43e10e83 100644 --- a/src/libprs500/ebooks/lrf/pylrs/pylrs.py +++ b/src/libprs500/ebooks/lrf/pylrs/pylrs.py @@ -56,6 +56,8 @@ DEFAULT_GENREADING = "fs" # default is yes to both lrf and lrs class LrsError(Exception): pass +class ContentError(Exception): + pass def _checkExists(filename): if not os.path.exists(filename): @@ -435,6 +437,8 @@ class Book(Delegator): self.applySetting("sourceencoding", DEFAULT_SOURCE_ENCODING) self.applySettings(settings, testValid=True) + + self.allow_new_page = True #: 
If False L{create_page} raises an exception def create_text_style(self, **settings): ans = TextStyle(**self.defaultTextStyle.attrs.copy()) @@ -447,6 +451,8 @@ class Book(Delegator): return ans def create_page_style(self, **settings): + if not self.allow_new_page: + raise ContentError ans = PageStyle(**self.defaultPageStyle.attrs.copy()) ans.update(settings) return ans @@ -641,12 +647,15 @@ class TableOfContents(object): raise LrsError, "TOC destination must be a TextBlock, ImageBlock or RuledLine"+\ " not a " + str(type(textBlock)) - if textBlock.parent is None or not isinstance(textBlock.parent, Page): + if textBlock.parent is None: raise LrsError, "TOC text block must be already appended to a page" if textBlock.parent.parent is None: raise LrsError, \ "TOC destination page must be already appended to a book" + + if not hasattr(textBlock.parent, 'objId'): + raise LrsError, "TOC destination must be appended to a container with an objID" self.tocEntries.append(TocLabel(tocLabel, textBlock)) textBlock.tocLabel = tocLabel @@ -1373,7 +1382,6 @@ class TextBlock(LrsObject, LrsContainer): self.textSettings = {} self.blockSettings = {} - for name, value in settings.items(): if name in TextStyle.validSettings: @@ -1428,7 +1436,6 @@ class TextBlock(LrsObject, LrsContainer): tb.append(content.toElement(sourceEncoding)) return tb - def getReferencedObjIds(self): ids = [self.objId, self.extraId, self.blockStyle.objId, @@ -2111,7 +2118,7 @@ class PutObj(LrsContainer): self.y1 = int(y) - def appendReferencedObjects(self, parent): + def appendReferencedObjects(self, parent): if self.content.parent is None: parent.append(self.content) diff --git a/src/libprs500/ebooks/lrf/txt/convert_from.py b/src/libprs500/ebooks/lrf/txt/convert_from.py index e5972a0db9..94f61a0978 100644 --- a/src/libprs500/ebooks/lrf/txt/convert_from.py +++ b/src/libprs500/ebooks/lrf/txt/convert_from.py @@ -17,6 +17,7 @@ Convert .txt files to .lrf """ import os, sys +from libprs500.ebooks import BeautifulSoup 
from libprs500.ebooks.lrf import ConversionError, option_parser from libprs500.ebooks.lrf import Book from libprs500.ebooks.lrf.pylrs.pylrs import Paragraph, Italic, Bold, BookSetting @@ -63,7 +64,7 @@ def convert_txt(path, options): C{author}, C{title}, C{encoding} (the assumed encoding of the text in C{path}.) """ - import fileinput + import codecs header = None if options.header: header = Paragraph() @@ -84,7 +85,19 @@ def convert_txt(path, options): block = book.create_text_block() pg.append(block) book.append(pg) - for line in fileinput.input(path): + lines = "" + try: + lines = codecs.open(path, 'rb', 'ascii').readlines() + print 'huh' + except UnicodeDecodeError: + try: + lines = codecs.open(path, 'rb', 'cp1252').readlines() + except UnicodeDecodeError: + try: + lines = codecs.open(path, 'rb', 'iso-8859-1').readlines() + except UnicodeDecodeError: + lines = codecs.open(path, 'rb', 'utf8').readlines() + for line in lines: line = line.strip() if line: buffer = buffer.rstrip() + ' ' + line