From d6c08b7da336a7f1399261b01be3f37f49d9a3e2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 21 Aug 2007 23:33:33 +0000
Subject: [PATCH] Refinements for wasteland
---
src/libprs500/ebooks/lrf/html/convert_from.py | 86 +++++++++----------
src/libprs500/ebooks/lrf/html/demo/demo.html | 35 ++++----
src/libprs500/ebooks/lrf/html/table.py | 10 ++-
src/libprs500/ebooks/lrf/pylrs/pylrs.py | 6 +-
4 files changed, 63 insertions(+), 74 deletions(-)
diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py
index d8cee34e2d..765ff8eed5 100644
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@@ -293,12 +293,13 @@ class HTMLConverter(object):
if self.pseudo_css.has_key(tagname):
pprop.update(self.pseudo_css[tagname])
if tag.has_key("class"):
- cls = tag["class"].lower()
- for classname in ["."+cls, tagname+"."+cls]:
- if self.css.has_key(classname):
- prop.update(self.css[classname])
- if self.pseudo_css.has_key(classname):
- pprop.update(self.pseudo_css[classname])
+ cls = tag["class"].lower()
+ for cls in cls.split():
+ for classname in ["."+cls, tagname+"."+cls]:
+ if self.css.has_key(classname):
+ prop.update(self.css[classname])
+ if self.pseudo_css.has_key(classname):
+ pprop.update(self.pseudo_css[classname])
if tag.has_key("style"):
prop.update(self.parse_style_properties(tag["style"]))
return prop, pprop
@@ -625,7 +626,6 @@ class HTMLConverter(object):
unneeded.append(prop)
for prop in unneeded:
fp.pop(prop)
-
elem = Span(text=src, **fp) if (fp or force_span_use) else src
self.current_para.append(elem)
@@ -651,24 +651,25 @@ class HTMLConverter(object):
def end_current_para(self):
'''
- End current paragraph with a paragraph break after it. If the current
- paragraph has no non whitespace text in it do nothing.
+ End current paragraph with a paragraph break after it.
+ '''
+ if self.current_para.contents:
+ self.current_block.append(self.current_para)
+ self.current_block.append(CR())
+ self.current_para = Paragraph()
+
+ def end_current_block(self):
+ '''
+ End current TextBlock. Create new TextBlock with the same styles.
'''
- if not self.current_para.has_text():
- return
if self.current_para.contents:
self.current_block.append(self.current_para)
self.current_para = Paragraph()
- if self.current_block.contents and \
- not isinstance(self.current_block.contents[-1], CR):
- self.current_block.append(CR())
-
- def end_current_block(self):
- self.current_para.append_to(self.current_block)
- self.current_block.append_to(self.current_page)
- self.current_para = Paragraph()
- self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle,
+ if self.current_block.contents or self.current_block.must_append:
+ self.current_page.append(self.current_block)
+ self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle,
blockStyle=self.current_block.blockStyle)
+
def process_image(self, path, tag_css, width=None, height=None, dropcaps=False):
original_path = path
@@ -1033,7 +1034,7 @@ class HTMLConverter(object):
return fp
- def process_block(self, tag, tag_css, tkey):
+ def process_block(self, tag, tag_css):
''' Ensure padding and text-indent properties are respected '''
text_properties = self.text_properties(tag_css)
block_properties = self.block_properties(tag_css)
@@ -1057,7 +1058,8 @@ class HTMLConverter(object):
self.block_styles.append(bs)
self.current_block = self.book.create_text_block(blockStyle=bs,
textStyle=ts)
- self.targets[tkey] = self.current_block
+ return True
+ return False
def parse_tag(self, tag, parent_css):
try:
@@ -1298,20 +1300,13 @@ class HTMLConverter(object):
self.current_para.append(elem(text))
elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
- tkey = None
- if self.anchor_ids and tag.has_key('id'):
- target = self.book.create_text_block(textStyle=self.current_block.textStyle,
- blockStyle=self.current_block.blockStyle)
+ new_block = self.process_block(tag, tag_css)
+ if self.anchor_ids and tag.has_key('id'):
tkey = self.target_prefix+tag['id']
- self.targets[tkey] = target
-
- if len(self.current_block.contents) > 2:
+ if not new_block:
self.end_current_block()
- self.current_page.append(target)
- self.unused_target_blocks.append(target)
- else:
- self.targets[tkey] = self.current_block
- self.current_block.must_append = True
+ self.current_block.must_append = True
+ self.targets[tkey] = self.current_block
src = self.get_text(tag, limit=1000)
if not self.disable_chapter_detection and tagname.startswith('h'):
if self.chapter_regex.search(src):
@@ -1320,18 +1315,16 @@ class HTMLConverter(object):
self.page_break_found = True
if not tag.contents:
self.current_block.append(CR())
- self.current_block.must_append = True
return
- self.process_block(tag, tag_css, tkey)
- if self.current_para.contents:
- self.current_block.append(self.current_para)
+
+ if self.current_para.has_text():
+ self.current_para.append_to(self.current_block)
if self.current_block.contents:
self.current_block.append(CR())
self.previous_text = '\n'
self.current_para = Paragraph()
-
self.process_children(tag, tag_css, tag_pseudo_css)
- if self.current_para.contents:
+ if self.current_para.contents :
self.current_block.append(self.current_para)
self.current_para = Paragraph()
if tagname.startswith('h') or self.blank_after_para:
@@ -1346,25 +1339,24 @@ class HTMLConverter(object):
self.line_break()
self.previous_text = '\n'
elif tagname in ['hr', 'tr']: # tr needed for nested tables
- self.end_current_para()
- self.line_break()
self.end_current_block()
if tagname == 'hr':
self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth']))
self.previous_text = '\n'
self.process_children(tag, tag_css, tag_pseudo_css)
elif tagname == 'td': # Needed for nested tables
- self.current_para.append(' ')
- self.previous_text = ' '
+ if not self.in_table:
+ self.current_para.append(' ')
+ self.previous_text = ' '
self.process_children(tag, tag_css, tag_pseudo_css)
elif tagname == 'table' and not self.ignore_tables and not self.in_table:
tag_css = self.tag_css(tag)[0] # Table should not inherit CSS
try:
self.process_table(tag, tag_css)
except Exception, err:
- self.logger.warning('An error occurred while processing a table: %s', str(err))
+ self.logger.warning('An error occurred while processing a table: %s. Ignoring table markup.', str(err))
self.logger.debug('', exc_info=True)
- self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300])
+ self.logger.debug('Bad table:\n%s', str(tag)[:300])
self.in_table = False
self.process_children(tag, tag_css, tag_pseudo_css)
finally:
@@ -1468,7 +1460,7 @@ def process_file(path, options, logger=None):
fheader = re.sub(r'%%a','%a',fheader)
fheader = re.sub(r'%%t','%t',fheader)
header.append(fheader + " ")
- book, fonts = Book(options, header=header, **args)
+ book, fonts = Book(options, logger, header=header, **args)
le = re.compile(options.link_exclude) if options.link_exclude else \
re.compile('$')
pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \
diff --git a/src/libprs500/ebooks/lrf/html/demo/demo.html b/src/libprs500/ebooks/lrf/html/demo/demo.html
index d0142ce120..3085c6a6fe 100644
--- a/src/libprs500/ebooks/lrf/html/demo/demo.html
+++ b/src/libprs500/ebooks/lrf/html/demo/demo.html
@@ -14,7 +14,7 @@
This file contains a demonstration of the capabilities of html2lrf, the HTML to LRF converter from libprs500. To obtain libprs500 visit
https://libprs500.kovidgoyal.net
-
+ Table of Contents
-
+ Lists
Nested lists
@@ -54,8 +54,6 @@
-
-
A matrix |
| Column 1 | Column 2 | Column 3 |
@@ -67,29 +65,27 @@
html2lrf supports both rowspan and colspan, but no other HTML table attributes, as it uses its own algorithm to determine optimal placement of cells.
-
Note that if you have custom fonts on your reader, the table may not be properly aligned. Also html2lrf does not support nested tables.
-
On the next page you'll see a real life example taken from a Project Gutenberg text with no modifications. It shows off html2lrf's handling of rowspan and colspan.
Sample Complex Table of Contents
@@ -120,8 +116,7 @@
A very indented paragraph
An unindented paragraph
- A default indented paragraph
-
+ A default indented paragraph
Table of Contents
diff --git a/src/libprs500/ebooks/lrf/html/table.py b/src/libprs500/ebooks/lrf/html/table.py
index c63b1dfeba..f21dc6f3b6 100644
--- a/src/libprs500/ebooks/lrf/html/table.py
+++ b/src/libprs500/ebooks/lrf/html/table.py
@@ -140,7 +140,7 @@ class Cell(object):
ts = tb.textStyle.attrs
default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
parindent = self.pts_to_pixels(ts['parindent'])
-
+ mwidth = 0
for token, attrs in tokens(tb):
font = default_font
if isinstance(token, int): # Handle para and line breaks
@@ -155,8 +155,10 @@ class Cell(object):
continue
word = token.split()
word = word[0] if word else ""
- width, height = font.getsize(word)
- return parindent + width + 2
+ width = font.getsize(word)[0]
+ if width > mwidth:
+ mwidth = width
+ return parindent + mwidth + 2
def text_block_size(self, tb, maxwidth=sys.maxint, debug=False):
ts = tb.textStyle.attrs
@@ -338,7 +340,7 @@ class Table(object):
adjustable_columns.append(i)
itercount = 0
- min_widths = [self.minimum_width(i) for i in xrange(cols)]
+ min_widths = [self.minimum_width(i)+10 for i in xrange(cols)]
while sum(widths) > maxwidth-((len(widths)-1)*self.colpad) and itercount < 100:
for i in adjustable_columns:
widths[i] = ceil((95./100.)*widths[i]) if \
diff --git a/src/libprs500/ebooks/lrf/pylrs/pylrs.py b/src/libprs500/ebooks/lrf/pylrs/pylrs.py
index a3e05acd0e..9f7302e8c1 100644
--- a/src/libprs500/ebooks/lrf/pylrs/pylrs.py
+++ b/src/libprs500/ebooks/lrf/pylrs/pylrs.py
@@ -249,7 +249,7 @@ class LrsContainer(object):
self.parent = None
self.contents = []
self.validChildren = validChildren
- self.must_append = False
+ self.must_append = False #: If True, even an empty container is appended by append_to
def has_text(self):
''' Return True iff this container has non whitespace text '''
@@ -261,7 +261,7 @@ class LrsContainer(object):
if child.has_text():
return True
for item in self.contents:
- if isinstance(item, (Plot, ImageBlock, Canvas)):
+ if isinstance(item, (Plot, ImageBlock, Canvas, CR)):
return True
return False
@@ -270,7 +270,7 @@ class LrsContainer(object):
Append self to C{parent} iff self has non whitespace textual content
@type parent: LrsContainer
'''
- if self.has_text() or self.must_append:
+ if self.contents or self.must_append:
parent.append(self)