Refinements for wasteland

This commit is contained in:
Kovid Goyal 2007-08-21 23:33:33 +00:00
parent 7402d7c4d8
commit d6c08b7da3
4 changed files with 63 additions and 74 deletions

View File

@ -294,6 +294,7 @@ class HTMLConverter(object):
pprop.update(self.pseudo_css[tagname])
if tag.has_key("class"):
cls = tag["class"].lower()
for cls in cls.split():
for classname in ["."+cls, tagname+"."+cls]:
if self.css.has_key(classname):
prop.update(self.css[classname])
@ -625,7 +626,6 @@ class HTMLConverter(object):
unneeded.append(prop)
for prop in unneeded:
fp.pop(prop)
elem = Span(text=src, **fp) if (fp or force_span_use) else src
self.current_para.append(elem)
@ -651,25 +651,26 @@ class HTMLConverter(object):
def end_current_para(self):
'''
End current paragraph with a paragraph break after it. If the current
paragraph has no non whitespace text in it do nothing.
End current paragraph with a paragraph break after it.
'''
if self.current_para.contents:
self.current_block.append(self.current_para)
self.current_block.append(CR())
self.current_para = Paragraph()
def end_current_block(self):
'''
End current TextBlock. Create new TextBlock with the same styles.
'''
if not self.current_para.has_text():
return
if self.current_para.contents:
self.current_block.append(self.current_para)
self.current_para = Paragraph()
if self.current_block.contents and \
not isinstance(self.current_block.contents[-1], CR):
self.current_block.append(CR())
def end_current_block(self):
self.current_para.append_to(self.current_block)
self.current_block.append_to(self.current_page)
self.current_para = Paragraph()
if self.current_block.contents or self.current_block.must_append:
self.current_page.append(self.current_block)
self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle,
blockStyle=self.current_block.blockStyle)
def process_image(self, path, tag_css, width=None, height=None, dropcaps=False):
original_path = path
if self.rotated_images.has_key(path):
@ -1033,7 +1034,7 @@ class HTMLConverter(object):
return fp
def process_block(self, tag, tag_css, tkey):
def process_block(self, tag, tag_css):
''' Ensure padding and text-indent properties are respected '''
text_properties = self.text_properties(tag_css)
block_properties = self.block_properties(tag_css)
@ -1057,7 +1058,8 @@ class HTMLConverter(object):
self.block_styles.append(bs)
self.current_block = self.book.create_text_block(blockStyle=bs,
textStyle=ts)
self.targets[tkey] = self.current_block
return True
return False
def parse_tag(self, tag, parent_css):
try:
@ -1298,20 +1300,13 @@ class HTMLConverter(object):
self.current_para.append(elem(text))
elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
tkey = None
new_block = self.process_block(tag, tag_css)
if self.anchor_ids and tag.has_key('id'):
target = self.book.create_text_block(textStyle=self.current_block.textStyle,
blockStyle=self.current_block.blockStyle)
tkey = self.target_prefix+tag['id']
self.targets[tkey] = target
if len(self.current_block.contents) > 2:
if not new_block:
self.end_current_block()
self.current_page.append(target)
self.unused_target_blocks.append(target)
else:
self.targets[tkey] = self.current_block
self.current_block.must_append = True
self.targets[tkey] = self.current_block
src = self.get_text(tag, limit=1000)
if not self.disable_chapter_detection and tagname.startswith('h'):
if self.chapter_regex.search(src):
@ -1320,18 +1315,16 @@ class HTMLConverter(object):
self.page_break_found = True
if not tag.contents:
self.current_block.append(CR())
self.current_block.must_append = True
return
self.process_block(tag, tag_css, tkey)
if self.current_para.contents:
self.current_block.append(self.current_para)
if self.current_para.has_text():
self.current_para.append_to(self.current_block)
if self.current_block.contents:
self.current_block.append(CR())
self.previous_text = '\n'
self.current_para = Paragraph()
self.process_children(tag, tag_css, tag_pseudo_css)
if self.current_para.contents:
if self.current_para.contents :
self.current_block.append(self.current_para)
self.current_para = Paragraph()
if tagname.startswith('h') or self.blank_after_para:
@ -1346,14 +1339,13 @@ class HTMLConverter(object):
self.line_break()
self.previous_text = '\n'
elif tagname in ['hr', 'tr']: # tr needed for nested tables
self.end_current_para()
self.line_break()
self.end_current_block()
if tagname == 'hr':
self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth']))
self.previous_text = '\n'
self.process_children(tag, tag_css, tag_pseudo_css)
elif tagname == 'td': # Needed for nested tables
if not self.in_table:
self.current_para.append(' ')
self.previous_text = ' '
self.process_children(tag, tag_css, tag_pseudo_css)
@ -1362,9 +1354,9 @@ class HTMLConverter(object):
try:
self.process_table(tag, tag_css)
except Exception, err:
self.logger.warning('An error occurred while processing a table: %s', str(err))
self.logger.warning('An error occurred while processing a table: %s. Ignoring table markup.', str(err))
self.logger.debug('', exc_info=True)
self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300])
self.logger.debug('Bad table:\n%s', str(tag)[:300])
self.in_table = False
self.process_children(tag, tag_css, tag_pseudo_css)
finally:
@ -1468,7 +1460,7 @@ def process_file(path, options, logger=None):
fheader = re.sub(r'%%a','%a',fheader)
fheader = re.sub(r'%%t','%t',fheader)
header.append(fheader + " ")
book, fonts = Book(options, header=header, **args)
book, fonts = Book(options, logger, header=header, **args)
le = re.compile(options.link_exclude) if options.link_exclude else \
re.compile('$')
pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \

View File

@ -14,7 +14,7 @@
This file contains a demonstration of the capabilities of <span style='font-family:monospace'>html2lrf</span>, the HTML to LRF converter from <em>libprs500.</em> To obtain libprs500 visit<br/><span style='font:sans-serif'>https://libprs500.kovidgoyal.net</span>
</p>
<br/>
<h2><a name='toc'>Table of Contents</a></h2>
<h2 id="toc">Table of Contents</h2>
<ul style='page-break-after:always'>
<li><a href='#lists'>Lists</a></li>
<li><a href='#tables'>Tables</a></li>
@ -25,7 +25,7 @@
<li><a href='#recursive'>Recursive link following</a></li>
</ul>
<h2><a name='lists'>Lists</a></h2>
<h2 id="lists">Lists</h2>
<h3>Nested lists</h3>
<ol>
@ -54,8 +54,6 @@
</p>
<h2><a name='tables'>Tables</a></h2>
<br/>
<table>
<tr><td colspan=4><h3 style="text-align:center">A matrix</h3></td></tr>
<tr><td></td><td style="text-align:center"><b>Column 1</b></td><td style="text-align:center"><b>Column 2</b></td><td style="text-align:center"><b>Column 3</b></td></tr>
@ -67,29 +65,27 @@
<p>
html2lrf supports both rowspan and colspan, but no other HTML table attributes, as it uses its own algorithm to determine optimal placement of cells.
</p>
<br/>
<p>
Note that if you have custom fonts on your reader, the table may not be properly aligned. Also html2lrf does not support nested tables.
</p>
<br />
<p style="page-break-after:always">
On the next page you'll see a real life example taken from a Project Gutenberg text with no modifications. It shows off html2lrf's handling of rowspan and colspan.
</p>
<h3 align="center">Sample Complex Table of Contents</h3>
<table summary="TOC">
<tr><td colspan="3">&nbsp;</td><td align="right">PAGE</td></tr>
<tr><td class="tocch" colspan="3"><a href="#PREFACE">Preface</a></td><td class="tocpn">v</td></tr>
<tr><td class="tocch" colspan="3"><a href="#REFERENCE_WORKS">List of Works of Reference</a></td><td class="tocpn">vii</td></tr>
<tr><td class="tocch" colspan="3"><a href="#LIST_OF_ILLUSTRATIONS">List of Illustrations</a></td><td class="tocpn">xi</td></tr>
<tr><td class="tocch">Chapter</td><td class="tocchr">I.</td><td class="tocch"><a href="#CHAPTER_I">History of the Foundation</a></td><td class="tocpn">3</td></tr>
<tr><td class="tocchr" colspan="2">II.</td><td class="tocch"><a href="#CHAPTER_II">Exterior of the Church</a></td><td class="tocpn">25</td></tr>
<tr><td class="tocchr" colspan="2">III.</td><td class="tocch"><a href="#CHAPTER_III">Interior of the Church</a></td><td class="tocpn">33</td></tr>
<tr><td class="tocchr" colspan="2">IV.</td><td class="tocch"><a href="#CHAPTER_IV">St. Bartholomew-the-Less and the Hospital</a></td><td class="tocpn">63</td></tr>
<tr><td class="tocch">Appendix</td><td class="tocchr">I.</td><td class="tocch"><a href="#APPENDIX_I">The Priory Seals</a></td><td class="tocpn">73</td></tr>
<tr><td class="tocchr" colspan="2">II.</td><td class="tocch"><a href="#APPENDIX_II">The Priors and Rectors</a></td><td class="tocpn">77</td></tr>
<tr><td class="tocchr" colspan="2">III.</td><td class="tocch"><a href="#APPENDIX_III">Inventory of Vestments, etc.</a></td><td class="tocpn">79</td></tr>
<tr><td class="tocchr" colspan="2">IV.</td><td class="tocch"><a href="#APPENDIX_IV">The Organ</a></td><td class="tocpn">80</td></tr>
<tr><td class="tocch" colspan="3"><a href="#INDEX">Index</a></td><td class="tocpn">83</td></tr>
<tr><td class="tocch" colspan="3">Preface</td><td class="tocpn">v</td></tr>
<tr><td class="tocch" colspan="3">List of Works of Reference</td><td class="tocpn">vii</td></tr>
<tr><td class="tocch" colspan="3">List of Illustrations</td><td class="tocpn">xi</td></tr>
<tr><td class="tocch">Chapter</td><td class="tocchr">I.</td><td class="tocch">History of the Foundation</td><td class="tocpn">3</td></tr>
<tr><td class="tocchr" colspan="2">II.</td><td class="tocch">Exterior of the Church</td><td class="tocpn">25</td></tr>
<tr><td class="tocchr" colspan="2">III.</td><td class="tocch">Interior of the Church</td><td class="tocpn">33</td></tr>
<tr><td class="tocchr" colspan="2">IV.</td><td class="tocch">St. Bartholomew-the-Less and the Hospital</td><td class="tocpn">63</td></tr>
<tr><td class="tocch">Appendix</td><td class="tocchr">I.</td><td class="tocch">The Priory Seals</td><td class="tocpn">73</td></tr>
<tr><td class="tocchr" colspan="2">II.</td><td class="tocch">The Priors and Rectors</td><td class="tocpn">77</td></tr>
<tr><td class="tocchr" colspan="2">III.</td><td class="tocch">Inventory of Vestments, etc.</td><td class="tocpn">79</td></tr>
<tr><td class="tocchr" colspan="2">IV.</td><td class="tocch">The Organ</td><td class="tocpn">80</td></tr>
<tr><td class="tocch" colspan="3">Index</td><td class="tocpn">83</td></tr>
</table>
<p class='toc'>
@ -120,8 +116,7 @@
<hr/>
<p style='text-indent:10em'>A very indented paragraph</p>
<p style='text-indent:0em'>An unindented paragraph</p>
<p>A default indented paragraph</p><br/>
<hr/>
<p>A default indented paragraph</p>
<p class='toc'>
<hr />
<a href='#toc'>Table of Contents</a>

View File

@ -140,7 +140,7 @@ class Cell(object):
ts = tb.textStyle.attrs
default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
parindent = self.pts_to_pixels(ts['parindent'])
mwidth = 0
for token, attrs in tokens(tb):
font = default_font
if isinstance(token, int): # Handle para and line breaks
@ -155,8 +155,10 @@ class Cell(object):
continue
word = token.split()
word = word[0] if word else ""
width, height = font.getsize(word)
return parindent + width + 2
width = font.getsize(word)[0]
if width > mwidth:
mwidth = width
return parindent + mwidth + 2
def text_block_size(self, tb, maxwidth=sys.maxint, debug=False):
ts = tb.textStyle.attrs
@ -338,7 +340,7 @@ class Table(object):
adjustable_columns.append(i)
itercount = 0
min_widths = [self.minimum_width(i) for i in xrange(cols)]
min_widths = [self.minimum_width(i)+10 for i in xrange(cols)]
while sum(widths) > maxwidth-((len(widths)-1)*self.colpad) and itercount < 100:
for i in adjustable_columns:
widths[i] = ceil((95./100.)*widths[i]) if \

View File

@ -249,7 +249,7 @@ class LrsContainer(object):
self.parent = None
self.contents = []
self.validChildren = validChildren
self.must_append = False
self.must_append = False #: If True even an empty container is appended by append_to
def has_text(self):
''' Return True iff this container has non whitespace text or holds a Plot, ImageBlock, Canvas or CR item '''
@ -261,7 +261,7 @@ class LrsContainer(object):
if child.has_text():
return True
for item in self.contents:
if isinstance(item, (Plot, ImageBlock, Canvas)):
if isinstance(item, (Plot, ImageBlock, Canvas, CR)):
return True
return False
@ -270,7 +270,7 @@ class LrsContainer(object):
Append self to C{parent} iff self has any contents or C{must_append} is set
@type parent: LrsContainer
'''
if self.has_text() or self.must_append:
if self.contents or self.must_append:
parent.append(self)