Refinements for wasteland

This commit is contained in:
Kovid Goyal 2007-08-21 23:33:33 +00:00
parent 7402d7c4d8
commit d6c08b7da3
4 changed files with 63 additions and 74 deletions

View File

@ -293,12 +293,13 @@ class HTMLConverter(object):
if self.pseudo_css.has_key(tagname): if self.pseudo_css.has_key(tagname):
pprop.update(self.pseudo_css[tagname]) pprop.update(self.pseudo_css[tagname])
if tag.has_key("class"): if tag.has_key("class"):
cls = tag["class"].lower() cls = tag["class"].lower()
for classname in ["."+cls, tagname+"."+cls]: for cls in cls.split():
if self.css.has_key(classname): for classname in ["."+cls, tagname+"."+cls]:
prop.update(self.css[classname]) if self.css.has_key(classname):
if self.pseudo_css.has_key(classname): prop.update(self.css[classname])
pprop.update(self.pseudo_css[classname]) if self.pseudo_css.has_key(classname):
pprop.update(self.pseudo_css[classname])
if tag.has_key("style"): if tag.has_key("style"):
prop.update(self.parse_style_properties(tag["style"])) prop.update(self.parse_style_properties(tag["style"]))
return prop, pprop return prop, pprop
@ -625,7 +626,6 @@ class HTMLConverter(object):
unneeded.append(prop) unneeded.append(prop)
for prop in unneeded: for prop in unneeded:
fp.pop(prop) fp.pop(prop)
elem = Span(text=src, **fp) if (fp or force_span_use) else src elem = Span(text=src, **fp) if (fp or force_span_use) else src
self.current_para.append(elem) self.current_para.append(elem)
@ -651,24 +651,25 @@ class HTMLConverter(object):
def end_current_para(self): def end_current_para(self):
''' '''
End current paragraph with a paragraph break after it. If the current End current paragraph with a paragraph break after it.
paragraph has no non whitespace text in it do nothing. '''
if self.current_para.contents:
self.current_block.append(self.current_para)
self.current_block.append(CR())
self.current_para = Paragraph()
def end_current_block(self):
'''
End current TextBlock. Create new TextBlock with the same styles.
''' '''
if not self.current_para.has_text():
return
if self.current_para.contents: if self.current_para.contents:
self.current_block.append(self.current_para) self.current_block.append(self.current_para)
self.current_para = Paragraph() self.current_para = Paragraph()
if self.current_block.contents and \ if self.current_block.contents or self.current_block.must_append:
not isinstance(self.current_block.contents[-1], CR): self.current_page.append(self.current_block)
self.current_block.append(CR()) self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle,
def end_current_block(self):
self.current_para.append_to(self.current_block)
self.current_block.append_to(self.current_page)
self.current_para = Paragraph()
self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle,
blockStyle=self.current_block.blockStyle) blockStyle=self.current_block.blockStyle)
def process_image(self, path, tag_css, width=None, height=None, dropcaps=False): def process_image(self, path, tag_css, width=None, height=None, dropcaps=False):
original_path = path original_path = path
@ -1033,7 +1034,7 @@ class HTMLConverter(object):
return fp return fp
def process_block(self, tag, tag_css, tkey): def process_block(self, tag, tag_css):
''' Ensure padding and text-indent properties are respected ''' ''' Ensure padding and text-indent properties are respected '''
text_properties = self.text_properties(tag_css) text_properties = self.text_properties(tag_css)
block_properties = self.block_properties(tag_css) block_properties = self.block_properties(tag_css)
@ -1057,7 +1058,8 @@ class HTMLConverter(object):
self.block_styles.append(bs) self.block_styles.append(bs)
self.current_block = self.book.create_text_block(blockStyle=bs, self.current_block = self.book.create_text_block(blockStyle=bs,
textStyle=ts) textStyle=ts)
self.targets[tkey] = self.current_block return True
return False
def parse_tag(self, tag, parent_css): def parse_tag(self, tag, parent_css):
try: try:
@ -1298,20 +1300,13 @@ class HTMLConverter(object):
self.current_para.append(elem(text)) self.current_para.append(elem(text))
elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']: elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
tkey = None new_block = self.process_block(tag, tag_css)
if self.anchor_ids and tag.has_key('id'): if self.anchor_ids and tag.has_key('id'):
target = self.book.create_text_block(textStyle=self.current_block.textStyle,
blockStyle=self.current_block.blockStyle)
tkey = self.target_prefix+tag['id'] tkey = self.target_prefix+tag['id']
self.targets[tkey] = target if not new_block:
if len(self.current_block.contents) > 2:
self.end_current_block() self.end_current_block()
self.current_page.append(target) self.current_block.must_append = True
self.unused_target_blocks.append(target) self.targets[tkey] = self.current_block
else:
self.targets[tkey] = self.current_block
self.current_block.must_append = True
src = self.get_text(tag, limit=1000) src = self.get_text(tag, limit=1000)
if not self.disable_chapter_detection and tagname.startswith('h'): if not self.disable_chapter_detection and tagname.startswith('h'):
if self.chapter_regex.search(src): if self.chapter_regex.search(src):
@ -1320,18 +1315,16 @@ class HTMLConverter(object):
self.page_break_found = True self.page_break_found = True
if not tag.contents: if not tag.contents:
self.current_block.append(CR()) self.current_block.append(CR())
self.current_block.must_append = True
return return
self.process_block(tag, tag_css, tkey)
if self.current_para.contents: if self.current_para.has_text():
self.current_block.append(self.current_para) self.current_para.append_to(self.current_block)
if self.current_block.contents: if self.current_block.contents:
self.current_block.append(CR()) self.current_block.append(CR())
self.previous_text = '\n' self.previous_text = '\n'
self.current_para = Paragraph() self.current_para = Paragraph()
self.process_children(tag, tag_css, tag_pseudo_css) self.process_children(tag, tag_css, tag_pseudo_css)
if self.current_para.contents: if self.current_para.contents :
self.current_block.append(self.current_para) self.current_block.append(self.current_para)
self.current_para = Paragraph() self.current_para = Paragraph()
if tagname.startswith('h') or self.blank_after_para: if tagname.startswith('h') or self.blank_after_para:
@ -1346,25 +1339,24 @@ class HTMLConverter(object):
self.line_break() self.line_break()
self.previous_text = '\n' self.previous_text = '\n'
elif tagname in ['hr', 'tr']: # tr needed for nested tables elif tagname in ['hr', 'tr']: # tr needed for nested tables
self.end_current_para()
self.line_break()
self.end_current_block() self.end_current_block()
if tagname == 'hr': if tagname == 'hr':
self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth'])) self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth']))
self.previous_text = '\n' self.previous_text = '\n'
self.process_children(tag, tag_css, tag_pseudo_css) self.process_children(tag, tag_css, tag_pseudo_css)
elif tagname == 'td': # Needed for nested tables elif tagname == 'td': # Needed for nested tables
self.current_para.append(' ') if not self.in_table:
self.previous_text = ' ' self.current_para.append(' ')
self.previous_text = ' '
self.process_children(tag, tag_css, tag_pseudo_css) self.process_children(tag, tag_css, tag_pseudo_css)
elif tagname == 'table' and not self.ignore_tables and not self.in_table: elif tagname == 'table' and not self.ignore_tables and not self.in_table:
tag_css = self.tag_css(tag)[0] # Table should not inherit CSS tag_css = self.tag_css(tag)[0] # Table should not inherit CSS
try: try:
self.process_table(tag, tag_css) self.process_table(tag, tag_css)
except Exception, err: except Exception, err:
self.logger.warning('An error occurred while processing a table: %s', str(err)) self.logger.warning('An error occurred while processing a table: %s. Ignoring table markup.', str(err))
self.logger.debug('', exc_info=True) self.logger.debug('', exc_info=True)
self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300]) self.logger.debug('Bad table:\n%s', str(tag)[:300])
self.in_table = False self.in_table = False
self.process_children(tag, tag_css, tag_pseudo_css) self.process_children(tag, tag_css, tag_pseudo_css)
finally: finally:
@ -1468,7 +1460,7 @@ def process_file(path, options, logger=None):
fheader = re.sub(r'%%a','%a',fheader) fheader = re.sub(r'%%a','%a',fheader)
fheader = re.sub(r'%%t','%t',fheader) fheader = re.sub(r'%%t','%t',fheader)
header.append(fheader + " ") header.append(fheader + " ")
book, fonts = Book(options, header=header, **args) book, fonts = Book(options, logger, header=header, **args)
le = re.compile(options.link_exclude) if options.link_exclude else \ le = re.compile(options.link_exclude) if options.link_exclude else \
re.compile('$') re.compile('$')
pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \ pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \

View File

@ -14,7 +14,7 @@
This file contains a demonstration of the capabilities of <span style='font-family:monospace'>html2lrf</span>, the HTML to LRF converter from <em>libprs500.</em> To obtain libprs500 visit<br/><span style='font:sans-serif'>https://libprs500.kovidgoyal.net</span> This file contains a demonstration of the capabilities of <span style='font-family:monospace'>html2lrf</span>, the HTML to LRF converter from <em>libprs500.</em> To obtain libprs500 visit<br/><span style='font:sans-serif'>https://libprs500.kovidgoyal.net</span>
</p> </p>
<br/> <br/>
<h2><a name='toc'>Table of Contents</a></h2> <h2 id="toc">Table of Contents</h2>
<ul style='page-break-after:always'> <ul style='page-break-after:always'>
<li><a href='#lists'>Lists</a></li> <li><a href='#lists'>Lists</a></li>
<li><a href='#tables'>Tables</a></li> <li><a href='#tables'>Tables</a></li>
@ -25,7 +25,7 @@
<li><a href='#recursive'>Recursive link following</a></li> <li><a href='#recursive'>Recursive link following</a></li>
</ul> </ul>
<h2><a name='lists'>Lists</a></h2> <h2 id="lists">Lists</h2>
<h3>Nested lists</h3> <h3>Nested lists</h3>
<ol> <ol>
@ -54,8 +54,6 @@
</p> </p>
<h2><a name='tables'>Tables</a></h2> <h2><a name='tables'>Tables</a></h2>
<br/>
<table> <table>
<tr><td colspan=4><h3 style="text-align:center">A matrix</h3></td></tr> <tr><td colspan=4><h3 style="text-align:center">A matrix</h3></td></tr>
<tr><td></td><td style="text-align:center"><b>Column 1</b></td><td style="text-align:center"><b>Column 2</b></td><td style="text-align:center"><b>Column 3</b></td></tr> <tr><td></td><td style="text-align:center"><b>Column 1</b></td><td style="text-align:center"><b>Column 2</b></td><td style="text-align:center"><b>Column 3</b></td></tr>
@ -67,29 +65,27 @@
<p> <p>
html2lrf supports both rowspan and colspan, but no other HTML table attributes, as it uses its own algorithm to determine optimal placement of cells. html2lrf supports both rowspan and colspan, but no other HTML table attributes, as it uses its own algorithm to determine optimal placement of cells.
</p> </p>
<br/>
<p> <p>
Note that if you have custom fonts on your reader, the table may not be properly aligned. Also html2lrf does not support nested tables. Note that if you have custom fonts on your reader, the table may not be properly aligned. Also html2lrf does not support nested tables.
</p> </p>
<br />
<p style="page-break-after:always"> <p style="page-break-after:always">
On the next page you'll see a real life example taken from a Project Gutenberg text with no modifications. It shows off html2lrf's handling of rowspan and colspan. On the next page you'll see a real life example taken from a Project Gutenberg text with no modifications. It shows off html2lrf's handling of rowspan and colspan.
</p> </p>
<h3 align="center">Sample Complex Table of Contents</h3> <h3 align="center">Sample Complex Table of Contents</h3>
<table summary="TOC"> <table summary="TOC">
<tr><td colspan="3">&nbsp;</td><td align="right">PAGE</td></tr> <tr><td colspan="3">&nbsp;</td><td align="right">PAGE</td></tr>
<tr><td class="tocch" colspan="3"><a href="#PREFACE">Preface</a></td><td class="tocpn">v</td></tr> <tr><td class="tocch" colspan="3">Preface</td><td class="tocpn">v</td></tr>
<tr><td class="tocch" colspan="3"><a href="#REFERENCE_WORKS">List of Works of Reference</a></td><td class="tocpn">vii</td></tr> <tr><td class="tocch" colspan="3">List of Works of Reference</td><td class="tocpn">vii</td></tr>
<tr><td class="tocch" colspan="3"><a href="#LIST_OF_ILLUSTRATIONS">List of Illustrations</a></td><td class="tocpn">xi</td></tr> <tr><td class="tocch" colspan="3">List of Illustrations</td><td class="tocpn">xi</td></tr>
<tr><td class="tocch">Chapter</td><td class="tocchr">I.</td><td class="tocch"><a href="#CHAPTER_I">History of the Foundation</a></td><td class="tocpn">3</td></tr> <tr><td class="tocch">Chapter</td><td class="tocchr">I.</td><td class="tocch">History of the Foundation</td><td class="tocpn">3</td></tr>
<tr><td class="tocchr" colspan="2">II.</td><td class="tocch"><a href="#CHAPTER_II">Exterior of the Church</a></td><td class="tocpn">25</td></tr> <tr><td class="tocchr" colspan="2">II.</td><td class="tocch">Exterior of the Church</td><td class="tocpn">25</td></tr>
<tr><td class="tocchr" colspan="2">III.</td><td class="tocch"><a href="#CHAPTER_III">Interior of the Church</a></td><td class="tocpn">33</td></tr> <tr><td class="tocchr" colspan="2">III.</td><td class="tocch">Interior of the Church</td><td class="tocpn">33</td></tr>
<tr><td class="tocchr" colspan="2">IV.</td><td class="tocch"><a href="#CHAPTER_IV">St. Bartholomew-the-Less and the Hospital</a></td><td class="tocpn">63</td></tr> <tr><td class="tocchr" colspan="2">IV.</td><td class="tocch">St. Bartholomew-the-Less and the Hospital</td><td class="tocpn">63</td></tr>
<tr><td class="tocch">Appendix</td><td class="tocchr">I.</td><td class="tocch"><a href="#APPENDIX_I">The Priory Seals</a></td><td class="tocpn">73</td></tr> <tr><td class="tocch">Appendix</td><td class="tocchr">I.</td><td class="tocch">The Priory Seals</td><td class="tocpn">73</td></tr>
<tr><td class="tocchr" colspan="2">II.</td><td class="tocch"><a href="#APPENDIX_II">The Priors and Rectors</a></td><td class="tocpn">77</td></tr> <tr><td class="tocchr" colspan="2">II.</td><td class="tocch">The Priors and Rectors</td><td class="tocpn">77</td></tr>
<tr><td class="tocchr" colspan="2">III.</td><td class="tocch"><a href="#APPENDIX_III">Inventory of Vestments, etc.</a></td><td class="tocpn">79</td></tr> <tr><td class="tocchr" colspan="2">III.</td><td class="tocch">Inventory of Vestments, etc.</td><td class="tocpn">79</td></tr>
<tr><td class="tocchr" colspan="2">IV.</td><td class="tocch"><a href="#APPENDIX_IV">The Organ</a></td><td class="tocpn">80</td></tr> <tr><td class="tocchr" colspan="2">IV.</td><td class="tocch">The Organ</td><td class="tocpn">80</td></tr>
<tr><td class="tocch" colspan="3"><a href="#INDEX">Index</a></td><td class="tocpn">83</td></tr> <tr><td class="tocch" colspan="3">Index</td><td class="tocpn">83</td></tr>
</table> </table>
<p class='toc'> <p class='toc'>
@ -120,8 +116,7 @@
<hr/> <hr/>
<p style='text-indent:10em'>A very indented paragraph</p> <p style='text-indent:10em'>A very indented paragraph</p>
<p style='text-indent:0em'>An unindented paragraph</p> <p style='text-indent:0em'>An unindented paragraph</p>
<p>A default indented paragraph</p><br/> <p>A default indented paragraph</p>
<hr/>
<p class='toc'> <p class='toc'>
<hr /> <hr />
<a href='#toc'>Table of Contents</a> <a href='#toc'>Table of Contents</a>

View File

@ -140,7 +140,7 @@ class Cell(object):
ts = tb.textStyle.attrs ts = tb.textStyle.attrs
default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize'])) default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
parindent = self.pts_to_pixels(ts['parindent']) parindent = self.pts_to_pixels(ts['parindent'])
mwidth = 0
for token, attrs in tokens(tb): for token, attrs in tokens(tb):
font = default_font font = default_font
if isinstance(token, int): # Handle para and line breaks if isinstance(token, int): # Handle para and line breaks
@ -155,8 +155,10 @@ class Cell(object):
continue continue
word = token.split() word = token.split()
word = word[0] if word else "" word = word[0] if word else ""
width, height = font.getsize(word) width = font.getsize(word)[0]
return parindent + width + 2 if width > mwidth:
mwidth = width
return parindent + mwidth + 2
def text_block_size(self, tb, maxwidth=sys.maxint, debug=False): def text_block_size(self, tb, maxwidth=sys.maxint, debug=False):
ts = tb.textStyle.attrs ts = tb.textStyle.attrs
@ -338,7 +340,7 @@ class Table(object):
adjustable_columns.append(i) adjustable_columns.append(i)
itercount = 0 itercount = 0
min_widths = [self.minimum_width(i) for i in xrange(cols)] min_widths = [self.minimum_width(i)+10 for i in xrange(cols)]
while sum(widths) > maxwidth-((len(widths)-1)*self.colpad) and itercount < 100: while sum(widths) > maxwidth-((len(widths)-1)*self.colpad) and itercount < 100:
for i in adjustable_columns: for i in adjustable_columns:
widths[i] = ceil((95./100.)*widths[i]) if \ widths[i] = ceil((95./100.)*widths[i]) if \

View File

@ -249,7 +249,7 @@ class LrsContainer(object):
self.parent = None self.parent = None
self.contents = [] self.contents = []
self.validChildren = validChildren self.validChildren = validChildren
self.must_append = False self.must_append = False #: If True even an empty container is appended by append_to
def has_text(self): def has_text(self):
''' Return True iff this container has non whitespace text ''' ''' Return True iff this container has non whitespace text '''
@ -261,7 +261,7 @@ class LrsContainer(object):
if child.has_text(): if child.has_text():
return True return True
for item in self.contents: for item in self.contents:
if isinstance(item, (Plot, ImageBlock, Canvas)): if isinstance(item, (Plot, ImageBlock, Canvas, CR)):
return True return True
return False return False
@ -270,7 +270,7 @@ class LrsContainer(object):
Append self to C{parent} iff self has non whitespace textual content Append self to C{parent} iff self has non whitespace textual content
@type parent: LrsContainer @type parent: LrsContainer
''' '''
if self.has_text() or self.must_append: if self.contents or self.must_append:
parent.append(self) parent.append(self)