Refinements for wasteland

2025-07-07 10:14:46 -04:00 · 2007-08-21 23:33:33 +00:00 · 2007-08-21 23:33:33 +00:00 · d6c08b7da3
commit d6c08b7da3
parent 7402d7c4d8
4 changed files with 63 additions and 74 deletions
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@@ -293,12 +293,13 @@ class HTMLConverter(object):
        if self.pseudo_css.has_key(tagname):
            pprop.update(self.pseudo_css[tagname])
        if tag.has_key("class"):
-            cls = tag["class"].lower()            
+            cls = tag["class"].lower()
-            for classname in ["."+cls, tagname+"."+cls]:
+            for cls in cls.split():            
-                if self.css.has_key(classname):
+                for classname in ["."+cls, tagname+"."+cls]:
-                    prop.update(self.css[classname])
+                    if self.css.has_key(classname):
-                if self.pseudo_css.has_key(classname):
+                        prop.update(self.css[classname])
-                    pprop.update(self.pseudo_css[classname])
+                    if self.pseudo_css.has_key(classname):
                        pprop.update(self.pseudo_css[classname])
        if tag.has_key("style"):
            prop.update(self.parse_style_properties(tag["style"]))
        return prop, pprop
@@ -625,7 +626,6 @@ class HTMLConverter(object):
                    unneeded.append(prop)
            for prop in unneeded:
                fp.pop(prop)
            elem = Span(text=src, **fp) if (fp or force_span_use) else src
            self.current_para.append(elem)
@@ -651,24 +651,25 @@ class HTMLConverter(object):
    def end_current_para(self):
        ''' 
-        End current paragraph with a paragraph break after it. If the current
+        End current paragraph with a paragraph break after it. 
-        paragraph has no non whitespace text in it do nothing.
+        '''
        if self.current_para.contents:
            self.current_block.append(self.current_para)
        self.current_block.append(CR())
        self.current_para = Paragraph()
    def end_current_block(self):
        '''
        End current TextBlock. Create new TextBlock with the same styles.
        '''
        if not self.current_para.has_text():
            return
        if self.current_para.contents:
            self.current_block.append(self.current_para)
            self.current_para = Paragraph()
-        if self.current_block.contents and \
+        if self.current_block.contents or self.current_block.must_append:
-            not isinstance(self.current_block.contents[-1], CR):
+            self.current_page.append(self.current_block)
-            self.current_block.append(CR())
+            self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle,
    def end_current_block(self):
        '''
        End the current TextBlock and start a new one with the same styles.

        The current paragraph is handed to the current block, and the current
        block to the current page, via C{append_to}; whether each one is
        actually appended is decided by C{append_to} itself. A fresh Paragraph
        and a fresh TextBlock (reusing the previous block's textStyle and
        blockStyle) then become the current paragraph and block.
        '''
        self.current_para.append_to(self.current_block)
        self.current_block.append_to(self.current_page)
        self.current_para = Paragraph()
        # The styles are read from the old block before self.current_block is rebound.
        self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle,
                                                         blockStyle=self.current_block.blockStyle)
    def process_image(self, path, tag_css, width=None, height=None, dropcaps=False):
        original_path = path
@@ -1033,7 +1034,7 @@ class HTMLConverter(object):
        return fp
-    def process_block(self, tag, tag_css, tkey):        
+    def process_block(self, tag, tag_css):        
        ''' Ensure padding and text-indent properties are respected '''
        text_properties = self.text_properties(tag_css)
        block_properties = self.block_properties(tag_css)
@@ -1057,7 +1058,8 @@ class HTMLConverter(object):
                self.block_styles.append(bs)
            self.current_block = self.book.create_text_block(blockStyle=bs,
                                                             textStyle=ts)
-            self.targets[tkey] = self.current_block
+            return True
        return False            
    def parse_tag(self, tag, parent_css):
        try:
@@ -1298,20 +1300,13 @@ class HTMLConverter(object):
            self.current_para.append(elem(text))
        elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
-            tkey = None
+            new_block = self.process_block(tag, tag_css)
-            if self.anchor_ids and tag.has_key('id'):                
+            if self.anchor_ids and tag.has_key('id'):
                target = self.book.create_text_block(textStyle=self.current_block.textStyle,
                                                     blockStyle=self.current_block.blockStyle)
                tkey = self.target_prefix+tag['id']
-                self.targets[tkey] = target
+                if not new_block:
                if len(self.current_block.contents) > 2:
                    self.end_current_block()
-                    self.current_page.append(target)
+                self.current_block.must_append = True
-                    self.unused_target_blocks.append(target)
+                self.targets[tkey] = self.current_block
                else:
                    self.targets[tkey] = self.current_block
                    self.current_block.must_append = True
            src = self.get_text(tag, limit=1000)
            if not self.disable_chapter_detection and tagname.startswith('h'):
                if self.chapter_regex.search(src):
@@ -1320,18 +1315,16 @@ class HTMLConverter(object):
                    self.page_break_found = True
            if not tag.contents:
                self.current_block.append(CR())
                self.current_block.must_append = True
                return
-            self.process_block(tag, tag_css, tkey)
+            
-            if self.current_para.contents:
+            if self.current_para.has_text():
-                self.current_block.append(self.current_para)            
+                self.current_para.append_to(self.current_block)
            if self.current_block.contents:
                self.current_block.append(CR())
            self.previous_text = '\n'
            self.current_para = Paragraph()
            self.process_children(tag, tag_css, tag_pseudo_css)
-            if self.current_para.contents:
+            if self.current_para.contents :
                self.current_block.append(self.current_para)
            self.current_para = Paragraph()
            if tagname.startswith('h') or self.blank_after_para:
@@ -1346,25 +1339,24 @@ class HTMLConverter(object):
            self.line_break()
            self.previous_text = '\n'
        elif tagname in ['hr', 'tr']: # tr needed for nested tables
            self.end_current_para()
            self.line_break()
            self.end_current_block()
            if tagname == 'hr':
                self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth']))
            self.previous_text = '\n'
            self.process_children(tag, tag_css, tag_pseudo_css)
        elif tagname == 'td': # Needed for nested tables
-            self.current_para.append(' ')
+            if not self.in_table:
-            self.previous_text = ' '
+                self.current_para.append(' ')
                self.previous_text = ' '
            self.process_children(tag, tag_css, tag_pseudo_css)
        elif tagname == 'table' and not self.ignore_tables and not self.in_table:
            tag_css = self.tag_css(tag)[0] # Table should not inherit CSS
            try:
                self.process_table(tag, tag_css)
            except Exception, err:
-                self.logger.warning('An error occurred while processing a table: %s', str(err))
+                self.logger.warning('An error occurred while processing a table: %s. Ignoring table markup.', str(err))
                self.logger.debug('', exc_info=True)
-                self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300])
+                self.logger.debug('Bad table:\n%s', str(tag)[:300])
                self.in_table = False
                self.process_children(tag, tag_css, tag_pseudo_css)
            finally:                
@@ -1468,7 +1460,7 @@ def process_file(path, options, logger=None):
            fheader = re.sub(r'%%a','%a',fheader)
            fheader = re.sub(r'%%t','%t',fheader)                
            header.append(fheader + "  ")            
-        book, fonts = Book(options, header=header, **args)
+        book, fonts = Book(options, logger, header=header, **args)
        le = re.compile(options.link_exclude) if options.link_exclude else \
             re.compile('$')
        pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \
--- a/src/libprs500/ebooks/lrf/html/demo/demo.html
+++ b/src/libprs500/ebooks/lrf/html/demo/demo.html
@@ -14,7 +14,7 @@
  This file contains a demonstration of the capabilities of <span style='font-family:monospace'>html2lrf</span>, the HTML to LRF converter   from <em>libprs500.</em> To obtain libprs500 visit<br/><span style='font:sans-serif'>https://libprs500.kovidgoyal.net</span>
  </p>
  <br/>
-  <h2><a name='toc'>Table of Contents</a></h2>
+  <h2 id="toc">Table of Contents</h2>
  <ul style='page-break-after:always'>
    <li><a href='#lists'>Lists</a></li>
    <li><a href='#tables'>Tables</a></li>
@@ -25,7 +25,7 @@
    <li><a href='#recursive'>Recursive link following</a></li>
 </ul>
- <h2><a name='lists'>Lists</a></h2>
+ <h2 id="lists">Lists</h2>
 <h3>Nested lists</h3>
 <ol>
@@ -54,8 +54,6 @@
 </p>
 <h2><a name='tables'>Tables</a></h2>
 <br/>
 <table>
 <tr><td colspan=4><h3 style="text-align:center">A matrix</h3></td></tr>
 <tr><td></td><td style="text-align:center"><b>Column 1</b></td><td style="text-align:center"><b>Column 2</b></td><td style="text-align:center"><b>Column 3</b></td></tr>
@@ -67,29 +65,27 @@
 <p>
 html2lrf supports both rowspan and colspan, but no other HTML table attributes, as it uses its own algorithm to determine optimal placement of cells. 
 </p>
 <br/>
 <p>
 Note that if you have custom fonts on your reader, the table may not be properly aligned. Also html2lrf does not support nested tables.
 </p>
 <br />
 <p style="page-break-after:always">
 On the next page you'll see a real life example taken from a Project Gutenberg text with no modifications. It shows off html2lrf's handling of rowspan and colspan.
 </p>
 <h3 align="center">Sample Complex Table of Contents</h3>
 <table summary="TOC">
 <tr><td colspan="3">&nbsp;</td><td align="right">PAGE</td></tr>
- <tr><td class="tocch" colspan="3"><a href="#PREFACE">Preface</a></td><td class="tocpn">v</td></tr>
+ <tr><td class="tocch" colspan="3">Preface</td><td class="tocpn">v</td></tr>
- <tr><td class="tocch" colspan="3"><a href="#REFERENCE_WORKS">List of Works of Reference</a></td><td class="tocpn">vii</td></tr>
+ <tr><td class="tocch" colspan="3">List of Works of Reference</td><td class="tocpn">vii</td></tr>
- <tr><td class="tocch" colspan="3"><a href="#LIST_OF_ILLUSTRATIONS">List of Illustrations</a></td><td class="tocpn">xi</td></tr>
+ <tr><td class="tocch" colspan="3">List of Illustrations</td><td class="tocpn">xi</td></tr>
- <tr><td class="tocch">Chapter</td><td class="tocchr">I.</td><td class="tocch"><a href="#CHAPTER_I">History of the Foundation</a></td><td class="tocpn">3</td></tr>
+ <tr><td class="tocch">Chapter</td><td class="tocchr">I.</td><td class="tocch">History of the Foundation</td><td class="tocpn">3</td></tr>
- <tr><td class="tocchr" colspan="2">II.</td><td class="tocch"><a href="#CHAPTER_II">Exterior of the Church</a></td><td class="tocpn">25</td></tr>
+ <tr><td class="tocchr" colspan="2">II.</td><td class="tocch">Exterior of the Church</td><td class="tocpn">25</td></tr>
- <tr><td class="tocchr" colspan="2">III.</td><td class="tocch"><a href="#CHAPTER_III">Interior of the Church</a></td><td class="tocpn">33</td></tr>
+ <tr><td class="tocchr" colspan="2">III.</td><td class="tocch">Interior of the Church</td><td class="tocpn">33</td></tr>
- <tr><td class="tocchr" colspan="2">IV.</td><td class="tocch"><a href="#CHAPTER_IV">St. Bartholomew-the-Less and the Hospital</a></td><td class="tocpn">63</td></tr>
+ <tr><td class="tocchr" colspan="2">IV.</td><td class="tocch">St. Bartholomew-the-Less and the Hospital</td><td class="tocpn">63</td></tr>
- <tr><td class="tocch">Appendix</td><td class="tocchr">I.</td><td class="tocch"><a href="#APPENDIX_I">The Priory Seals</a></td><td class="tocpn">73</td></tr>
+ <tr><td class="tocch">Appendix</td><td class="tocchr">I.</td><td class="tocch">The Priory Seals</td><td class="tocpn">73</td></tr>
- <tr><td class="tocchr" colspan="2">II.</td><td class="tocch"><a href="#APPENDIX_II">The Priors and Rectors</a></td><td class="tocpn">77</td></tr>
+ <tr><td class="tocchr" colspan="2">II.</td><td class="tocch">The Priors and Rectors</td><td class="tocpn">77</td></tr>
- <tr><td class="tocchr" colspan="2">III.</td><td class="tocch"><a href="#APPENDIX_III">Inventory of Vestments, etc.</a></td><td class="tocpn">79</td></tr>
+ <tr><td class="tocchr" colspan="2">III.</td><td class="tocch">Inventory of Vestments, etc.</td><td class="tocpn">79</td></tr>
- <tr><td class="tocchr" colspan="2">IV.</td><td class="tocch"><a href="#APPENDIX_IV">The Organ</a></td><td class="tocpn">80</td></tr>
+ <tr><td class="tocchr" colspan="2">IV.</td><td class="tocch">The Organ</td><td class="tocpn">80</td></tr>
- <tr><td class="tocch" colspan="3"><a href="#INDEX">Index</a></td><td class="tocpn">83</td></tr>
+ <tr><td class="tocch" colspan="3">Index</td><td class="tocpn">83</td></tr>
 </table>
 <p class='toc'>
@@ -120,8 +116,7 @@
 <hr/>
 <p style='text-indent:10em'>A very indented paragraph</p>
 <p style='text-indent:0em'>An unindented paragraph</p>
- <p>A default indented paragraph</p><br/>
+ <p>A default indented paragraph</p>
 <hr/>
 <p class='toc'>
 <hr />
 <a href='#toc'>Table of Contents</a>
--- a/src/libprs500/ebooks/lrf/html/table.py
+++ b/src/libprs500/ebooks/lrf/html/table.py
@@ -140,7 +140,7 @@ class Cell(object):
        ts = tb.textStyle.attrs
        default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
        parindent = self.pts_to_pixels(ts['parindent'])
-        
+        mwidth = 0
        for token, attrs in tokens(tb):
            font = default_font
            if isinstance(token, int): # Handle para and line breaks        
@@ -155,8 +155,10 @@ class Cell(object):
                continue
            word = token.split()
            word = word[0] if word else ""
-            width, height = font.getsize(word)            
+            width = font.getsize(word)[0]
-            return parindent + width + 2
+            if width > mwidth:
                mwidth = width
        return parindent + mwidth + 2
    def text_block_size(self, tb, maxwidth=sys.maxint, debug=False):
        ts = tb.textStyle.attrs
@@ -338,7 +340,7 @@ class Table(object):
                adjustable_columns.append(i)
        itercount = 0
-        min_widths = [self.minimum_width(i) for i in xrange(cols)]
+        min_widths = [self.minimum_width(i)+10 for i in xrange(cols)]
        while sum(widths) > maxwidth-((len(widths)-1)*self.colpad) and itercount < 100:
            for i in adjustable_columns:
                widths[i] = ceil((95./100.)*widths[i]) if \
--- a/src/libprs500/ebooks/lrf/pylrs/pylrs.py
+++ b/src/libprs500/ebooks/lrf/pylrs/pylrs.py
@@ -249,7 +249,7 @@ class LrsContainer(object):
        self.parent = None
        self.contents = []
        self.validChildren = validChildren
-        self.must_append = False
+        self.must_append = False #: If True even an empty container is appended by append_to
    def has_text(self):
        ''' Return True iff this container has non whitespace text '''
@@ -261,7 +261,7 @@ class LrsContainer(object):
                if child.has_text():
                    return True
        for item in self.contents:
-            if isinstance(item, (Plot, ImageBlock, Canvas)):
+            if isinstance(item, (Plot, ImageBlock, Canvas, CR)):
                return True
        return False
@@ -270,7 +270,7 @@ class LrsContainer(object):
        Append self to C{parent} iff self has non whitespace textual content        
        @type parent: LrsContainer
        '''
-        if self.has_text() or self.must_append:
+        if self.contents or self.must_append:
            parent.append(self)