Refinements for wasteland

2025-06-23 15:30:45 -04:00 · 2007-08-21 23:33:33 +00:00 · 2007-08-21 23:33:33 +00:00 · d6c08b7da3
commit d6c08b7da3
parent 7402d7c4d8
4 changed files with 63 additions and 74 deletions
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@@ -294,6 +294,7 @@ class HTMLConverter(object):
            pprop.update(self.pseudo_css[tagname])
        if tag.has_key("class"):
            cls = tag["class"].lower()
+            for cls in cls.split():            
                for classname in ["."+cls, tagname+"."+cls]:
                    if self.css.has_key(classname):
                        prop.update(self.css[classname])
@@ -625,7 +626,6 @@ class HTMLConverter(object):
                    unneeded.append(prop)
            for prop in unneeded:
                fp.pop(prop)
-                
            elem = Span(text=src, **fp) if (fp or force_span_use) else src
            self.current_para.append(elem)
        
@@ -651,25 +651,26 @@ class HTMLConverter(object):
        
    def end_current_para(self):
        ''' 
-        End current paragraph with a paragraph break after it. If the current
-        paragraph has no non whitespace text in it do nothing.
+        End current paragraph with a paragraph break after it. 
+        '''
+        if self.current_para.contents:
+            self.current_block.append(self.current_para)
+        self.current_block.append(CR())
+        self.current_para = Paragraph()
+            
+    def end_current_block(self):
+        '''
+        End current TextBlock. Create new TextBlock with the same styles.
        '''
-        if not self.current_para.has_text():
-            return
        if self.current_para.contents:
            self.current_block.append(self.current_para)
            self.current_para = Paragraph()
-        if self.current_block.contents and \
-            not isinstance(self.current_block.contents[-1], CR):
-            self.current_block.append(CR())
-            
-    def end_current_block(self):
-        self.current_para.append_to(self.current_block)
-        self.current_block.append_to(self.current_page)
-        self.current_para = Paragraph()
+        if self.current_block.contents or self.current_block.must_append:
+            self.current_page.append(self.current_block)
            self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle,
                                                         blockStyle=self.current_block.blockStyle)
        
+    
    def process_image(self, path, tag_css, width=None, height=None, dropcaps=False):
        original_path = path
        if self.rotated_images.has_key(path):
@@ -1033,7 +1034,7 @@ class HTMLConverter(object):
        return fp
        
    
-    def process_block(self, tag, tag_css, tkey):        
+    def process_block(self, tag, tag_css):        
        ''' Ensure padding and text-indent properties are respected '''
        text_properties = self.text_properties(tag_css)
        block_properties = self.block_properties(tag_css)
@@ -1057,7 +1058,8 @@ class HTMLConverter(object):
                self.block_styles.append(bs)
            self.current_block = self.book.create_text_block(blockStyle=bs,
                                                             textStyle=ts)
-            self.targets[tkey] = self.current_block
+            return True
+        return False            
    
    def parse_tag(self, tag, parent_css):
        try:
@@ -1298,20 +1300,13 @@ class HTMLConverter(object):
            self.current_para.append(elem(text))
                                
        elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
-            tkey = None
+            new_block = self.process_block(tag, tag_css)
            if self.anchor_ids and tag.has_key('id'):
-                target = self.book.create_text_block(textStyle=self.current_block.textStyle,
-                                                     blockStyle=self.current_block.blockStyle)
                tkey = self.target_prefix+tag['id']
-                self.targets[tkey] = target
-                
-                if len(self.current_block.contents) > 2:
+                if not new_block:
                    self.end_current_block()
-                    self.current_page.append(target)
-                    self.unused_target_blocks.append(target)
-                else:
-                    self.targets[tkey] = self.current_block
                self.current_block.must_append = True
+                self.targets[tkey] = self.current_block
            src = self.get_text(tag, limit=1000)
            if not self.disable_chapter_detection and tagname.startswith('h'):
                if self.chapter_regex.search(src):
@@ -1320,18 +1315,16 @@ class HTMLConverter(object):
                    self.page_break_found = True
            if not tag.contents:
                self.current_block.append(CR())
-                self.current_block.must_append = True
                return
-            self.process_block(tag, tag_css, tkey)
-            if self.current_para.contents:
-                self.current_block.append(self.current_para)            
+            
+            if self.current_para.has_text():
+                self.current_para.append_to(self.current_block)
            if self.current_block.contents:
                self.current_block.append(CR())
            self.previous_text = '\n'
            self.current_para = Paragraph()
-            
            self.process_children(tag, tag_css, tag_pseudo_css)
-            if self.current_para.contents:
+            if self.current_para.contents :
                self.current_block.append(self.current_para)
            self.current_para = Paragraph()
            if tagname.startswith('h') or self.blank_after_para:
@@ -1346,14 +1339,13 @@ class HTMLConverter(object):
            self.line_break()
            self.previous_text = '\n'
        elif tagname in ['hr', 'tr']: # tr needed for nested tables
-            self.end_current_para()
-            self.line_break()
            self.end_current_block()
            if tagname == 'hr':
                self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth']))
            self.previous_text = '\n'
            self.process_children(tag, tag_css, tag_pseudo_css)
        elif tagname == 'td': # Needed for nested tables
+            if not self.in_table:
                self.current_para.append(' ')
                self.previous_text = ' '
            self.process_children(tag, tag_css, tag_pseudo_css)
@@ -1362,9 +1354,9 @@ class HTMLConverter(object):
            try:
                self.process_table(tag, tag_css)
            except Exception, err:
-                self.logger.warning('An error occurred while processing a table: %s', str(err))
+                self.logger.warning('An error occurred while processing a table: %s. Ignoring table markup.', str(err))
                self.logger.debug('', exc_info=True)
-                self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300])
+                self.logger.debug('Bad table:\n%s', str(tag)[:300])
                self.in_table = False
                self.process_children(tag, tag_css, tag_pseudo_css)
            finally:                
@@ -1468,7 +1460,7 @@ def process_file(path, options, logger=None):
            fheader = re.sub(r'%%a','%a',fheader)
            fheader = re.sub(r'%%t','%t',fheader)                
            header.append(fheader + "  ")            
-        book, fonts = Book(options, header=header, **args)
+        book, fonts = Book(options, logger, header=header, **args)
        le = re.compile(options.link_exclude) if options.link_exclude else \
             re.compile('$')
        pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \
--- a/src/libprs500/ebooks/lrf/html/demo/demo.html
+++ b/src/libprs500/ebooks/lrf/html/demo/demo.html
@@ -14,7 +14,7 @@
  This file contains a demonstration of the capabilities of <span style='font-family:monospace'>html2lrf</span>, the HTML to LRF converter   from <em>libprs500.</em> To obtain libprs500 visit<br/><span style='font:sans-serif'>https://libprs500.kovidgoyal.net</span>
  </p>
  <br/>
-  <h2><a name='toc'>Table of Contents</a></h2>
+  <h2 id="toc">Table of Contents</h2>
  <ul style='page-break-after:always'>
    <li><a href='#lists'>Lists</a></li>
    <li><a href='#tables'>Tables</a></li>
@@ -25,7 +25,7 @@
    <li><a href='#recursive'>Recursive link following</a></li>
 </ul>

- <h2><a name='lists'>Lists</a></h2>
+ <h2 id="lists">Lists</h2>
 
 <h3>Nested lists</h3>
 <ol>
@@ -54,8 +54,6 @@
 </p>

 <h2><a name='tables'>Tables</a></h2>
- <br/>
-
 <table>
 <tr><td colspan=4><h3 style="text-align:center">A matrix</h3></td></tr>
 <tr><td></td><td style="text-align:center"><b>Column 1</b></td><td style="text-align:center"><b>Column 2</b></td><td style="text-align:center"><b>Column 3</b></td></tr>
@@ -67,29 +65,27 @@
 <p>
 html2lrf supports both rowspan and colspan, but no other HTML table attributes, as it uses its own algorithm to determine optimal placement of cells. 
 </p>
- <br/>
 <p>
 Note that if you have custom fonts on your reader, the table may not be properly aligned. Also html2lrf does not support nested tables.
 </p>
- <br />
 <p style="page-break-after:always">
 On the next page you'll see a real life example taken from a Project Gutenberg text with no modifications. It shows off html2lrf's handling of rowspan and colspan.
 </p>
 <h3 align="center">Sample Complex Table of Contents</h3>
 <table summary="TOC">
 <tr><td colspan="3">&nbsp;</td><td align="right">PAGE</td></tr>
- <tr><td class="tocch" colspan="3"><a href="#PREFACE">Preface</a></td><td class="tocpn">v</td></tr>
- <tr><td class="tocch" colspan="3"><a href="#REFERENCE_WORKS">List of Works of Reference</a></td><td class="tocpn">vii</td></tr>
- <tr><td class="tocch" colspan="3"><a href="#LIST_OF_ILLUSTRATIONS">List of Illustrations</a></td><td class="tocpn">xi</td></tr>
- <tr><td class="tocch">Chapter</td><td class="tocchr">I.</td><td class="tocch"><a href="#CHAPTER_I">History of the Foundation</a></td><td class="tocpn">3</td></tr>
- <tr><td class="tocchr" colspan="2">II.</td><td class="tocch"><a href="#CHAPTER_II">Exterior of the Church</a></td><td class="tocpn">25</td></tr>
- <tr><td class="tocchr" colspan="2">III.</td><td class="tocch"><a href="#CHAPTER_III">Interior of the Church</a></td><td class="tocpn">33</td></tr>
- <tr><td class="tocchr" colspan="2">IV.</td><td class="tocch"><a href="#CHAPTER_IV">St. Bartholomew-the-Less and the Hospital</a></td><td class="tocpn">63</td></tr>
- <tr><td class="tocch">Appendix</td><td class="tocchr">I.</td><td class="tocch"><a href="#APPENDIX_I">The Priory Seals</a></td><td class="tocpn">73</td></tr>
- <tr><td class="tocchr" colspan="2">II.</td><td class="tocch"><a href="#APPENDIX_II">The Priors and Rectors</a></td><td class="tocpn">77</td></tr>
- <tr><td class="tocchr" colspan="2">III.</td><td class="tocch"><a href="#APPENDIX_III">Inventory of Vestments, etc.</a></td><td class="tocpn">79</td></tr>
- <tr><td class="tocchr" colspan="2">IV.</td><td class="tocch"><a href="#APPENDIX_IV">The Organ</a></td><td class="tocpn">80</td></tr>
- <tr><td class="tocch" colspan="3"><a href="#INDEX">Index</a></td><td class="tocpn">83</td></tr>
+ <tr><td class="tocch" colspan="3">Preface</td><td class="tocpn">v</td></tr>
+ <tr><td class="tocch" colspan="3">List of Works of Reference</td><td class="tocpn">vii</td></tr>
+ <tr><td class="tocch" colspan="3">List of Illustrations</td><td class="tocpn">xi</td></tr>
+ <tr><td class="tocch">Chapter</td><td class="tocchr">I.</td><td class="tocch">History of the Foundation</td><td class="tocpn">3</td></tr>
+ <tr><td class="tocchr" colspan="2">II.</td><td class="tocch">Exterior of the Church</td><td class="tocpn">25</td></tr>
+ <tr><td class="tocchr" colspan="2">III.</td><td class="tocch">Interior of the Church</td><td class="tocpn">33</td></tr>
+ <tr><td class="tocchr" colspan="2">IV.</td><td class="tocch">St. Bartholomew-the-Less and the Hospital</td><td class="tocpn">63</td></tr>
+ <tr><td class="tocch">Appendix</td><td class="tocchr">I.</td><td class="tocch">The Priory Seals</td><td class="tocpn">73</td></tr>
+ <tr><td class="tocchr" colspan="2">II.</td><td class="tocch">The Priors and Rectors</td><td class="tocpn">77</td></tr>
+ <tr><td class="tocchr" colspan="2">III.</td><td class="tocch">Inventory of Vestments, etc.</td><td class="tocpn">79</td></tr>
+ <tr><td class="tocchr" colspan="2">IV.</td><td class="tocch">The Organ</td><td class="tocpn">80</td></tr>
+ <tr><td class="tocch" colspan="3">Index</td><td class="tocpn">83</td></tr>
 </table>
 
 <p class='toc'>
@@ -120,8 +116,7 @@
 <hr/>
 <p style='text-indent:10em'>A very indented paragraph</p>
 <p style='text-indent:0em'>An unindented paragraph</p>
- <p>A default indented paragraph</p><br/>
- <hr/>
+ <p>A default indented paragraph</p>
 <p class='toc'>
 <hr />
 <a href='#toc'>Table of Contents</a>
--- a/src/libprs500/ebooks/lrf/html/table.py
+++ b/src/libprs500/ebooks/lrf/html/table.py
@@ -140,7 +140,7 @@ class Cell(object):
        ts = tb.textStyle.attrs
        default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
        parindent = self.pts_to_pixels(ts['parindent'])
-        
+        mwidth = 0
        for token, attrs in tokens(tb):
            font = default_font
            if isinstance(token, int): # Handle para and line breaks        
@@ -155,8 +155,10 @@ class Cell(object):
                continue
            word = token.split()
            word = word[0] if word else ""
-            width, height = font.getsize(word)            
-            return parindent + width + 2
+            width = font.getsize(word)[0]
+            if width > mwidth:
+                mwidth = width
+        return parindent + mwidth + 2
    
    def text_block_size(self, tb, maxwidth=sys.maxint, debug=False):
        ts = tb.textStyle.attrs
@@ -338,7 +340,7 @@ class Table(object):
                adjustable_columns.append(i)
                
        itercount = 0
-        min_widths = [self.minimum_width(i) for i in xrange(cols)]
+        min_widths = [self.minimum_width(i)+10 for i in xrange(cols)]
        while sum(widths) > maxwidth-((len(widths)-1)*self.colpad) and itercount < 100:
            for i in adjustable_columns:
                widths[i] = ceil((95./100.)*widths[i]) if \
--- a/src/libprs500/ebooks/lrf/pylrs/pylrs.py
+++ b/src/libprs500/ebooks/lrf/pylrs/pylrs.py
@@ -249,7 +249,7 @@ class LrsContainer(object):
        self.parent = None
        self.contents = []
        self.validChildren = validChildren
-        self.must_append = False
+        self.must_append = False #: If True even an empty container is appended by append_to
            
    def has_text(self):
        ''' Return True iff this container has non whitespace text '''
@@ -261,7 +261,7 @@ class LrsContainer(object):
                if child.has_text():
                    return True
        for item in self.contents:
-            if isinstance(item, (Plot, ImageBlock, Canvas)):
+            if isinstance(item, (Plot, ImageBlock, Canvas, CR)):
                return True
        return False
    
@@ -270,7 +270,7 @@ class LrsContainer(object):
        Append self to C{parent} iff self has non whitespace textual content        
        @type parent: LrsContainer
        '''
-        if self.has_text() or self.must_append:
+        if self.contents or self.must_append:
            parent.append(self)