From d6c08b7da336a7f1399261b01be3f37f49d9a3e2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 21 Aug 2007 23:33:33 +0000
Subject: [PATCH] Refinements for wasteland

---
 src/libprs500/ebooks/lrf/html/convert_from.py | 86 +++++++++----------
 src/libprs500/ebooks/lrf/html/demo/demo.html  | 35 ++++----
 src/libprs500/ebooks/lrf/html/table.py        | 10 ++-
 src/libprs500/ebooks/lrf/pylrs/pylrs.py       |  6 +-
 4 files changed, 63 insertions(+), 74 deletions(-)

diff --git a/src/libprs500/ebooks/lrf/html/convert_from.py b/src/libprs500/ebooks/lrf/html/convert_from.py
index d8cee34e2d..765ff8eed5 100644
--- a/src/libprs500/ebooks/lrf/html/convert_from.py
+++ b/src/libprs500/ebooks/lrf/html/convert_from.py
@@ -293,12 +293,13 @@ class HTMLConverter(object):
         if self.pseudo_css.has_key(tagname):
             pprop.update(self.pseudo_css[tagname])
         if tag.has_key("class"):
-            cls = tag["class"].lower()            
-            for classname in ["."+cls, tagname+"."+cls]:
-                if self.css.has_key(classname):
-                    prop.update(self.css[classname])
-                if self.pseudo_css.has_key(classname):
-                    pprop.update(self.pseudo_css[classname])
+            cls = tag["class"].lower()
+            for cls in cls.split():            
+                for classname in ["."+cls, tagname+"."+cls]:
+                    if self.css.has_key(classname):
+                        prop.update(self.css[classname])
+                    if self.pseudo_css.has_key(classname):
+                        pprop.update(self.pseudo_css[classname])
         if tag.has_key("style"):
             prop.update(self.parse_style_properties(tag["style"]))
         return prop, pprop
@@ -625,7 +626,6 @@ class HTMLConverter(object):
                     unneeded.append(prop)
             for prop in unneeded:
                 fp.pop(prop)
-                
             elem = Span(text=src, **fp) if (fp or force_span_use) else src
             self.current_para.append(elem)
         
@@ -651,24 +651,25 @@ class HTMLConverter(object):
         
     def end_current_para(self):
         ''' 
-        End current paragraph with a paragraph break after it. If the current
-        paragraph has no non whitespace text in it do nothing.
+        End current paragraph with a paragraph break after it. 
+        '''
+        if self.current_para.contents:
+            self.current_block.append(self.current_para)
+        self.current_block.append(CR())
+        self.current_para = Paragraph()
+            
+    def end_current_block(self):
+        '''
+        End current TextBlock. Create new TextBlock with the same styles.
         '''
-        if not self.current_para.has_text():
-            return
         if self.current_para.contents:
             self.current_block.append(self.current_para)
             self.current_para = Paragraph()
-        if self.current_block.contents and \
-            not isinstance(self.current_block.contents[-1], CR):
-            self.current_block.append(CR())
-            
-    def end_current_block(self):
-        self.current_para.append_to(self.current_block)
-        self.current_block.append_to(self.current_page)
-        self.current_para = Paragraph()
-        self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle,
+        if self.current_block.contents or self.current_block.must_append:
+            self.current_page.append(self.current_block)
+            self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle,
                                                          blockStyle=self.current_block.blockStyle)
+        
     
     def process_image(self, path, tag_css, width=None, height=None, dropcaps=False):
         original_path = path
@@ -1033,7 +1034,7 @@ class HTMLConverter(object):
         return fp
         
     
-    def process_block(self, tag, tag_css, tkey):        
+    def process_block(self, tag, tag_css):        
         ''' Ensure padding and text-indent properties are respected '''
         text_properties = self.text_properties(tag_css)
         block_properties = self.block_properties(tag_css)
@@ -1057,7 +1058,8 @@ class HTMLConverter(object):
                 self.block_styles.append(bs)
             self.current_block = self.book.create_text_block(blockStyle=bs,
                                                              textStyle=ts)
-            self.targets[tkey] = self.current_block
+            return True
+        return False            
     
     def parse_tag(self, tag, parent_css):
         try:
@@ -1298,20 +1300,13 @@ class HTMLConverter(object):
             self.current_para.append(elem(text))
                                 
         elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
-            tkey = None
-            if self.anchor_ids and tag.has_key('id'):                
-                target = self.book.create_text_block(textStyle=self.current_block.textStyle,
-                                                     blockStyle=self.current_block.blockStyle)
+            new_block = self.process_block(tag, tag_css)
+            if self.anchor_ids and tag.has_key('id'):
                 tkey = self.target_prefix+tag['id']
-                self.targets[tkey] = target
-                
-                if len(self.current_block.contents) > 2:
+                if not new_block:
                     self.end_current_block()
-                    self.current_page.append(target)
-                    self.unused_target_blocks.append(target)
-                else:
-                    self.targets[tkey] = self.current_block
-                    self.current_block.must_append = True
+                self.current_block.must_append = True
+                self.targets[tkey] = self.current_block
             src = self.get_text(tag, limit=1000)
             if not self.disable_chapter_detection and tagname.startswith('h'):
                 if self.chapter_regex.search(src):
@@ -1320,18 +1315,16 @@ class HTMLConverter(object):
                     self.page_break_found = True
             if not tag.contents:
                 self.current_block.append(CR())
-                self.current_block.must_append = True
                 return
-            self.process_block(tag, tag_css, tkey)
-            if self.current_para.contents:
-                self.current_block.append(self.current_para)            
+            
+            if self.current_para.has_text():
+                self.current_para.append_to(self.current_block)
             if self.current_block.contents:
                 self.current_block.append(CR())
             self.previous_text = '\n'
             self.current_para = Paragraph()
-            
             self.process_children(tag, tag_css, tag_pseudo_css)
-            if self.current_para.contents:
+            if self.current_para.contents :
                 self.current_block.append(self.current_para)
             self.current_para = Paragraph()
             if tagname.startswith('h') or self.blank_after_para:
@@ -1346,25 +1339,24 @@ class HTMLConverter(object):
             self.line_break()
             self.previous_text = '\n'
         elif tagname in ['hr', 'tr']: # tr needed for nested tables
-            self.end_current_para()
-            self.line_break()
             self.end_current_block()
             if tagname == 'hr':
                 self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth']))
             self.previous_text = '\n'
             self.process_children(tag, tag_css, tag_pseudo_css)
         elif tagname == 'td': # Needed for nested tables
-            self.current_para.append(' ')
-            self.previous_text = ' '
+            if not self.in_table:
+                self.current_para.append(' ')
+                self.previous_text = ' '
             self.process_children(tag, tag_css, tag_pseudo_css)
         elif tagname == 'table' and not self.ignore_tables and not self.in_table:
             tag_css = self.tag_css(tag)[0] # Table should not inherit CSS
             try:
                 self.process_table(tag, tag_css)
             except Exception, err:
-                self.logger.warning('An error occurred while processing a table: %s', str(err))
+                self.logger.warning('An error occurred while processing a table: %s. Ignoring table markup.', str(err))
                 self.logger.debug('', exc_info=True)
-                self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300])
+                self.logger.debug('Bad table:\n%s', str(tag)[:300])
                 self.in_table = False
                 self.process_children(tag, tag_css, tag_pseudo_css)
             finally:                
@@ -1468,7 +1460,7 @@ def process_file(path, options, logger=None):
             fheader = re.sub(r'%%a','%a',fheader)
             fheader = re.sub(r'%%t','%t',fheader)                
             header.append(fheader + "  ")            
-        book, fonts = Book(options, header=header, **args)
+        book, fonts = Book(options, logger, header=header, **args)
         le = re.compile(options.link_exclude) if options.link_exclude else \
              re.compile('$')
         pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \
diff --git a/src/libprs500/ebooks/lrf/html/demo/demo.html b/src/libprs500/ebooks/lrf/html/demo/demo.html
index d0142ce120..3085c6a6fe 100644
--- a/src/libprs500/ebooks/lrf/html/demo/demo.html
+++ b/src/libprs500/ebooks/lrf/html/demo/demo.html
@@ -14,7 +14,7 @@
   This file contains a demonstration of the capabilities of <span style='font-family:monospace'>html2lrf</span>, the HTML to LRF converter   from <em>libprs500.</em> To obtain libprs500 visit<br/><span style='font:sans-serif'>https://libprs500.kovidgoyal.net</span>
   </p>
   <br/>
-  <h2><a name='toc'>Table of Contents</a></h2>
+  <h2 id="toc">Table of Contents</h2>
   <ul style='page-break-after:always'>
     <li><a href='#lists'>Lists</a></li>
     <li><a href='#tables'>Tables</a></li>
@@ -25,7 +25,7 @@
     <li><a href='#recursive'>Recursive link following</a></li>
  </ul>
 
- <h2><a name='lists'>Lists</a></h2>
+ <h2 id="lists">Lists</h2>
  
  <h3>Nested lists</h3>
  <ol>
@@ -54,8 +54,6 @@
  </p>
 
  <h2><a name='tables'>Tables</a></h2>
- <br/>
-
  <table>
  <tr><td colspan=4><h3 style="text-align:center">A matrix</h3></td></tr>
  <tr><td></td><td style="text-align:center"><b>Column 1</b></td><td style="text-align:center"><b>Column 2</b></td><td style="text-align:center"><b>Column 3</b></td></tr>
@@ -67,29 +65,27 @@
  <p>
  html2lrf supports both rowspan and colspan, but no other HTML table attributes, as it uses its own algorithm to determine optimal placement of cells. 
  </p>
- <br/>
  <p>
  Note that if you have custom fonts on your reader, the table may not be properly aligned. Also html2lrf does not support nested tables.
  </p>
- <br />
  <p style="page-break-after:always">
  On the next page you'll see a real life example taken from a Project Gutenberg text with no modifications. It shows off html2lrf's handling of rowspan and colspan.
  </p>
  <h3 align="center">Sample Complex Table of Contents</h3>
  <table summary="TOC">
  <tr><td colspan="3">&nbsp;</td><td align="right">PAGE</td></tr>
- <tr><td class="tocch" colspan="3"><a href="#PREFACE">Preface</a></td><td class="tocpn">v</td></tr>
- <tr><td class="tocch" colspan="3"><a href="#REFERENCE_WORKS">List of Works of Reference</a></td><td class="tocpn">vii</td></tr>
- <tr><td class="tocch" colspan="3"><a href="#LIST_OF_ILLUSTRATIONS">List of Illustrations</a></td><td class="tocpn">xi</td></tr>
- <tr><td class="tocch">Chapter</td><td class="tocchr">I.</td><td class="tocch"><a href="#CHAPTER_I">History of the Foundation</a></td><td class="tocpn">3</td></tr>
- <tr><td class="tocchr" colspan="2">II.</td><td class="tocch"><a href="#CHAPTER_II">Exterior of the Church</a></td><td class="tocpn">25</td></tr>
- <tr><td class="tocchr" colspan="2">III.</td><td class="tocch"><a href="#CHAPTER_III">Interior of the Church</a></td><td class="tocpn">33</td></tr>
- <tr><td class="tocchr" colspan="2">IV.</td><td class="tocch"><a href="#CHAPTER_IV">St. Bartholomew-the-Less and the Hospital</a></td><td class="tocpn">63</td></tr>
- <tr><td class="tocch">Appendix</td><td class="tocchr">I.</td><td class="tocch"><a href="#APPENDIX_I">The Priory Seals</a></td><td class="tocpn">73</td></tr>
- <tr><td class="tocchr" colspan="2">II.</td><td class="tocch"><a href="#APPENDIX_II">The Priors and Rectors</a></td><td class="tocpn">77</td></tr>
- <tr><td class="tocchr" colspan="2">III.</td><td class="tocch"><a href="#APPENDIX_III">Inventory of Vestments, etc.</a></td><td class="tocpn">79</td></tr>
- <tr><td class="tocchr" colspan="2">IV.</td><td class="tocch"><a href="#APPENDIX_IV">The Organ</a></td><td class="tocpn">80</td></tr>
- <tr><td class="tocch" colspan="3"><a href="#INDEX">Index</a></td><td class="tocpn">83</td></tr>
+ <tr><td class="tocch" colspan="3">Preface</td><td class="tocpn">v</td></tr>
+ <tr><td class="tocch" colspan="3">List of Works of Reference</td><td class="tocpn">vii</td></tr>
+ <tr><td class="tocch" colspan="3">List of Illustrations</td><td class="tocpn">xi</td></tr>
+ <tr><td class="tocch">Chapter</td><td class="tocchr">I.</td><td class="tocch">History of the Foundation</td><td class="tocpn">3</td></tr>
+ <tr><td class="tocchr" colspan="2">II.</td><td class="tocch">Exterior of the Church</td><td class="tocpn">25</td></tr>
+ <tr><td class="tocchr" colspan="2">III.</td><td class="tocch">Interior of the Church</td><td class="tocpn">33</td></tr>
+ <tr><td class="tocchr" colspan="2">IV.</td><td class="tocch">St. Bartholomew-the-Less and the Hospital</td><td class="tocpn">63</td></tr>
+ <tr><td class="tocch">Appendix</td><td class="tocchr">I.</td><td class="tocch">The Priory Seals</td><td class="tocpn">73</td></tr>
+ <tr><td class="tocchr" colspan="2">II.</td><td class="tocch">The Priors and Rectors</td><td class="tocpn">77</td></tr>
+ <tr><td class="tocchr" colspan="2">III.</td><td class="tocch">Inventory of Vestments, etc.</td><td class="tocpn">79</td></tr>
+ <tr><td class="tocchr" colspan="2">IV.</td><td class="tocch">The Organ</td><td class="tocpn">80</td></tr>
+ <tr><td class="tocch" colspan="3">Index</td><td class="tocpn">83</td></tr>
  </table>
  
  <p class='toc'>
@@ -120,8 +116,7 @@
  <hr/>
  <p style='text-indent:10em'>A very indented paragraph</p>
  <p style='text-indent:0em'>An unindented paragraph</p>
- <p>A default indented paragraph</p><br/>
- <hr/>
+ <p>A default indented paragraph</p>
  <p class='toc'>
  <hr />
  <a href='#toc'>Table of Contents</a>
diff --git a/src/libprs500/ebooks/lrf/html/table.py b/src/libprs500/ebooks/lrf/html/table.py
index c63b1dfeba..f21dc6f3b6 100644
--- a/src/libprs500/ebooks/lrf/html/table.py
+++ b/src/libprs500/ebooks/lrf/html/table.py
@@ -140,7 +140,7 @@ class Cell(object):
         ts = tb.textStyle.attrs
         default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
         parindent = self.pts_to_pixels(ts['parindent'])
-        
+        mwidth = 0
         for token, attrs in tokens(tb):
             font = default_font
             if isinstance(token, int): # Handle para and line breaks        
@@ -155,8 +155,10 @@ class Cell(object):
                 continue
             word = token.split()
             word = word[0] if word else ""
-            width, height = font.getsize(word)            
-            return parindent + width + 2
+            width = font.getsize(word)[0]
+            if width > mwidth:
+                mwidth = width
+        return parindent + mwidth + 2
     
     def text_block_size(self, tb, maxwidth=sys.maxint, debug=False):
         ts = tb.textStyle.attrs
@@ -338,7 +340,7 @@ class Table(object):
                 adjustable_columns.append(i)
                 
         itercount = 0
-        min_widths = [self.minimum_width(i) for i in xrange(cols)]
+        min_widths = [self.minimum_width(i)+10 for i in xrange(cols)]
         while sum(widths) > maxwidth-((len(widths)-1)*self.colpad) and itercount < 100:
             for i in adjustable_columns:
                 widths[i] = ceil((95./100.)*widths[i]) if \
diff --git a/src/libprs500/ebooks/lrf/pylrs/pylrs.py b/src/libprs500/ebooks/lrf/pylrs/pylrs.py
index a3e05acd0e..9f7302e8c1 100644
--- a/src/libprs500/ebooks/lrf/pylrs/pylrs.py
+++ b/src/libprs500/ebooks/lrf/pylrs/pylrs.py
@@ -249,7 +249,7 @@ class LrsContainer(object):
         self.parent = None
         self.contents = []
         self.validChildren = validChildren
-        self.must_append = False
+        self.must_append = False #: If True, even an empty container is appended by append_to
             
     def has_text(self):
         ''' Return True iff this container has non whitespace text '''
@@ -261,7 +261,7 @@ class LrsContainer(object):
                 if child.has_text():
                     return True
         for item in self.contents:
-            if isinstance(item, (Plot, ImageBlock, Canvas)):
+            if isinstance(item, (Plot, ImageBlock, Canvas, CR)):
                 return True
         return False
     
@@ -270,7 +270,7 @@ class LrsContainer(object):
         Append self to C{parent} iff self has non whitespace textual content        
         @type parent: LrsContainer
         '''
-        if self.has_text() or self.must_append:
+        if self.contents or self.must_append:
             parent.append(self)