Various improvements to html2lrf. Added a demo for html2lrf

2025-06-23 15:30:45 -04:00 · 2007-04-26 04:56:51 +00:00 · 2007-04-26 04:56:51 +00:00 · 42c4acd360
commit 42c4acd360
parent f25cc305a1
10 changed files with 307 additions and 86 deletions
--- a/.pydevproject
+++ b/.pydevproject
@ -5,6 +5,5 @@
 <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.5</pydev_property>
 <pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
 <path>/libprs500/src</path>
 <path>/libprs500/libprs500.lrf.txt</path>
 </pydev_pathproperty>
 </pydev_project>
--- a/src/libprs500/init.py
+++ b/src/libprs500/init.py
@ -33,7 +33,7 @@ You may have to adjust the GROUP and the location of the rules file to
 suit your distribution.
 """
-__version__   = "0.3.13"
+__version__   = "0.3.14"
 __docformat__ = "epytext"
 __author__    = "Kovid Goyal <kovid@kovidgoyal.net>"
--- a/src/libprs500/lrf/init.py
+++ b/src/libprs500/lrf/init.py
@ -29,6 +29,16 @@ __author__    = "Kovid Goyal <kovid@kovidgoyal.net>"
 class ConversionError(Exception):
    pass
 def get_text(elem):
    ''' Gather the text held by a pylrs element and by everything nested in it '''
    # The element's own text (when it has any) comes before its children's text
    own = elem.text if hasattr(elem, 'text') else ''
    # Elements without a contents attribute contribute no nested text
    children = elem.contents if hasattr(elem, 'contents') else ()
    return ''.join([own] + [get_text(kid) for kid in children])
 def option_parser(usage):
    parser = OptionParser(usage=usage, version='libprs500 '+VERSION)
    parser.add_option('--header', action='store_true', default=False, dest='header',
--- a/src/libprs500/lrf/html/convert_from.py
+++ b/src/libprs500/lrf/html/convert_from.py
@ -14,14 +14,13 @@
 ##    You should have received a copy of the GNU General Public License along
 ##    with this program; if not, write to the Free Software Foundation, Inc.,
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 """ 
 Code to convert HTML ebooks into LRF ebooks.
 I am indebted to esperanc for the CSS->Xylog Style conversion routines
 and to Falstaff for pylrs.
 """
-import os, re, sys, shutil
+import os, re, sys, shutil, traceback
 from htmlentitydefs import name2codepoint
 from urllib import urlopen
 from urlparse import urlparse
@ -31,9 +30,10 @@ from operator import itemgetter
 from libprs500.lrf.html.BeautifulSoup import BeautifulSoup, Comment, Tag, \
                                             NavigableString, Declaration
 from libprs500.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, TextBlock, \
-                                      ImageBlock, JumpButton, CharButton, Page, Bold
+                                      ImageBlock, JumpButton, CharButton, \
                                      Page, Bold, Space, Plot, TextStyle, Image
 from libprs500.lrf.pylrs.pylrs import Span as _Span
-from libprs500.lrf import ConversionError, option_parser, Book
+from libprs500.lrf import ConversionError, option_parser, Book, get_text
 from libprs500 import extract
 def ImagePage():
@ -155,6 +155,8 @@ class Span(_Span):
                ans = font_weight(val)
                if ans:
                    t['fontweight'] = ans
                    if ans > 140:
                        t['wordspace'] = '50'
            elif key.startswith("margin"):
                if key == "margin":
                    u = []
@ -181,19 +183,12 @@ class Span(_Span):
                    t["topskip"] = str(u[0])
                if u[1] is not None:
                    t["sidemargin"] = str(u[1])                
            elif key == "text-align" or key == "align":
                if val in ["right", "foot"]:
                    t["align"] = "foot"
                elif val == "center":
                    t["align"] = "center"
                else:
                    t["align"] = "head"
            else:
                print >>sys.stderr, 'Unhandled/malformed CSS key:', key, d[key]
        return t        
    def __init__(self, ns, css, font_delta=0):
-        src = ns.string
+        src = ns.string if hasattr(ns, 'string') else str(ns)
        src = re.sub(r'\s{2,}', ' ', src)  # Remove multiple spaces
        for pat, repl in Span.rules:
            src = pat.sub(repl, src)
@ -228,11 +223,35 @@ class HTMLConverter(object):
            i      = {"font-style"  :"italic"},
            em     = {"font-style"  :"italic"},
            small  = {'font-size'   :'small'},
            pre    = {'font-family' :'monospace' },
            center = {'text-align'  : 'center'}
            )
    processed_files = {} #: Files that have been processed
-    def __init__(self, book, path, font_delta=0, verbose=False, cover=None):
+    def __init__(self, book, path, width=575, height=747, 
                 font_delta=0, verbose=False, cover=None):
        '''
        Convert HTML file at C{path} and add it to C{book}. After creating
        the object, you must call L{self.process_links} on it to create the links and
        then L{self.writeto} to output the LRF/S file.
        @param book: The LRF book 
        @type book:  L{libprs500.lrf.pylrs.Book}
        @param path: path to the HTML file to process
        @type path:  C{str}
        @param width: Width of the device on which the LRF file is to be read
        @type width: C{int}
        @param height: Height of the device on which the LRF file is to be read
        @type height: C{int}
        @param font_delta: The amount in pts by which all fonts should be changed
        @type font_delta: C{int}
        @param verbose: Whether processing should be verbose or not
        @type verbose: C{bool}
        @param cover: Path to an image to use as the cover of this book
        @type cover: C{str}
        '''
        self.page_width = width   #: The width of the page
        self.page_height = height #: The height of the page
        self.images  = {}         #: Images referenced in the HTML document
        self.targets = {}         #: <a name=...> elements
        self.links   = []         #: <a href=...> elements        
@ -240,6 +259,7 @@ class HTMLConverter(object):
        self.links_processed = False #: Whether process_links has been called on this object
        self.font_delta = font_delta
        self.cover = cover
        self.in_ol = False #: Flag indicating we're in an <ol> element
        self.book = book #: The Book object representing a BBeB book        
        path = os.path.abspath(path)
        os.chdir(os.path.dirname(path))
@ -301,7 +321,8 @@ class HTMLConverter(object):
        def merge_parent_css(prop, pcss):
            temp = {}
            for key in pcss.keys():
-                if key.lower().startswith('font'):
+                chk = key.lower()
                if chk.startswith('font') or chk == 'text-align':
                    temp[key] = pcss[key]
            prop.update(temp)
@ -330,11 +351,11 @@ class HTMLConverter(object):
        self.top = self.current_block
        self.process_children(self.soup, {})
-        if self.current_para:
+        if self.current_para and get_text(self.current_para).strip():
            self.current_block.append(self.current_para)
-        if self.current_block:
+        if self.current_block and get_text(self.current_block).strip():
            self.current_page.append(self.current_block)
-        if self.current_page:
+        if self.current_page and get_text(self.current_page).strip():
            self.book.append(self.current_page)
@ -356,7 +377,7 @@ class HTMLConverter(object):
        cwd = os.getcwd()
        for link in self.links:
            purl = urlparse(link.tag['href'])
-            if purl[1]: # Not a local link
+            if purl[1]: # Not a link to a file on the local filesystem
                continue
            path, fragment = purl[2], purl[5]
            para, tag = link.para, link.tag
@ -366,6 +387,7 @@ class HTMLConverter(object):
                    jb = JumpButton(tb)
                    self.book.append(jb)
                    cb = CharButton(jb, text=self.get_text(tag))
                    para.contents = []
                    para.append(cb)
            else:                
                if not os.access(path, os.R_OK):
@ -378,7 +400,9 @@ class HTMLConverter(object):
                        self.files[path] = HTMLConverter(self.book, path, \
                                     font_delta=self.font_delta, verbose=self.verbose)
                        HTMLConverter.processed_files[path] = self.files[path]
-                    except:
+                    except Exception, e:
                        print >>sys.stderr, 'Unable to process', path
                        traceback.print_exc()
                        continue
                    finally:
                        os.chdir(cwd)
@ -392,6 +416,7 @@ class HTMLConverter(object):
                jb = JumpButton(tb)                
                self.book.append(jb)
                cb = CharButton(jb, text=self.get_text(tag))
                para.contents = []
                para.append(cb)                
        self.links_processed = True        
@ -411,13 +436,13 @@ class HTMLConverter(object):
        End the current page, ensuring that any further content is displayed
        on a new page.
        """
-        if self.current_para.contents:
+        if get_text(self.current_para).strip():
            self.current_block.append(self.current_para)
            self.current_para = Paragraph()
-        if self.current_block.contents:
+        if get_text(self.current_block).strip():
            self.current_page.append(self.current_block)
            self.current_block = TextBlock()
-        if self.current_page.contents: 
+        if get_text(self.current_page).strip(): 
            self.book.append(self.current_page)
            self.current_page = Page()
@ -442,8 +467,33 @@ class HTMLConverter(object):
                self.add_text(c, pcss)
    def add_text(self, tag, css):
        '''
        Add text to the current paragraph taking CSS into account.
        @param tag: Either a BeautifulSoup tag or a string
        @param css:
        @type css:
        '''
        src = tag.string if hasattr(tag, 'string') else str(tag)
        if not src.strip():
            self.current_para.append(' ')
        else:
            align = 'head'
            if css.has_key('text-align'):
                val = css['text-align']                
                if val in ["right", "foot"]:
                    align = "foot"
                elif val == "center":
                    align = "center"
                css.pop('text-align')
            if align != self.current_block.textStyle.attrs['align']:
                if get_text(self.current_para).strip():
                    self.current_block.append(self.current_para)
                if get_text(self.current_block).strip():
                    self.current_page.append(self.current_block)
                self.current_block = TextBlock(TextStyle(align=align))
                self.current_para = Paragraph()
            try:
-            self.current_para.append(Span(tag, self.sanctify_css(css), \
+                self.current_para.append(Span(src, self.sanctify_css(css), \
                                              font_delta=self.font_delta))
            except ConversionError, err:
                if self.verbose:
@ -461,22 +511,21 @@ class HTMLConverter(object):
                css.pop(key)
        return css
    def end_current_para(self):
        ''' 
        End current paragraph with a paragraph break after it. If the current
        paragraph has no non whitespace text in it do nothing.
        '''
        if not get_text(self.current_para).strip():
            return
        if self.current_para.contents:
            self.current_block.append(self.current_para)
            self.current_para = Paragraph()
        if self.current_block.contents and \
            not isinstance(self.current_block.contents[-1], CR):
            self.current_block.append(CR())
    def parse_tag(self, tag, parent_css):
        def process_text_tag(tag, tag_css):
            if 'page-break-before' in tag_css.keys():
                if tag_css['page-break-before'].lower() != 'avoid':
                    self.end_page()
                tag_css.pop('page-break-before')
            end_page = False
            if 'page-break-after' in tag_css.keys():
                end_page = True
                tag_css.pop('page-break-after')
            self.process_children(tag, tag_css)
            if end_page:
                self.end_page()
        try:
            tagname = tag.name.lower()
        except AttributeError:
@ -488,17 +537,47 @@ class HTMLConverter(object):
                return
        except KeyError:
            pass
        if 'page-break-before' in tag_css.keys():
            if tag_css['page-break-before'].lower() != 'avoid':
                self.end_page()
            tag_css.pop('page-break-before')
        end_page = False
        if 'page-break-after' in tag_css.keys() and \
           tag_css['page-break-after'].lower() != 'avoid':
            end_page = True
            tag_css.pop('page-break-after')
        if tagname in ["title", "script", "meta", 'del']:            
            pass
        elif tagname == 'a':
            if tag.has_key('name'):
                if get_text(self.current_para).strip():
                    self.current_block.append(self.current_para)
                if get_text(self.current_block).strip():
                    self.current_page.append(self.current_block)
                previous = self.current_block
                tb = TextBlock()
                self.current_block = tb
                self.current_para = Paragraph()
                self.targets[tag['name']] = tb
                self.process_children(tag, tag_css)
                if tb.parent == None:
                    if self.current_block == tb:
                        if get_text(self.current_para):
                            self.current_block.append(self.current_para)
                            self.current_para = Paragraph()
                        self.current_page.append(self.current_block)
                        self.current_block = TextBlock()
                    else:
                        found, marked = False, False
                        for item in self.current_page.contents:
                            if item == previous:
                                found = True
                            if found and isinstance(item, TextBlock):
                                self.targets[tag['name']] = item
                                marked = True
                        if not marked:
                            self.current_page.append(tb)
            elif tag.has_key('href'):
                purl = urlparse(tag['href'])
                path = purl[2]
@ -506,19 +585,18 @@ class HTMLConverter(object):
                    ['png', 'jpg', 'bmp', 'jpeg']:
                    self.add_image_page(path)
                else:
-                    span = _Span()
+                    self.add_text('Link: '+tag['href'], tag_css)
-                    self.current_para.append(span)
+                    self.links.append(HTMLConverter.Link(self.current_para.contents[-1], tag))
                    self.links.append(HTMLConverter.Link(span, tag))
        elif tagname == 'img':
            if tag.has_key('src') and os.access(tag['src'], os.R_OK):
-                width, height = 600, 800
+                width, height = self.page_width, self.page_height
                try:
                    try:
-                        from PIL import Image
+                        from PIL import Image as PILImage
                    except:
                        pass
                    else:
-                        im = Image.open(tag['src'])
+                        im = PILImage.open(tag['src'])
                        width, height = im.size
                    if tag.has_key('width'):
                        width = int(tag['width'])
@ -526,13 +604,23 @@ class HTMLConverter(object):
                        height = int(tag['height'])
                except:
                    pass
                path = os.path.abspath(tag['src'])
                if not self.images.has_key(path):
                    self.images[path] = ImageStream(path)
                if max(width, height) <= min(self.page_width, self.page_height)/5.:
                    im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\
                               xsize=width, ysize=height)
                    self.current_para.append(Plot(im, xsize=width*10, ysize=width*10))
                elif max(width, height) <= min(self.page_width, self.page_height)/2.:
                    self.end_current_para()
                    im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\
                               xsize=width, ysize=height)
                    self.current_para.append(Plot(im, xsize=width*10, ysize=width*10))
                else:
                    self.current_block.append(self.current_para)
                    self.current_page.append(self.current_block)
                    self.current_para = Paragraph()
                    self.current_block = TextBlock()
                path = os.path.abspath(tag['src'])
                if not self.images.has_key(path):
                    self.images[path] = ImageStream(path)
                    im = ImageBlock(self.images[path], x1=width, y1=height, 
                                    xsize=width, ysize=height)
                    self.current_page.append(im)                        
@ -557,31 +645,73 @@ class HTMLConverter(object):
                    f.close()
                except IOError:
                    pass
-        elif tagname in ['p', 'div', 'ul', 'ol', 'tr', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+        elif tagname == 'pre':
-            # TODO: Implement ol
+            self.end_current_para()
-            indent = tag_css.pop('text-indent', '')
+            src = ''.join([str(i) for i in tag.contents])
-            if indent:
+            lines = src.split('\n')
-                # TODO: If indent is different from current textblock's parindent
+            for line in lines:
-                # start a new TextBlock
+                try:
                    self.current_para.append(Span(line, tag_css))
                except ConversionError:
                    pass
-            self.current_para.CR() # Put a paragraph end             
+                self.current_para.CR()
        elif tagname in ['ul', 'ol']:
            self.in_ol = 1 if tagname == 'ol' else 0
            self.end_current_para()
            self.process_children(tag, tag_css)
            self.in_ol = 0
            self.end_current_para()
        elif tagname == 'li':
            prepend = str(self.in_ol)+'. ' if self.in_ol else u'\u2022' + ' '
            if get_text(self.current_para).strip():
                self.current_para.append(CR())
                self.current_block.append(self.current_para)
            self.current_para = Paragraph()
-            process_text_tag(tag, tag_css)
+            self.current_para.append(Space(xsize=100))
            self.current_para.append(prepend)
            self.process_children(tag, tag_css)
            if self.in_ol:
                self.in_ol += 1
        elif tagname in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            self.end_current_para()
            if self.current_block.contents:
                self.current_block.append(CR())
            self.process_children(tag, tag_css)
            self.end_current_para()
            self.current_block.append(CR())
        elif tagname in ['p', 'div']:
            # TODO: Implement ol
            #indent = tag_css.pop('text-indent', '')
            #if indent:
                # TODO: If indent is different from current textblock's parindent
                # start a new TextBlock
                #pass
            self.end_current_para()
            self.process_children(tag, tag_css)
            self.end_current_para()
        elif tagname in ['b', 'strong', 'i', 'em', 'span']:
-            process_text_tag(tag, tag_css)
+            self.process_children(tag, tag_css)
        elif tagname == 'font':
            if tag.has_key('face'):
                tag_css['font-family'] = tag['face']
-            process_text_tag(tag, tag_css)
+            self.process_children(tag, tag_css)
-        elif tagname == 'br':
+        elif tagname in ['br', 'tr']:
            self.current_para.append(CR())
            self.process_children(tag, tag_css)
        elif tagname == 'hr':
-            self.current_para.append(CR())
+            if self.current_para.contents:
-            # TODO: Horizontal line?
+                self.current_block.append(self.current_para)
                self.current_para = Paragraph()
            self.current_block.append(CR())
            self.current_page.append(self.current_block)
            self.current_block = TextBlock()            
            self.current_page.RuledLine(linelength=self.page_width)
        else:            
            self.process_children(tag, tag_css)
        if end_page:
                self.end_page()
    def writeto(self, path, lrs=False):
        self.book.renderLrs(path) if lrs else self.book.renderLrf(path)
@ -632,6 +762,7 @@ def process_file(path, options):
            suffix = '.lrs' if options.lrs else '.lrf'
            name = os.path.splitext(os.path.basename(path))[0] + suffix
            oname = os.path.join(cwd,name)
        oname = os.path.abspath(os.path.expanduser(oname))
        conv.writeto(oname, lrs=options.lrs)
        print 'Output written to', oname
    finally:
@ -692,7 +823,7 @@ def console_query(dirpath, candidate, docs):
 def get_path(path, query=console_query):
-    path = os.path.abspath(path)
+    path = os.path.abspath(os.path.expanduser(path))
    ext = os.path.splitext(path)[1][1:].lower()
    if ext in ['htm', 'html', 'xhtml']:
        return None, path
--- a/src/libprs500/lrf/html/demo/demo.html
+++ b/src/libprs500/lrf/html/demo/demo.html
@ -0,0 +1,73 @@
 <html>
  <h1>Demo of <span style='font-family:monospace'>html2lrf</span></h1>
  <p>
  This file contains a demonstration of the capabilities of   <span style='font-family:monospace'>html2lrf,</span>   the HTML to LRF converter   from <em>libprs500.</em> To obtain libprs500 visit  <span style='font:sans-serif'>https://libprs500.kovidgoyal.net</span>
  </p>
  <h2><a name='toc'>Table of Contents</a></h2>
  <ul style='page-break-after:always'>
    <li><a href='#lists'>Demonstration of Lists</a></li>
    <li><a href='#text'>Text formatting and ruled lines</a></li>
    <li><a href='#images'>Inline images</a></li>
    <li><a href='#recursive'>Recursive link following</a></li>
     <li><a href='demo_ext.html'>The HTML used to create this file</a></li>
 </ul>
 <h2><a name='lists'>Lists</a></h2>
 <p><h3>Unordered lists</h3>
 <ul>
   <li>Item 1</li>
   <li>Item 2</li>
 </ul>
 </p>
 <p><h3>Ordered lists</h3>
 <ol>
   <li>Item 1</li>
   <li>Item 2</li>
 </ol>
 </p>
 <br/>
 <p>
 Note that nested lists are not supported.
 </p>
 <p style='page-break-after:always'>
 <hr />
 <a href='#toc'>Table of Contents</a>
 </p>
 <h2><a name='text'>Text formatting</a></h2>
 <p>
 A simple <i>paragraph</i> of <b>formatted 
 <i>text</i></b> with a ruled line following it.
 </p>
 <hr/>
 <p>  A 
 <span style='font-style:italic'>similar</span> 
 paragraph, but now using 
 <span style='font-weight:bold'>CSS</span> 
 to perform the text formatting.</p>
 <hr/>
 <center>A centered phrase</center>
 <span style='text-align:right'>A right aligned phrase</span>
 A normal phrase
 <p style='page-break-after:always'>
 <hr />
 <a href='#toc'>Table of Contents</a>
 </p>
 <h2 style='page-break-before:always'><a name='images'>Inline images</a></h2>
 <p>
 Here I demonstrate the use of inline images in the midst of text. Here is a  small image <img src='small.jpg' /> embedded in a sentence. Now we have a  slightly larger image that is automatically put in its own block  <img src='medium.jpg' /> and finally we have a large image which is  automatically placed on a page by itself and prevented from being  autoscaled when the user changes from S to M to L. Try changing sizes  and see how the different embedding styles behave.  <img src='large.jpg' />
 </p>
 <p style='page-break-after:always'>
 <hr />
 <a href='#toc'>Table of Contents</a>
 </p>
 <h2 style='page-break-before:always'><a name='recursive'>Recursive link following</a></h2>
 <span style='font:monospace'>html2lrf</span> follows links in HTML files that point to other files, recursively. Thus it can be used to convert a whole tree of HTML files into a single LRF file.
 <p style='page-break-after:always'>
 <hr />
 <a href='#toc'>Table of Contents</a>
 </p>
 </html>
--- a/src/libprs500/lrf/html/demo/large.jpg
+++ b/src/libprs500/lrf/html/demo/large.jpg
--- a/src/libprs500/lrf/html/demo/medium.jpg
+++ b/src/libprs500/lrf/html/demo/medium.jpg
--- a/src/libprs500/lrf/html/demo/small.jpg
+++ b/src/libprs500/lrf/html/demo/small.jpg
--- a/src/libprs500/lrf/txt/convert_from.py
+++ b/src/libprs500/lrf/txt/convert_from.py
@ -40,7 +40,7 @@ def main():
    if len(args) != 1:
        parser.print_help()
        sys.exit(1)
-    src = args[0]
+    src = os.path.abspath(os.path.expanduser(args[0]))
    if options.title == None:
        options.title = os.path.splitext(os.path.basename(src))[0]
    try:
@ -78,6 +78,7 @@ def convert_txt(path, options):
            buffer = ''
    basename = os.path.basename(path)
    oname = options.output
    oname = os.path.abspath(os.path.expanduser(oname))
    if not oname:
        oname = os.path.splitext(basename)[0]+'.lrf'
    try: 
--- a/7
+++ b/7
@ -7,6 +7,13 @@ DOWNLOADS=$PREFIX/httpdocs/downloads
 DOCS=$PREFIX/httpdocs/apidocs
 exe=`cd dist && ls -1 libprs500-*.exe | tail -n1 && cd ..`
 echo "<h2>The HTML</h2><pre>" > src/libprs500/lrf/html/demo/demo_ext.html
 cat src/libprs500/lrf/html/demo/demo.html >> src/libprs500/lrf/html/demo/demo_ext.html
 echo '</pre>' >> src/libprs500/lrf/html/demo/demo_ext.html
 html2lrf --title='Demonstration of html2lrf' --author='Kovid Goyal' --header --output=/tmp/html2lrf.lrf src/libprs500/lrf/html/demo/demo.html
 scp /tmp/html2lrf.lrf castalia:$DOWNLOADS/
 ssh castalia rm -f $DOWNLOADS/libprs500\*.exe
 scp dist/$exe castalia:$DOWNLOADS/
 ssh castalia chmod a+r $DOWNLOADS/\*