Rationalized CLI of html2lrf

Fixed link handling to show text rather than href by default. Fine-tuned image handling. Added automatic page breaks if no page-break is found.
2025-07-09 03:04:10 -04:00 · 2007-05-12 21:21:21 +00:00 · 2007-05-12 21:21:21 +00:00 · a29bf8eea0
commit a29bf8eea0
parent 69f20f634d
5 changed files with 194 additions and 123 deletions
--- a/src/libprs500/init.py
+++ b/src/libprs500/init.py
@ -33,7 +33,7 @@ You may have to adjust the GROUP and the location of the rules file to
 suit your distribution.
 """

-__version__   = "0.3.25"
+__version__   = "0.3.26"
 __docformat__ = "epytext"
 __author__    = "Kovid Goyal <kovid@kovidgoyal.net>"

--- a/src/libprs500/lrf/init.py
+++ b/src/libprs500/lrf/init.py
@ -17,7 +17,7 @@ This package contains logic to read and write LRF files. The LRF file format is
 At the time fo writing, this package only supports reading and writing LRF meat information. See L{meta}.
 """

-from optparse import OptionParser
+from optparse import OptionParser, OptionValueError

 from libprs500.lrf.pylrs.pylrs import Book as _Book
 from libprs500.lrf.pylrs.pylrs import TextBlock, Header, PutObj, Paragraph, TextStyle
@ -26,31 +26,53 @@ from libprs500 import __version__ as VERSION
 __docformat__ = "epytext"
 __author__    = "Kovid Goyal <kovid@kovidgoyal.net>"

+class PRS500_PROFILE(object):
+    screen_width  = 600
+    screen_height = 800
+    page_width    = 575
+    page_height   = 747
+    dpi           = 166
+    
+def profile_from_string(option, opt_str, value, parser):
+    if value == 'prs500':
+        setattr(parser.values, option.dest, PRS500_PROFILE)
+    else:
+        raise OptionValueError('Profile: '+value+' is not implemented')
+    
 class ConversionError(Exception):
    pass

 def option_parser(usage):
-    parser = OptionParser(usage=usage, version='libprs500 '+VERSION)
-    parser.add_option('--header', action='store_true', default=False, dest='header',
+    parser = OptionParser(usage=usage, version='libprs500 '+VERSION,
+                          epilog='html2lrf created by Kovid Goyal')
+    metadata = parser.add_option_group('METADATA OPTIONS')
+    metadata.add_option('--header', action='store_true', default=False, dest='header',
                      help='Add a header to all the pages with title and author.')
-    parser.add_option("-t", "--title", action="store", type="string", \
-                    dest="title", help="Set the title")
-    parser.add_option("-a", "--author", action="store", type="string", \
-                    dest="author", help="Set the author", default='Unknown')
-    parser.add_option("--freetext", action="store", type="string", \
-                    dest="freetext", help="Set the comments in the metadata", default='  ')
-    parser.add_option("--category", action="store", type="string", \
+    metadata.add_option("-t", "--title", action="store", type="string", \
+                    dest="title", help="Set the title. Default: filename.")
+    metadata.add_option("-a", "--author", action="store", type="string", \
+                    dest="author", help="Set the author. Default: %default", default='Unknown')
+    metadata.add_option("--freetext", action="store", type="string", \
+                    dest="freetext", help="Set the comments.", default='  ')
+    metadata.add_option("--category", action="store", type="string", \
                    dest="category", help="Set the category", default='  ')    
+    metadata.add_option('--title-sort', action='store', default='', dest='title_sort',
+                      help='Sort key for the title')
+    metadata.add_option('--author-sort', action='store', default='', dest='author_sort',
+                      help='Sort key for the author')
+    profiles=['prs500']    
    parser.add_option('-o', '--output', action='store', default=None, \
                      help='Output file name. Default is derived from input filename')
-    parser.add_option('--title-sort', action='store', default='', dest='title_sort',
-                      help='Sort key for the title')
-    parser.add_option('--author-sort', action='store', default='', dest='author_sort',
-                      help='Sort key for the author')
+    parser.add_option('-p', '--profile', default=PRS500_PROFILE, dest='profile', type='choice',
+                      choices=profiles, action='callback', callback=profile_from_string,
+                      help='''Profile of the target device for which this LRF is '''
+                      '''being generated. Default: ''' + profiles[0] + '''
+                      Supported profiles: '''+', '.join(profiles))
    return parser

-def Book(font_delta=0, header=None, **settings):
-    ps = dict(textwidth=575, textheight=747)
+def Book(font_delta=0, header=None, profile=PRS500_PROFILE, **settings):
+    ps = dict(textwidth=profile.page_width, 
+              textheight=profile.page_height)
    if header:
        hdr = Header()
        hb = TextBlock(textStyle=TextStyle(align='foot', fontsize=60))
@ -62,5 +84,4 @@ def Book(font_delta=0, header=None, **settings):
        ps['topmargin'] = 10
    return _Book(textstyledefault=dict(fontsize=100+font_delta*20, 
                                       parindent=80, linespace=12), \
-                 pagestyledefault=ps, \
-                  **settings)
+                 pagestyledefault=ps, **settings)
--- a/src/libprs500/lrf/html/convert_from.py
+++ b/src/libprs500/lrf/html/convert_from.py
@ -39,7 +39,7 @@ from libprs500.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, TextBl
                                      Bold, Space, Plot, Image, BlockSpace,\
                                      RuledLine, BookSetting
 from libprs500.lrf.pylrs.pylrs import Span as _Span
-from libprs500.lrf import ConversionError, option_parser, Book
+from libprs500.lrf import ConversionError, option_parser, Book, PRS500_PROFILE
 from libprs500 import extract, filename_to_utf8
 from libprs500.ptempfile import PersistentTemporaryFile

@ -158,7 +158,7 @@ class Span(_Span):
                ans = font_weight(val)                
                if ans:
                    t['fontweight'] = ans
-                    if int(ans) > 1400:                        
+                    if int(ans) > 140:                        
                        t['wordspace'] = '50'
            elif key.startswith("margin"):
                if key == "margin":
@ -215,6 +215,7 @@ class Span(_Span):
        
 class HTMLConverter(object):
    SELECTOR_PAT   = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
+    PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
    IGNORED_TAGS   = (Comment, Declaration, ProcessingInstruction)
    # Fix <a /> elements 
    MARKUP_MASSAGE   = [(re.compile("(<\s*[aA]\s+.*\/)\s*>"), 
@ -234,12 +235,14 @@ class HTMLConverter(object):
            
    processed_files = {} #: Files that have been processed
    
-    def __init__(self, book, path, dpi=166, width=575, height=747, 
+    def __init__(self, book, path, 
                 font_delta=0, verbose=False, cover=None,
                 max_link_levels=sys.maxint, link_level=0,
                 is_root=True, baen=False, chapter_detection=True,
                 chapter_regex=re.compile('chapter|book|appendix', re.IGNORECASE),
-                 link_exclude=re.compile('$')):
+                 link_exclude=re.compile('$'), 
+                 page_break=re.compile('h[12]', re.IGNORECASE),
+                 profile=PRS500_PROFILE, hide_broken_links=False):
        '''
        Convert HTML file at C{path} and add it to C{book}. After creating
        the object, you must call L{self.process_links} on it to create the links and
@ -270,6 +273,11 @@ class HTMLConverter(object):
        @type chapter_detection: C{bool}
        @param chapter_regex: The compiled regular expression used to search for chapter titles
        @param link_exclude: Compiled regex. Matching hrefs are ignored.
+        @param page_break: Compiled regex. Page breaks are inserted before matching
+                           tags if no page-breaks are found and no chapter headings
+                           are detected.
+        @param profile: Defines the geometry of the display device
+        @param hide_broken_links: Don't display broken links
        '''
        # Defaults for various formatting tags        
        self.css = dict(
@ -286,9 +294,7 @@ class HTMLConverter(object):
            pre    = {'font-family' :'monospace' },
            center = {'text-align'  : 'center'}
            )        
-        self.page_width = width   #: The width of the page
-        self.page_height = height #: The height of the page
-        self.dpi         = dpi    #: The DPI of the intended display device
+        self.profile     = profile #: Defines the geometry of the display device
        self.chapter_detection = chapter_detection #: Flag to toggle chapter detection
        self.chapter_regex = chapter_regex #: Regex used to search for chapter titles
        self.link_exclude = link_exclude #: Ignore matching hrefs
@ -298,6 +304,7 @@ class HTMLConverter(object):
        self.blockquote_style = book.create_block_style(sidemargin=60, 
                                                        topskip=20, footskip=20)
        self.unindented_style = book.create_text_style(parindent=0)
+        self.page_break       = page_break #: Regex controlling forced page-break behavior
        self.text_styles      = []#: Keep track of already used textstyles
        self.block_styles     = []#: Keep track of already used blockstyles
        self.images  = {}         #: Images referenced in the HTML document
@ -311,7 +318,8 @@ class HTMLConverter(object):
        self.in_ol = False #: Flag indicating we're in an <ol> element
        self.book = book #: The Book object representing a BBeB book
        self.is_root = is_root           #: Are we converting the root HTML file
-        self.lstrip_toggle = False #; If true the next add_text call will do an lstrip
+        self.lstrip_toggle = False #: If true the next add_text call will do an lstrip
+        self.hide_broken_links = hide_broken_links
        path = os.path.abspath(path)
        os.chdir(os.path.dirname(path))
        self.file_name = os.path.basename(path)
@ -332,6 +340,10 @@ class HTMLConverter(object):
        self.current_page = None
        self.current_para = None
        self.current_style = {}
+        self.page_break_found = False
+        match = self.PAGE_BREAK_PAT.search(unicode(self.soup))
+        if match and not re.match('avoid', match.group(1), re.IGNORECASE):
+            self.page_break_found = True
        self.parse_file()
        HTMLConverter.processed_files[path] = self
        print 'done'
@ -440,7 +452,8 @@ class HTMLConverter(object):
            
    def get_text(self, tag):
            css = self.tag_css(tag)
-            if css.has_key('display') and css['display'].lower() == 'none':
+            if (css.has_key('display') and css['display'].lower() == 'none') or \
+               (css.has_key('visibility') and css['visibility'].lower() == 'hidden'):
                return ''
            text = ''
            for c in tag.contents:
@ -487,20 +500,24 @@ class HTMLConverter(object):
        
        cwd = os.getcwd()        
        for link in self.links:
+            para, tag = link.para, link.tag
+            text = self.get_text(tag)
+            if self.hide_broken_links:
+                    para.contents = []
+                    para.append(_Span(text=text))
            purl = urlparse(link.tag['href'])
            if purl[1]: # Not a link to a file on the local filesystem
                continue
            path, fragment = unquote(purl[2]), purl[5]            
-            para, tag = link.para, link.tag
            if not path or os.path.basename(path) == self.file_name:
                if fragment in self.targets.keys():
                    tb = get_target_block(fragment, self.targets)
                    if self.is_root:
-                        self.book.addTocEntry(self.get_text(tag), tb)                 
+                        self.book.addTocEntry(text, tb)                 
                    sys.stdout.flush()
                    jb = JumpButton(tb)
                    self.book.append(jb)
-                    cb = CharButton(jb, text=self.get_text(tag))
+                    cb = CharButton(jb, text=text)
                    para.contents = []
                    para.append(cb)
            elif self.link_level < self.max_link_levels:
@ -515,15 +532,16 @@ class HTMLConverter(object):
                if not path in HTMLConverter.processed_files.keys():                    
                    try:                        
                        self.files[path] = HTMLConverter(self.book, path, 
-                                     width=self.page_width, height=self.page_height,
-                                     dpi=self.dpi,
+                                     profile=self.profile,
                                     font_delta=self.font_delta, verbose=self.verbose,
                                     link_level=self.link_level+1,
                                     max_link_levels=self.max_link_levels,
                                     is_root = False, baen=self.baen,
                                     chapter_detection=self.chapter_detection,
                                     chapter_regex=self.chapter_regex,
-                                     link_exclude=self.link_exclude)
+                                     link_exclude=self.link_exclude,
+                                     page_break=self.page_break,
+                                     hide_broken_links=self.hide_broken_links)
                        HTMLConverter.processed_files[path] = self.files[path]
                    except Exception:
                        print >>sys.stderr, 'Unable to process', path
@ -540,10 +558,10 @@ class HTMLConverter(object):
                else:
                    tb = conv.top
                if self.is_root:
-                    self.book.addTocEntry(self.get_text(tag), tb)      
+                    self.book.addTocEntry(text, tb)      
                jb = JumpButton(tb)                
                self.book.append(jb)
-                cb = CharButton(jb, text=self.get_text(tag))
+                cb = CharButton(jb, text=text)
                para.contents = []
                para.append(cb)                
                    
@ -576,8 +594,10 @@ class HTMLConverter(object):
        if os.access(path, os.R_OK):
            self.end_page()            
            page = self.book.create_page(evensidemargin=0, oddsidemargin=0, 
-                                         topmargin=0, textwidth=self.page_width,
-                                         textheight=self.page_height)
+                                         topmargin=0, textwidth=self.profile.screen_width,
+                                         headheight=0, headsep=0, footspace=0,
+                                         footheight=0,
+                                         textheight=self.profile.screen_height)
            if not self.images.has_key(path):
                self.images[path] = ImageStream(path)
            page.append(ImageBlock(self.images[path]))
@ -651,11 +671,8 @@ class HTMLConverter(object):
               'padding' in test or 'border' in test or 'page-break' in test \
               or test.startswith('mso') or test.startswith('background')\
               or test.startswith('line') or test in ['color', 'display', \
-                           'letter-spacing',  
-                           'font-variant']:
+                           'letter-spacing', 'font-variant']:
                css.pop(key)              
-                if self.verbose:
-                    print 'Ignoring CSS key:', key
        return css
    
    def end_current_para(self):
@ -688,7 +705,8 @@ class HTMLConverter(object):
            return
        tag_css = self.tag_css(tag, parent_css=parent_css)
        try: # Skip element if its display attribute is set to none
-            if tag_css['display'].lower() == 'none':
+            if tag_css['display'].lower() == 'none' or \
+               tag_css['visibility'].lower() == 'hidden':
                return
        except KeyError:
            pass
@ -701,7 +719,11 @@ class HTMLConverter(object):
           tag_css['page-break-after'].lower() != 'avoid':
            end_page = True
            tag_css.pop('page-break-after')
-            
+        if not self.page_break_found and self.page_break.match(tagname):
+            if len(self.current_page.contents) > 3:
+                self.end_page()
+                if self.verbose:
+                    print 'Forcing page break at', tagname
        if tagname in ["title", "script", "meta", 'del', 'frameset']:            
            pass
        elif tagname == 'a' and self.max_link_levels >= 0:
@ -744,7 +766,7 @@ class HTMLConverter(object):
                self.targets[tag['name']] = target
            elif tag.has_key('href') and not self.link_exclude.match(tag['href']):
                purl = urlparse(tag['href'])
-                path = purl[2]
+                path = unquote(purl[2])
                if path and os.path.splitext(path)[1][1:].lower() in \
                    ['png', 'jpg', 'bmp', 'jpeg']:
                    self.add_image_page(path)
@ -772,31 +794,32 @@ class HTMLConverter(object):
                    return pt.name
                    
                    
-                if height > self.page_height:
-                    corrf = self.page_height/(1.*height)
-                    width, height = floor(corrf*width), self.page_height-1                        
-                    if width > self.page_width:
-                        corrf = (self.page_width)/(1.*width)
-                        width, height = self.page_width-1, floor(corrf*height)
+                if height > self.profile.page_height:
+                    corrf = self.profile.page_height/(1.*height)
+                    width, height = floor(corrf*width), self.profile.page_height-1                        
+                    if width > self.profile.page_width:
+                        corrf = (self.profile.page_width)/(1.*width)
+                        width, height = self.profile.page_width-1, floor(corrf*height)
                    path = scale_image(width, height)
-                if width > self.page_width:
-                    corrf = self.page_width/(1.*width)
-                    width, height = self.page_width-1, floor(corrf*height)
-                    if height > self.page_height:
-                        corrf = (self.page_height)/(1.*height)
-                        width, height = floor(corrf*width), self.page_height-1                        
+                if width > self.profile.page_width:
+                    corrf = self.profile.page_width/(1.*width)
+                    width, height = self.profile.page_width-1, floor(corrf*height)
+                    if height > self.profile.page_height:
+                        corrf = (self.profile.page_height)/(1.*height)
+                        width, height = floor(corrf*width), self.profile.page_height-1                        
                    path = scale_image(width, height)
                width, height = int(width), int(height)
                
                if not self.images.has_key(path):
                    self.images[path] = ImageStream(path)
-                factor = 720./self.dpi
-                if max(width, height) <= min(self.page_width, self.page_height)/5.:
+                factor = 720./self.profile.dpi
+                if max(width, height) <= min(self.profile.page_width, 
+                                             self.profile.page_height)/5.:
                    im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\
                               xsize=width, ysize=height)                    
                    self.current_para.append(Plot(im, xsize=ceil(width*factor), 
                                                  ysize=ceil(height*factor)))
-                elif height <= self.page_height/1.5:
+                else:
                    pb = self.current_block
                    self.end_current_para()                    
                    self.process_alignment(tag_css)
@ -810,15 +833,6 @@ class HTMLConverter(object):
                                                    textStyle=pb.textStyle,
                                                    blockStyle=pb.blockStyle)
                    self.current_para = Paragraph()                
-                else:
-                    self.current_block.append(self.current_para)
-                    self.current_page.append(self.current_block)
-                    self.current_para = Paragraph()
-                    self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle,
-                                                         blockStyle=self.current_block.blockStyle)
-                    im = ImageBlock(self.images[path], x1=width, y1=height, 
-                                    xsize=width, ysize=height)
-                    self.current_page.append(im)                        
            else:
                print >>sys.stderr, "Failed to process:", tag
        elif tagname in ['style', 'link']:
@ -835,14 +849,16 @@ class HTMLConverter(object):
                        ncss.update(self.parse_css(str(c)))
            elif tag.has_key('type') and tag['type'] == "text/css" \
                    and tag.has_key('href'):
-                url = tag['href']
+                purl = urlparse(tag['href'])
+                path = unquote(purl[2])                
                try:
-                    if url.startswith('http://'):
-                        f = urlopen(url)
-                    else:
-                        f = open(unquote(url))
-                    ncss = self.parse_css(f.read())
+                    f = open(path, 'rb')
+                    src = f.read()
                    f.close()
+                    match = self.PAGE_BREAK_PAT.search(src) 
+                    if match and not re.match('avoid', match.group(1), re.IGNORECASE):
+                        self.page_break_found = True
+                    ncss = self.parse_css(f.read())
                except IOError:
                    pass
            if ncss:
@ -917,6 +933,7 @@ class HTMLConverter(object):
                    if self.verbose:
                        print 'Detected chapter', src
                    self.end_page()
+                    self.page_break_found = True
            self.end_current_para()
            self.lstrip_toggle = True
            if tag_css.has_key('text-indent'):
@ -953,7 +970,7 @@ class HTMLConverter(object):
            self.end_current_para()            
            self.current_block.append(CR())
            self.end_current_block()
-            self.current_page.RuledLine(linelength=self.page_width)
+            self.current_page.RuledLine(linelength=self.profile.page_width)
        else:            
            self.process_children(tag, tag_css)
        
@ -967,18 +984,21 @@ class HTMLConverter(object):
        for _file in self.scaled_images.values():   
            _file.__del__()
        
-
 def process_file(path, options):
    cwd = os.getcwd()
    dirpath = None
    try:
        dirpath, path = get_path(path)
-        cpath, tpath = options.cover, ''
-        if options.cover and os.access(options.cover, os.R_OK):            
-            try:
+        cpath, tpath = '', '' 
+        if options.cover:
+            options.cover = os.path.abspath(os.path.expanduser(options.cover))
+            cpath = options.cover
+            if os.access(options.cover, os.R_OK):        
                from libprs500.prs500 import PRS500                
                im = PILImage.open(os.path.join(cwd, cpath))
-                cim = im.resize((600, 800), PILImage.BICUBIC)
+                cim = im.resize((options.profile.screen_width, 
+                                 options.profile.screen_height), 
+                                PILImage.BICUBIC)
                cf = PersistentTemporaryFile(prefix="html2lrf_", suffix=".jpg")
                cf.close()                
                cim.save(cf.name)
@ -989,17 +1009,16 @@ def process_file(path, options):
                tf.close()
                tim.save(tf.name)
                tpath = tf.name
-            except ImportError:
-                print >>sys.stderr, "WARNING: You don't have PIL installed. ",
-                'Cover and thumbnails wont work'
-                pass
+            else:
+                raise ConversionError, 'Cannot read from: %s', (options.cover,)
        title = (options.title, options.title_sort)
        author = (options.author, options.author_sort)
        args = dict(font_delta=options.font_delta, title=title, \
                    author=author, sourceencoding='utf8',\
                    freetext=options.freetext, category=options.category,
-                    booksetting=BookSetting(dpi=10*options.dpi,screenheight=800,
-                                            screenwidth=600))
+                    booksetting=BookSetting(dpi=10*options.profile.dpi,
+                                            screenheight=options.profile.screen_height,
+                                            screenwidth=options.profile.screen_width))
        if tpath:
            args['thumbnail'] = tpath
        header = None
@ -1011,13 +1030,16 @@ def process_file(path, options):
        book = Book(header=header, **args)
        le = re.compile(options.link_exclude) if options.link_exclude else \
             re.compile('$')
-        conv = HTMLConverter(book, path, dpi=options.dpi,
+        pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \
+             re.compile('$')
+        conv = HTMLConverter(book, path, profile=options.profile,
                             font_delta=options.font_delta, 
                             cover=cpath, max_link_levels=options.link_levels,
-                             baen=options.baen, 
+                             verbose=options.verbose, baen=options.baen, 
                             chapter_detection=options.chapter_detection,
                             chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE),
-                             link_exclude=re.compile(le))
+                             link_exclude=re.compile(le), page_break=pb,
+                             hide_broken_links=not options.show_broken_links)
        conv.process_links()
        oname = options.output
        if not oname:
@ -1033,47 +1055,73 @@ def process_file(path, options):
        if dirpath:
            shutil.rmtree(dirpath, True)
        
-def main():
+def parse_options(argv=None, cli=True):
    """ CLI for html -> lrf conversions """
+    if not argv:
+        argv = sys.argv[1:]
    parser = option_parser("""usage: %prog [options] mybook.[html|rar|zip]

         %prog converts mybook.html to mybook.lrf""")
    parser.add_option('--cover', action='store', dest='cover', default=None, \
                      help='Path to file containing image to be used as cover')
-    parser.add_option('--lrs', action='store_true', dest='lrs', \
-                      help='Convert to LRS', default=False)
    parser.add_option('--font-delta', action='store', type='int', default=0, \
                      help="""Increase the font size by 2 * FONT_DELTA pts. 
                      If FONT_DELTA is negative, the font size is decreased.""",
                      dest='font_delta')
-    parser.add_option('--link-levels', action='store', type='int', default=sys.maxint, \
+    link = parser.add_option_group('LINK PROCESSING OPTIONS')
+    link.add_option('--link-levels', action='store', type='int', default=sys.maxint, \
                      dest='link_levels',
                      help=r'''The maximum number of levels to recursively process '''
                              '''links. A value of 0 means thats links are not followed. '''
                              '''A negative value means that <a> tags are ignored.''')
-    parser.add_option('--baen', action='store_true', default=False, dest='baen',
-                      help='''Preprocess Baen HTML files to improve generated LRF.''')
-    parser.add_option('--dpi', action='store', type='int', default=166, dest='dpi',
-                      help='''The DPI of the target device. Default is 166 for the
-                              Sony PRS 500''')
-    parser.add_option('--disable-chapter-detection', action='store_false', 
+    link.add_option('--link-exclude', dest='link_exclude', default='$',
+                      help='''A regular expression. <a> tags whoose href '''
+                      '''matches will be ignored. Defaults to %default''')
+    chapter = parser.add_option_group('CHAPTER OPTIONS')
+    chapter.add_option('--disable-chapter-detection', action='store_false', 
                      default=True, dest='chapter_detection', 
                      help='''Prevent html2lrf from automatically inserting page breaks'''
                      '''before what it thinks are chapters.''')
-    parser.add_option('--chapter-regex', dest='chapter_regex', 
+    chapter.add_option('--chapter-regex', dest='chapter_regex', 
                      default='chapter|book|appendix',
                      help='''The regular expression used to detect chapter titles.'''
-                      '''It is searched for in heading tags. Default is chapter|book|appendix''') 
-    parser.add_option('--link-exclude', dest='link_exclude', default='',
-                      help='''A regular expression. <a> tags whoose href '''
-                      '''matches will be ignored''')
-    options, args = parser.parse_args()
+                      '''It is searched for in heading tags. Defaults to %default''')     
+    chapter.add_option('--page-break-before', dest='page_break', default='h[12]',
+                      help='''If html2lrf does not find any page breaks in the '''
+                      '''html file and cannot detect chapter headings, it will '''
+                      '''automatically insert page-breaks before the tags whose '''
+                      '''names match this regular expression. Defaults to %default. '''
+                      '''You can disable it by setting the regexp to "$". '''
+                      '''The purpose of this option is to try to ensure that '''
+                      '''there are no really long pages as this degrades the page '''
+                      '''turn performance of the LRF. Thus this option is ignored '''
+                      '''if the current page has only a few elements.''')
+    prepro = parser.add_option_group('PREPROCESSING OPTIONS')
+    prepro.add_option('--baen', action='store_true', default=False, dest='baen',
+                      help='''Preprocess Baen HTML files to improve generated LRF.''')
+    debug = parser.add_option_group('DEBUG OPTIONS')
+    debug.add_option('--verbose', dest='verbose', action='store_true', default=False,
+                      help='''Be verbose while processing''')
+    debug.add_option('--lrs', action='store_true', dest='lrs', \
+                      help='Convert to LRS', default=False)
+    debug.add_option('--show-broken-links', dest='show_broken_links', action='store_true',
+                    default=False, help='''Show the href of broken links in generated LRF''')   
+    options, args = parser.parse_args(args=argv)
    if len(args) != 1:
+        if cli:
            parser.print_help()
-        sys.exit(1)
-    src = args[0]
+        raise ConversionError, 'no filename specified'
    if options.title == None:
-        options.title = filename_to_utf8(os.path.splitext(os.path.basename(src))[0])
+        options.title = filename_to_utf8(os.path.splitext(os.path.basename(args[0]))[0])
+    return options, args
+
+
+def main():    
+    try:
+        options, args = parse_options()
+        src = args[0]
+    except:        
+        sys.exit(1)    
    process_file(src, options)

 def console_query(dirpath, candidate, docs):
--- a/src/libprs500/lrf/html/demo/demo.html
+++ b/src/libprs500/lrf/html/demo/demo.html
@ -70,7 +70,7 @@

 <h2><a name='images'>Inline images</a></h2>
 <p>
- Here I demonstrate the use of inline images in the midst of text. Here is a  small image <img src='small.jpg' /> embedded in a sentence. Now we have a  slightly larger image that is automatically put in its own block  <img style="text-align:center" src='medium.jpg' /> and finally we have a large image which is  automatically placed on a page by itself and prevented from being  autoscaled when the user changes from S to M to L. Try changing sizes  and see how the different embedding styles behave.  <img src='large.jpg' />
+ Here I demonstrate the use of inline images in the midst of text. Here is a  small image <img src='small.jpg' /> embedded in a sentence. Now we have a  slightly larger image that is automatically put in its own block  <img style="text-align:center" src='medium.jpg' /> and finally we have a large image which wont fit on this page. Try changing sizes from S to M to L and see how the images behave.  <img src='large.jpg' />
 </p>
 <p class='toc'>
 <hr />
--- a/src/libprs500/lrf/txt/convert_from.py
+++ b/src/libprs500/lrf/txt/convert_from.py
@ -69,7 +69,9 @@ def convert_txt(path, options):
    book = Book(header=header, title=title, author=author, \
                sourceencoding=options.encoding, freetext=options.freetext, \
                category=options.category, booksetting=BookSetting
-                (dpi=10*options.dpi,screenheight=800, screenwidth=600))
+                (dpi=10*options.profile.dpi,
+                 screenheight=options.profile.screen_height, 
+                 screenwidth=options.profile.screen_height))
    buffer = ''
    pg = book.create_page()
    block = book.create_text_block()