mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
IGN:Full implementation of HTML traversal
This commit is contained in:
parent
2efa1ec708
commit
39afcb27f7
@ -8,6 +8,7 @@ Conversion to EPUB.
|
||||
'''
|
||||
import sys
|
||||
from calibre.utils.config import Config, StringConfig
|
||||
from calibre.ebooks.html import config as common_config
|
||||
|
||||
def config(defaults=None):
|
||||
desc = _('Options to control the conversion to EPUB')
|
||||
@ -16,22 +17,11 @@ def config(defaults=None):
|
||||
else:
|
||||
c = StringConfig(defaults, desc)
|
||||
|
||||
c.update(common_config())
|
||||
c.remove_opt('output')
|
||||
|
||||
c.add_opt('output', ['-o', '--output'], default=None,
|
||||
help=_('The output EPUB file. If not specified, it is derived from the input file name.'))
|
||||
c.add_opt('encoding', ['--encoding'], default=None,
|
||||
help=_('Character encoding for HTML files. Default is to auto detect.'))
|
||||
|
||||
metadata = c.add_group('metadata', _('Set metadata of the generated ebook'))
|
||||
metadata('title', ['-t', '--title'], default=None,
|
||||
help=_('Set the title. Default is to autodetect.'))
|
||||
metadata('authors', ['-a', '--authors'], default=_('Unknown'),
|
||||
help=_('The author(s) of the ebook, as a comma separated list.'))
|
||||
|
||||
traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
|
||||
traversal('breadth_first', ['--breadth-first'], default=False,
|
||||
help=_('Traverse links in HTML files breadth first. Normally, they are traversed depth first'))
|
||||
traversal('max_levels', ['--max-levels'], default=sys.getrecursionlimit(), group='traversal',
|
||||
help=_('Maximum levels of recursion when following links in HTML files. Must be non-negative. 0 implies that no links in the root HTML file are followed.'))
|
||||
|
||||
structure = c.add_group('structure detection', _('Control auto-detection of document structure.'))
|
||||
structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]",
|
||||
@ -46,8 +36,5 @@ help on using this feature.
|
||||
help=_('Don\'t add detected chapters to the Table of Contents'))
|
||||
structure('no_links_in_toc', ['--no-links-in-toc'], default=False,
|
||||
help=_('Don\'t add links in the root HTML file to the Table of Contents'))
|
||||
debug = c.add_group('debug', _('Options useful for debugging'))
|
||||
debug('verbose', ['-v', '--verbose'], default=0, action='count',
|
||||
help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))
|
||||
|
||||
return c
|
@ -2,44 +2,22 @@ from __future__ import with_statement
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
import os, sys, logging, re, shutil, tempfile
|
||||
from lxml import html
|
||||
import os, sys, re, shutil
|
||||
from lxml.etree import XPath
|
||||
get_text = XPath("//text()")
|
||||
|
||||
from calibre import LoggingInterface
|
||||
from calibre.ebooks.html import PreProcessor
|
||||
from calibre.ebooks.html import Parser, get_text, merge_metadata, get_filelist
|
||||
from calibre.ebooks.epub import config as common_config
|
||||
from calibre.ebooks.epub.traverse import traverse, opf_traverse
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
from calibre.ebooks.metadata.opf import OPFReader
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
|
||||
|
||||
class HTMLProcessor(PreProcessor, LoggingInterface):
|
||||
|
||||
ENCODING_PATS = [re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
|
||||
re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>', re.IGNORECASE)]
|
||||
class HTMLProcessor(Parser):
|
||||
|
||||
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles):
|
||||
LoggingInterface.__init__(self, logging.getLogger('html2epub'))
|
||||
self.setup_cli_handler(opts.verbose)
|
||||
|
||||
self.htmlfile = htmlfile
|
||||
self.opts = opts
|
||||
self.tdir = tdir
|
||||
self.resource_map = resource_map
|
||||
self.resource_dir = os.path.join(tdir, 'resources')
|
||||
self.htmlfiles = htmlfiles
|
||||
|
||||
self.parse_html()
|
||||
|
||||
self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
|
||||
|
||||
Parser.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles,
|
||||
name='html2epub')
|
||||
if opts.verbose > 2:
|
||||
self.debug_tree('parsed')
|
||||
|
||||
self.detected_chapters = self.opts.chapter(self.root)
|
||||
self.extract_css()
|
||||
|
||||
if opts.verbose > 2:
|
||||
@ -49,130 +27,6 @@ class HTMLProcessor(PreProcessor, LoggingInterface):
|
||||
|
||||
self.split()
|
||||
|
||||
def debug_tree(self, name):
|
||||
'''
|
||||
Dump source tree for later debugging.
|
||||
'''
|
||||
tdir = tempfile.gettempdir()
|
||||
if not os.path.exists(tdir):
|
||||
os.makedirs(tdir)
|
||||
with open(os.path.join(tdir, 'html2epub-%s-%s.html'%\
|
||||
(os.path.basename(self.htmlfile.path), name)), 'wb') as f:
|
||||
f.write(html.tostring(self.root, encoding='utf-8'))
|
||||
self.log_debug(_('Written processed HTML to ')+f.name)
|
||||
|
||||
def parse_html(self):
|
||||
''' Create lxml ElementTree from HTML '''
|
||||
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
|
||||
src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace')
|
||||
src = self.preprocess(src)
|
||||
# lxml chokes on unicode input when it contains encoding declarations
|
||||
for pat in self.ENCODING_PATS:
|
||||
src = pat.sub('', src)
|
||||
try:
|
||||
self.root = html.document_fromstring(src)
|
||||
except:
|
||||
if self.opts.verbose:
|
||||
self.log_exception('lxml based parsing failed')
|
||||
self.root = html.soupparser.fromstring()
|
||||
self.head = self.body = None
|
||||
head = self.root.xpath('//head')
|
||||
if head:
|
||||
self.head = head[0]
|
||||
body = self.root.xpath('//body')
|
||||
if body:
|
||||
self.body = body[0]
|
||||
self.detected_chapters = self.opts.chapter(self.root)
|
||||
|
||||
def rewrite_links(self, olink):
|
||||
'''
|
||||
Make all links in document relative so that they work in the EPUB container.
|
||||
Also copies any resources (like images, stylesheets, scripts, etc.) into
|
||||
the local tree.
|
||||
'''
|
||||
if not isinstance(olink, unicode):
|
||||
olink = olink.decode(self.htmlfile.encoding)
|
||||
link = self.htmlfile.resolve(olink)
|
||||
if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
|
||||
return olink
|
||||
if link.path in self.htmlfiles:
|
||||
return os.path.basename(link.path)
|
||||
if link.path in self.resource_map.keys():
|
||||
return self.resource_map[link.path]
|
||||
name = os.path.basename(link.path)
|
||||
name, ext = os.path.splitext(name)
|
||||
name += ('_%d'%len(self.resource_map)) + ext
|
||||
shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
|
||||
name = 'resources/' + name
|
||||
self.resource_map[link.path] = name
|
||||
return name
|
||||
|
||||
|
||||
def extract_css(self):
|
||||
'''
|
||||
Remove all CSS information from the document and store in self.raw_css.
|
||||
This includes <font> tags.
|
||||
'''
|
||||
css = []
|
||||
for link in self.root.xpath('//link'):
|
||||
if 'css' in link.get('type', 'text/css').lower():
|
||||
file = self.htmlfile.resolve(link.get('href', ''))
|
||||
if os.path.exists(file) and os.path.isfile(file):
|
||||
css.append(open(file, 'rb').read().decode('utf-8'))
|
||||
link.getparent().remove(link)
|
||||
|
||||
for style in self.root.xpath('//style'):
|
||||
if 'css' in style.get('type', 'text/css').lower():
|
||||
css.append('\n'.join(get_text(style)))
|
||||
style.getparent().remove(style)
|
||||
|
||||
font_id = 1
|
||||
for font in self.root.xpath('//font'):
|
||||
try:
|
||||
size = int(font.attrib.pop('size', '3'))
|
||||
except:
|
||||
size = 3
|
||||
setting = 'font-size: %d%%;'%int((float(size)/3) * 100)
|
||||
face = font.attrib.pop('face', None)
|
||||
if face is not None:
|
||||
setting += 'font-face:%s;'%face
|
||||
color = font.attrib.pop('color', None)
|
||||
if color is not None:
|
||||
setting += 'color:%s'%color
|
||||
id = 'calibre_font_id_%d'%font_id
|
||||
font.set('id', 'calibre_font_id_%d'%font_id)
|
||||
font_id += 1
|
||||
css.append('#%s { %s }'%(id, setting))
|
||||
|
||||
|
||||
css_counter = 1
|
||||
for elem in self.root.xpath('//*[@style]'):
|
||||
if 'id' not in elem.keys():
|
||||
elem.set('id', 'calibre_css_id_%d'%css_counter)
|
||||
css_counter += 1
|
||||
css.append('#%s {%s}'%(elem.get('id'), elem.get('style')))
|
||||
elem.attrib.pop('style')
|
||||
chapter_counter = 1
|
||||
for chapter in self.detected_chapters:
|
||||
if chapter.tag.lower() == 'a':
|
||||
if 'name' in chapter.keys():
|
||||
chapter.attrib['id'] = id = chapter.get('name')
|
||||
elif 'id' in chapter.keys():
|
||||
id = chapter.get('id')
|
||||
else:
|
||||
id = 'calibre_detected_chapter_%d'%chapter_counter
|
||||
chapter_counter += 1
|
||||
chapter.set('id', id)
|
||||
else:
|
||||
if 'id' not in chapter.keys():
|
||||
id = 'calibre_detected_chapter_%d'%chapter_counter
|
||||
chapter_counter += 1
|
||||
chapter.set('id', id)
|
||||
css.append('#%s {%s}'%(id, 'page-break-before:always'))
|
||||
|
||||
self.raw_css = '\n\n'.join(css)
|
||||
# TODO: Figure out what to do about CSS imports from linked stylesheets
|
||||
|
||||
def collect_font_statistics(self):
|
||||
'''
|
||||
Collect font statistics to figure out the base font size used in this
|
||||
@ -191,8 +45,8 @@ class HTMLProcessor(PreProcessor, LoggingInterface):
|
||||
pass
|
||||
|
||||
|
||||
def config():
|
||||
c = common_config()
|
||||
def config(defaults=None):
|
||||
c = common_config(defaults=defaults)
|
||||
return c
|
||||
|
||||
def option_parser():
|
||||
@ -203,11 +57,6 @@ def option_parser():
|
||||
Convert a HTML file to an EPUB ebook. Follows links in the HTML file.
|
||||
'''))
|
||||
|
||||
def search_for_opf(dir):
|
||||
for f in os.listdir(dir):
|
||||
if f.lower().endswith('.opf'):
|
||||
return OPFReader(open(os.path.join(dir, f), 'rb'), dir)
|
||||
|
||||
def parse_content(filelist, opts):
|
||||
tdir = PersistentTemporaryDirectory('_html2epub')
|
||||
os.makedirs(os.path.join(tdir, 'content', 'resources'))
|
||||
@ -221,39 +70,17 @@ def convert(htmlfile, opts, notification=None):
|
||||
if opts.output is None:
|
||||
opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub'
|
||||
opts.output = os.path.abspath(opts.output)
|
||||
opf = search_for_opf(os.path.dirname(htmlfile))
|
||||
if opf:
|
||||
mi = MetaInformation(opf)
|
||||
else:
|
||||
mi = get_metadata(open(htmlfile, 'rb'), 'html')
|
||||
if opts.title:
|
||||
mi.title = opts.title
|
||||
if opts.authors != _('Unknown'):
|
||||
opts.authors = opts.authors.split(',')
|
||||
opts.authors = [a.strip() for a in opts.authors]
|
||||
mi.authors = opts.authors
|
||||
|
||||
if not mi.title:
|
||||
mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
|
||||
if not mi.authors:
|
||||
mi.authors = [_('Unknown')]
|
||||
|
||||
opf, filelist = get_filelist(htmlfile, opts)
|
||||
mi = merge_metadata(htmlfile, opf, opts)
|
||||
opts.chapter = XPath(opts.chapter,
|
||||
namespaces={'re':'http://exslt.org/regular-expressions'})
|
||||
|
||||
filelist = None
|
||||
print 'Building file list...'
|
||||
if opf is not None:
|
||||
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
|
||||
if not filelist:
|
||||
filelist = traverse(htmlfile, verbose=opts.verbose, encoding=opts.encoding)\
|
||||
[0 if opts.breadth_first else 1]
|
||||
if opts.verbose:
|
||||
print '\tFound files...'
|
||||
for f in filelist:
|
||||
print '\t\t', f
|
||||
|
||||
parse_content(filelist, opts)
|
||||
resource_map = parse_content(filelist, opts)
|
||||
resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]
|
||||
if opf.cover and os.access(opf.cover, os.R_OK):
|
||||
shutil.copyfile(opf.cover, os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)))
|
||||
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))
|
||||
shutil.copyfile(opf.cover, cpath)
|
||||
resources.append(cpath)
|
||||
|
||||
def main(args=sys.argv):
|
||||
parser = option_parser()
|
||||
@ -267,4 +94,3 @@ def main(args=sys.argv):
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
|
||||
|
@ -1,8 +1,220 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
import sys, re, os, shutil, logging, tempfile
|
||||
from urlparse import urlparse
|
||||
from urllib import unquote
|
||||
|
||||
from lxml import html
|
||||
from lxml.etree import XPath
|
||||
get_text = XPath("//text()")
|
||||
|
||||
from calibre import LoggingInterface, unicode_path
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.utils.config import Config, StringConfig
|
||||
from calibre.ebooks.metadata.opf import OPFReader, OPFCreator
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
|
||||
|
||||
class Link(object):
|
||||
'''
|
||||
Represents a link in a HTML file.
|
||||
'''
|
||||
|
||||
@classmethod
|
||||
def url_to_local_path(cls, url, base):
|
||||
path = url.path
|
||||
if os.path.isabs(path):
|
||||
return path
|
||||
return os.path.abspath(os.path.join(base, path))
|
||||
|
||||
def __init__(self, url, base):
|
||||
'''
|
||||
:param url: The url this link points to. Must be an unquoted unicode string.
|
||||
:param base: The base directory that relative URLs are with respect to.
|
||||
Must be a unicode string.
|
||||
'''
|
||||
assert isinstance(url, unicode) and isinstance(base, unicode)
|
||||
self.url = url
|
||||
self.parsed_url = urlparse(unquote(self.url))
|
||||
self.is_local = self.parsed_url.scheme in ('', 'file')
|
||||
self.is_internal = self.is_local and not bool(self.parsed_url.path)
|
||||
self.path = None
|
||||
self.fragment = self.parsed_url.fragment
|
||||
if self.is_local and not self.is_internal:
|
||||
self.path = self.url_to_local_path(self.parsed_url, base)
|
||||
|
||||
def __hash__(self):
|
||||
if self.path is None:
|
||||
return hash(self.url)
|
||||
return hash(self.path)
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.path == getattr(other, 'path', other)
|
||||
|
||||
def __str__(self):
|
||||
return u'Link: %s --> %s'%(self.url, self.path)
|
||||
|
||||
|
||||
class IgnoreFile(Exception):
|
||||
|
||||
def __init__(self, msg, errno):
|
||||
Exception.__init__(self, msg)
|
||||
self.doesnt_exist = errno == 2
|
||||
self.errno = errno
|
||||
|
||||
class HTMLFile(object):
|
||||
'''
|
||||
Contains basic information about an HTML file. This
|
||||
includes a list of links to other files as well as
|
||||
the encoding of each file. Also tries to detect if the file is not a HTML
|
||||
file in which case :member:`is_binary` is set to True.
|
||||
|
||||
The encoding of the file is available as :member:`encoding`.
|
||||
'''
|
||||
|
||||
HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
|
||||
LINK_PAT = re.compile(
|
||||
r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s]+))',
|
||||
re.DOTALL|re.IGNORECASE)
|
||||
|
||||
def __init__(self, path_to_html_file, level, encoding, verbose):
|
||||
'''
|
||||
:param level: The level of this file. Should be 0 for the root file.
|
||||
:param encoding: Use `encoding` to decode HTML.
|
||||
'''
|
||||
self.path = unicode_path(path_to_html_file, abs=True)
|
||||
self.base = os.path.dirname(self.path)
|
||||
self.level = level
|
||||
self.links = []
|
||||
|
||||
try:
|
||||
with open(self.path, 'rb') as f:
|
||||
src = f.read()
|
||||
except IOError, err:
|
||||
msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
|
||||
if level == 0:
|
||||
raise IOError(msg)
|
||||
raise IgnoreFile(msg, err.errno)
|
||||
|
||||
self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
|
||||
|
||||
if not self.is_binary:
|
||||
if encoding is None:
|
||||
encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
|
||||
self.encoding = encoding
|
||||
|
||||
src = src.decode(encoding, 'replace')
|
||||
self.find_links(src)
|
||||
|
||||
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.path == getattr(other, 'path', other)
|
||||
|
||||
def __str__(self):
|
||||
return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)
|
||||
|
||||
def __repr__(self):
|
||||
return str(self)
|
||||
|
||||
|
||||
def find_links(self, src):
|
||||
for match in self.LINK_PAT.finditer(src):
|
||||
url = None
|
||||
for i in ('url1', 'url2', 'url3'):
|
||||
url = match.group(i)
|
||||
if url:
|
||||
break
|
||||
link = self.resolve(url)
|
||||
if link not in self.links:
|
||||
self.links.append(link)
|
||||
|
||||
def resolve(self, url):
|
||||
return Link(url, self.base)
|
||||
|
||||
|
||||
def depth_first(root, flat, visited=set([])):
|
||||
yield root
|
||||
visited.add(root)
|
||||
for link in root.links:
|
||||
if link.path is not None and link not in visited:
|
||||
try:
|
||||
index = flat.index(link)
|
||||
except ValueError: # Can happen if max_levels is used
|
||||
continue
|
||||
hf = flat[index]
|
||||
if hf not in visited:
|
||||
yield hf
|
||||
visited.add(hf)
|
||||
for hf in depth_first(hf, flat, visited):
|
||||
if hf not in visited:
|
||||
yield hf
|
||||
visited.add(hf)
|
||||
|
||||
|
||||
def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
|
||||
'''
|
||||
Recursively traverse all links in the HTML file.
|
||||
|
||||
:param max_levels: Maximum levels of recursion. Must be non-negative. 0
|
||||
implies that no links in the root HTML file are followed.
|
||||
:param encoding: Specify character encoding of HTML files. If `None` it is
|
||||
auto-detected.
|
||||
:return: A pair of lists (breadth_first, depth_first). Each list contains
|
||||
:class:`HTMLFile` objects.
|
||||
'''
|
||||
assert max_levels >= 0
|
||||
level = 0
|
||||
flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
|
||||
next_level = list(flat)
|
||||
while level < max_levels and len(next_level) > 0:
|
||||
level += 1
|
||||
nl = []
|
||||
for hf in next_level:
|
||||
rejects = []
|
||||
for link in hf.links:
|
||||
if link.path is None or link.path in flat:
|
||||
continue
|
||||
try:
|
||||
nf = HTMLFile(link.path, level, encoding, verbose)
|
||||
nl.append(nf)
|
||||
flat.append(nf)
|
||||
except IgnoreFile, err:
|
||||
rejects.append(link)
|
||||
if not err.doesnt_exist or verbose > 1:
|
||||
print str(err)
|
||||
for link in rejects:
|
||||
hf.links.remove(link)
|
||||
|
||||
next_level = list(nl)
|
||||
|
||||
return flat, list(depth_first(flat[0], flat))
|
||||
|
||||
|
||||
def opf_traverse(opf_reader, verbose=0, encoding=None):
|
||||
'''
|
||||
Return a list of :class:`HTMLFile` objects in the order specified by the
|
||||
`<spine>` element of the OPF.
|
||||
|
||||
:param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance.
|
||||
:param encoding: Specify character encoding of HTML files. If `None` it is
|
||||
auto-detected.
|
||||
'''
|
||||
if not opf_reader.spine:
|
||||
raise ValueError('OPF does not have a spine')
|
||||
flat = []
|
||||
for path in opf_reader.spine.items():
|
||||
if path not in flat:
|
||||
flat.append(os.path.abspath(path))
|
||||
flat = [HTMLFile(path, 0, encoding, verbose) for path in flat]
|
||||
return flat
|
||||
|
||||
|
||||
|
||||
class PreProcessor(object):
|
||||
@ -72,3 +284,287 @@ class PreProcessor(object):
|
||||
|
||||
return html
|
||||
|
||||
class Parser(PreProcessor):
|
||||
|
||||
ENCODING_PATS = [re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
|
||||
re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>', re.IGNORECASE)]
|
||||
|
||||
|
||||
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'):
|
||||
LoggingInterface.__init__(self, logging.getLogger(name))
|
||||
self.setup_cli_handler(opts.verbose)
|
||||
self.htmlfile = htmlfile
|
||||
self.opts = opts
|
||||
self.tdir = tdir
|
||||
self.resource_map = resource_map
|
||||
self.htmlfiles = htmlfiles
|
||||
self.resource_dir = os.path.join(tdir, 'resources')
|
||||
|
||||
self.parse_html()
|
||||
self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
|
||||
|
||||
def parse_html(self):
|
||||
''' Create lxml ElementTree from HTML '''
|
||||
self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
|
||||
src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace')
|
||||
src = self.preprocess(src)
|
||||
# lxml chokes on unicode input when it contains encoding declarations
|
||||
for pat in self.ENCODING_PATS:
|
||||
src = pat.sub('', src)
|
||||
try:
|
||||
self.root = html.document_fromstring(src)
|
||||
except:
|
||||
if self.opts.verbose:
|
||||
self.log_exception('lxml based parsing failed')
|
||||
self.root = html.soupparser.fromstring()
|
||||
self.head = self.body = None
|
||||
head = self.root.xpath('//head')
|
||||
if head:
|
||||
self.head = head[0]
|
||||
body = self.root.xpath('//body')
|
||||
if body:
|
||||
self.body = body[0]
|
||||
|
||||
def debug_tree(self, name):
|
||||
'''
|
||||
Dump source tree for later debugging.
|
||||
'''
|
||||
tdir = tempfile.gettempdir()
|
||||
if not os.path.exists(tdir):
|
||||
os.makedirs(tdir)
|
||||
with open(os.path.join(tdir, '%s-%s-%s.html'%\
|
||||
(self.name, os.path.basename(self.htmlfile.path), name)), 'wb') as f:
|
||||
f.write(html.tostring(self.root, encoding='utf-8'))
|
||||
self.log_debug(_('Written processed HTML to ')+f.name)
|
||||
|
||||
|
||||
def rewrite_links(self, olink):
|
||||
'''
|
||||
Make all links in document relative so that they work in the EPUB container.
|
||||
Also copies any resources (like images, stylesheets, scripts, etc.) into
|
||||
the local tree.
|
||||
'''
|
||||
if not isinstance(olink, unicode):
|
||||
olink = olink.decode(self.htmlfile.encoding)
|
||||
link = self.htmlfile.resolve(olink)
|
||||
if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
|
||||
return olink
|
||||
if link.path in self.htmlfiles:
|
||||
return os.path.basename(link.path)
|
||||
if link.path in self.resource_map.keys():
|
||||
return self.resource_map[link.path]
|
||||
name = os.path.basename(link.path)
|
||||
name, ext = os.path.splitext(name)
|
||||
name += ('_%d'%len(self.resource_map)) + ext
|
||||
shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
|
||||
name = 'resources/' + name
|
||||
self.resource_map[link.path] = name
|
||||
return name
|
||||
|
||||
def extract_css(self):
|
||||
'''
|
||||
Remove all CSS information from the document and store in self.raw_css.
|
||||
This includes <font> tags.
|
||||
'''
|
||||
css = []
|
||||
for link in self.root.xpath('//link'):
|
||||
if 'css' in link.get('type', 'text/css').lower():
|
||||
file = self.htmlfile.resolve(link.get('href', ''))
|
||||
if os.path.exists(file) and os.path.isfile(file):
|
||||
css.append(open(file, 'rb').read().decode('utf-8'))
|
||||
link.getparent().remove(link)
|
||||
|
||||
for style in self.root.xpath('//style'):
|
||||
if 'css' in style.get('type', 'text/css').lower():
|
||||
css.append('\n'.join(get_text(style)))
|
||||
style.getparent().remove(style)
|
||||
|
||||
font_id = 1
|
||||
for font in self.root.xpath('//font'):
|
||||
try:
|
||||
size = int(font.attrib.pop('size', '3'))
|
||||
except:
|
||||
size = 3
|
||||
setting = 'font-size: %d%%;'%int((float(size)/3) * 100)
|
||||
face = font.attrib.pop('face', None)
|
||||
if face is not None:
|
||||
setting += 'font-face:%s;'%face
|
||||
color = font.attrib.pop('color', None)
|
||||
if color is not None:
|
||||
setting += 'color:%s'%color
|
||||
id = 'calibre_font_id_%d'%font_id
|
||||
font.set('id', 'calibre_font_id_%d'%font_id)
|
||||
font_id += 1
|
||||
css.append('#%s { %s }'%(id, setting))
|
||||
|
||||
|
||||
css_counter = 1
|
||||
for elem in self.root.xpath('//*[@style]'):
|
||||
if 'id' not in elem.keys():
|
||||
elem.set('id', 'calibre_css_id_%d'%css_counter)
|
||||
css_counter += 1
|
||||
css.append('#%s {%s}'%(elem.get('id'), elem.get('style')))
|
||||
elem.attrib.pop('style')
|
||||
chapter_counter = 1
|
||||
for chapter in self.detected_chapters:
|
||||
if chapter.tag.lower() == 'a':
|
||||
if 'name' in chapter.keys():
|
||||
chapter.attrib['id'] = id = chapter.get('name')
|
||||
elif 'id' in chapter.keys():
|
||||
id = chapter.get('id')
|
||||
else:
|
||||
id = 'calibre_detected_chapter_%d'%chapter_counter
|
||||
chapter_counter += 1
|
||||
chapter.set('id', id)
|
||||
else:
|
||||
if 'id' not in chapter.keys():
|
||||
id = 'calibre_detected_chapter_%d'%chapter_counter
|
||||
chapter_counter += 1
|
||||
chapter.set('id', id)
|
||||
css.append('#%s {%s}'%(id, 'page-break-before:always'))
|
||||
|
||||
self.raw_css = '\n\n'.join(css)
|
||||
# TODO: Figure out what to do about CSS imports from linked stylesheets
|
||||
|
||||
def config(defaults=None):
|
||||
desc = _('Options to control the traversal of HTML')
|
||||
if defaults is None:
|
||||
c = Config('html', desc)
|
||||
else:
|
||||
c = StringConfig(defaults, desc)
|
||||
|
||||
c.add_opt('output', ['-o', '--output'], default=None,
|
||||
help=_('The output directory. Default is the current directory.'))
|
||||
c.add_opt('encoding', ['--encoding'], default=None,
|
||||
help=_('Character encoding for HTML files. Default is to auto detect.'))
|
||||
|
||||
traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
|
||||
traversal('breadth_first', ['--breadth-first'], default=False,
|
||||
help=_('Traverse links in HTML files breadth first. Normally, they are traversed depth first'))
|
||||
traversal('max_levels', ['--max-levels'], default=sys.getrecursionlimit(), group='traversal',
|
||||
help=_('Maximum levels of recursion when following links in HTML files. Must be non-negative. 0 implies that no links in the root HTML file are followed.'))
|
||||
|
||||
metadata = c.add_group('metadata', _('Set metadata of the generated ebook'))
|
||||
metadata('title', ['-t', '--title'], default=None,
|
||||
help=_('Set the title. Default is to autodetect.'))
|
||||
metadata('authors', ['-a', '--authors'], default=_('Unknown'),
|
||||
help=_('The author(s) of the ebook, as a comma separated list.'))
|
||||
|
||||
debug = c.add_group('debug', _('Options useful for debugging'))
|
||||
debug('verbose', ['-v', '--verbose'], default=0, action='count',
|
||||
help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))
|
||||
|
||||
return c
|
||||
|
||||
def option_parser():
|
||||
c = config()
|
||||
return c.option_parser(usage=_('''\
|
||||
%prog [options] file.html
|
||||
|
||||
Follow all links in an HTML file and collect them into the specified directory.
|
||||
Also collects any references resources like images, stylesheets, scripts, etc.
|
||||
'''))
|
||||
|
||||
def safe_option_parser():
|
||||
return option_parser(safe=True)
|
||||
|
||||
def search_for_opf(dir):
|
||||
for f in os.listdir(dir):
|
||||
if f.lower().endswith('.opf'):
|
||||
return OPFReader(open(os.path.join(dir, f), 'rb'), dir)
|
||||
|
||||
|
||||
def get_filelist(htmlfile, opts):
|
||||
print 'Building file list...'
|
||||
|
||||
opf = search_for_opf(os.path.dirname(htmlfile))
|
||||
if opf is not None:
|
||||
filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
|
||||
if not filelist:
|
||||
filelist = traverse(htmlfile, verbose=opts.verbose, encoding=opts.encoding)\
|
||||
[0 if opts.breadth_first else 1]
|
||||
if opts.verbose:
|
||||
print '\tFound files...'
|
||||
for f in filelist:
|
||||
print '\t\t', f
|
||||
|
||||
return opf, filelist
|
||||
|
||||
def parse_content(filelist, opts):
|
||||
if not opts.output:
|
||||
opts.output = '.'
|
||||
opts.output = os.path.abspath(opts.output)
|
||||
rdir = os.path.join(opts.output, 'content', 'resources')
|
||||
if not os.path.exists(rdir):
|
||||
os.makedirs(rdir)
|
||||
resource_map = {}
|
||||
for htmlfile in filelist:
|
||||
Parser(htmlfile, opts, os.path.join(opts.output, 'content'),
|
||||
resource_map, filelist)
|
||||
return resource_map
|
||||
|
||||
def merge_metadata(htmlfile, opf, opts):
|
||||
if opf:
|
||||
mi = MetaInformation(opf)
|
||||
else:
|
||||
mi = get_metadata(open(htmlfile, 'rb'), 'html')
|
||||
if opts.title:
|
||||
mi.title = opts.title
|
||||
if opts.authors != _('Unknown'):
|
||||
opts.authors = opts.authors.split(',')
|
||||
opts.authors = [a.strip() for a in opts.authors]
|
||||
mi.authors = opts.authors
|
||||
|
||||
if not mi.title:
|
||||
mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
|
||||
if not mi.authors:
|
||||
mi.authors = [_('Unknown')]
|
||||
|
||||
def create_metadata(basepath, mi, filelist, resources):
|
||||
mi = OPFCreator(basepath, mi)
|
||||
entries = [(f.path, None) for f in filelist] + [(f, None) for f in resources]
|
||||
mi.create_manifest(entries)
|
||||
mi.create_spine([f.path for f in filelist])
|
||||
return mi
|
||||
|
||||
def create_dir(htmlfile, opts):
|
||||
opf, filelist = get_filelist(htmlfile, opts)
|
||||
mi = merge_metadata(htmlfile, opf, opts)
|
||||
resources = [os.path.join(opts.output, 'content', f) for f in parse_content(filelist, opts).values()]
|
||||
if opf.cover and os.access(opf.cover, os.R_OK):
|
||||
cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))
|
||||
shutil.copyfile(opf.cover, cpath)
|
||||
resources.append(cpath)
|
||||
mi = create_metadata(opts.output, mi, filelist, resources)
|
||||
with open(os.path.join(opts.output, 'metadata.opf'), 'wb') as f:
|
||||
mi.render(f)
|
||||
print 'Open ebook created in', opts.output
|
||||
|
||||
def create_oebzip(htmlfile, opts):
|
||||
tdir = PersistentTemporaryDirectory('_create_oebzip')
|
||||
if opts.output is None:
|
||||
opts.output = os.path.join(os.path.splitext(htmlfile)[0]+'.oeb.zip')
|
||||
ofile = opts.output
|
||||
opts.output = tdir
|
||||
create_dir(htmlfile, opts)
|
||||
zf = ZipFile(ofile, 'w')
|
||||
zf.add_dir(opts.output)
|
||||
print 'Output saved to', ofile
|
||||
|
||||
def main(args=sys.argv):
|
||||
parser = option_parser()
|
||||
opts, args = parser.parse_args(args)
|
||||
if len(args) < 2:
|
||||
parser.print_help()
|
||||
print _('You must specify an input HTML file')
|
||||
return 1
|
||||
|
||||
htmlfile = args[1]
|
||||
create_dir(htmlfile, opts)
|
||||
|
||||
return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
|
||||
|
||||
|
@ -25,6 +25,7 @@ entry_points = {
|
||||
'epub-meta = calibre.ebooks.metadata.epub:main',
|
||||
'txt2lrf = calibre.ebooks.lrf.txt.convert_from:main',
|
||||
'html2lrf = calibre.ebooks.lrf.html.convert_from:main',
|
||||
'html2oeb = calibre.ebooks.html:main',
|
||||
'html2epub = calibre.ebooks.epub.from_html:main',
|
||||
'markdown-calibre = calibre.ebooks.markdown.markdown:main',
|
||||
'lit2lrf = calibre.ebooks.lrf.lit.convert_from:main',
|
||||
@ -168,6 +169,8 @@ def setup_completion(fatal_errors):
|
||||
from calibre.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
|
||||
from calibre.ebooks.metadata.epub import option_parser as epub_meta
|
||||
from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop
|
||||
from calibre.ebooks.epub.from_html import option_parser as html2epub
|
||||
from calibre.ebooks.html import option_parser as html2oeb
|
||||
|
||||
f = open_file('/etc/bash_completion.d/libprs500')
|
||||
f.close()
|
||||
@ -203,6 +206,8 @@ def setup_completion(fatal_errors):
|
||||
f.write(opts_and_exts('comic2lrf', comicop, ['cbz', 'cbr']))
|
||||
f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
|
||||
f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
|
||||
f.write(opts_and_exts('html2epub', html2epub, ['html', 'htm', 'xhtm', 'xhtml']))
|
||||
f.write(opts_and_exts('html2oeb', html2oeb, ['html', 'htm', 'xhtm', 'xhtml']))
|
||||
f.write('''
|
||||
_prs500_ls()
|
||||
{
|
||||
|
@ -169,7 +169,7 @@ class Option(object):
|
||||
self.metavar = metavar
|
||||
|
||||
def __eq__(self, other):
    '''
    Options compare equal by name. `other` may be another Option or a
    bare string naming an option: the getattr fallback returns `other`
    itself when it has no `name` attribute, so a plain string is
    compared directly against self.name.
    '''
    # NOTE(review): the original span contained two consecutive return
    # statements (pre/post-refactor diff residue); the unreachable old
    # one (fallback None) has been dropped in favor of the newer form.
    return self.name == getattr(other, 'name', other)
|
||||
|
||||
class OptionValues(object):
|
||||
|
||||
@ -203,6 +203,19 @@ class OptionSet(object):
|
||||
self.group_list.append(name)
|
||||
return partial(self.add_opt, group=name)
|
||||
|
||||
def update(self, other):
    '''
    Merge the groups and preferences from `other` (another OptionSet)
    into this one. A preference already present is replaced by the
    version from `other` (it is moved to the end of the list).
    '''
    for name in other.groups.keys():
        # Keep group_list in sync with groups: rendering iterates
        # group_list (not groups), so a group copied only into the
        # dict would never have its options rendered.
        if name not in self.groups:
            self.group_list.append(name)
        self.groups[name] = other.groups[name]
    for pref in other.preferences:
        if pref in self.preferences:
            self.preferences.remove(pref)
        self.preferences.append(pref)
|
||||
|
||||
def remove_opt(self, name):
    '''
    Remove the preference named `name` from this set. A no-op when no
    such preference exists.
    '''
    # list.remove matches via the same __eq__ used by the `in` test,
    # so removing by name works even though preferences hold objects.
    try:
        self.preferences.remove(name)
    except ValueError:
        pass
|
||||
|
||||
|
||||
def add_opt(self, name, switches=[], help=None, type=None, choices=None,
|
||||
group=None, default=None, action=None, metavar=None):
|
||||
'''
|
||||
@ -307,19 +320,34 @@ class OptionSet(object):
|
||||
for name in [None] + self.group_list]
|
||||
return src + '\n\n'.join(groups)
|
||||
|
||||
class ConfigInterface(object):
    '''
    Common behavior for configurations: all option management is
    delegated to an internal OptionSet. Subclasses must provide a
    parse() method returning the user defaults.
    '''

    def __init__(self, description):
        self.option_set = OptionSet(description=description)
        # Expose the OptionSet's management methods directly
        self.add_opt = self.option_set.add_opt
        self.add_group = self.option_set.add_group
        self.remove_opt = self.option_set.remove_opt

    def update(self, other):
        # Merge options/groups from another ConfigInterface
        self.option_set.update(other.option_set)

    def option_parser(self, usage='', gui_mode=False):
        # NOTE(review): the pre-refactor `class Config(object)` header and
        # its basename-taking __init__ were interleaved here as merge
        # residue; they are superseded by the Config subclass defined below.
        return self.option_set.option_parser(user_defaults=self.parse(),
                                             usage=usage, gui_mode=gui_mode)
||||
class Config(ConfigInterface):
|
||||
'''
|
||||
A file based configuration.
|
||||
'''
|
||||
|
||||
def __init__(self, basename, description=''):
|
||||
ConfigInterface.__init__(self, description)
|
||||
self.config_file_path = os.path.join(config_dir, basename+'.py')
|
||||
|
||||
|
||||
def parse(self):
|
||||
src = ''
|
||||
if os.path.exists(self.config_file_path):
|
||||
try:
|
||||
with ExclusiveFile(self.config_file_path) as f:
|
||||
src = f.read()
|
||||
@ -352,17 +380,14 @@ class Config(object):
|
||||
except LockError:
|
||||
raise IOError('Could not lock config file: %s'%self.config_file_path)
|
||||
|
||||
class StringConfig(ConfigInterface):
    '''
    A string based configuration
    '''

    def __init__(self, src, description=''):
        # NOTE(review): this span contained both the pre-refactor
        # (object-based, alias-assignment) and post-refactor versions of
        # the class interleaved as merge residue; only the refactored
        # ConfigInterface-based version is kept.
        ConfigInterface.__init__(self, description)
        # Raw configuration text parsed for user defaults
        self.src = src

    def parse(self):
        # Parse the stored configuration string into option values
        return self.option_set.parse_string(self.src)
|
@ -1034,6 +1034,7 @@ class ZipFile:
|
||||
os.makedirs(upperdirs)
|
||||
|
||||
source = self.open(member, pwd=pwd)
|
||||
if not os.path.exists(targetpath): # Could be a previously automatically created directory
|
||||
target = open(targetpath, "wb")
|
||||
shutil.copyfileobj(source, target)
|
||||
source.close()
|
||||
@ -1067,6 +1068,8 @@ class ZipFile:
|
||||
def write(self, filename, arcname=None, compress_type=None):
|
||||
"""Put the bytes from filename into the archive under the name
|
||||
arcname."""
|
||||
if isinstance(filename, unicode):
|
||||
filename = filename.encode('utf-8')
|
||||
if not self.fp:
|
||||
raise RuntimeError(
|
||||
"Attempt to write to ZIP archive that was already closed")
|
||||
@ -1133,15 +1136,17 @@ class ZipFile:
|
||||
self.filelist.append(zinfo)
|
||||
self.NameToInfo[zinfo.filename] = zinfo
|
||||
|
||||
def writestr(self, zinfo_or_arcname, bytes):
|
||||
def writestr(self, zinfo_or_arcname, bytes, permissions=0600):
|
||||
"""Write a file into the archive. The contents is the string
|
||||
'bytes'. 'zinfo_or_arcname' is either a ZipInfo instance or
|
||||
the name of the file in the archive."""
|
||||
if not isinstance(zinfo_or_arcname, ZipInfo):
|
||||
if isinstance(zinfo_or_arcname, unicode):
|
||||
zinfo_or_arcname = zinfo_or_arcname.encode('utf-8')
|
||||
zinfo = ZipInfo(filename=zinfo_or_arcname,
|
||||
date_time=time.localtime(time.time())[:6])
|
||||
zinfo.compress_type = self.compression
|
||||
zinfo.external_attr = 0600 << 16
|
||||
zinfo.external_attr = permissions << 16
|
||||
else:
|
||||
zinfo = zinfo_or_arcname
|
||||
|
||||
@ -1172,6 +1177,23 @@ class ZipFile:
|
||||
self.filelist.append(zinfo)
|
||||
self.NameToInfo[zinfo.filename] = zinfo
|
||||
|
||||
def add_dir(self, path, prefix=''):
    '''
    Recursively add the directory `path` to the archive. If `prefix`
    is given, entries are stored under that path inside the archive.
    '''
    if prefix:
        # Explicit directory entry with 0700 permissions
        self.writestr(prefix+'/', '', 0700)
    cwd = os.path.abspath(os.getcwd())
    try:
        # chdir so listdir/isdir results are naturally relative to `path`
        os.chdir(path)
        # Archive-name prefix, normalized to at most one trailing slash
        fp = (prefix + ('/' if prefix else '')).replace('//', '/')
        for f in os.listdir('.'):
            arcname = fp + f
            if os.path.isdir(f):
                self.add_dir(f, prefix=arcname)
            else:
                self.write(f, arcname)
    finally:
        # Always restore the working directory, even on error
        os.chdir(cwd)
|
||||
|
||||
|
||||
def __del__(self):
    """Call the "close()" method in case the user forgot."""
    # Best-effort cleanup only: __del__ timing is not guaranteed, so
    # callers should still close() explicitly.
    self.close()
|
||||
|
Loading…
x
Reference in New Issue
Block a user