IGN:Full implementation of HTML traversal

2025-08-30 23:00:21 -04:00 · 2008-08-25 16:42:07 -07:00 · 2008-08-25 16:42:07 -07:00 · 39afcb27f7
commit 39afcb27f7
parent 2efa1ec708
6 changed files with 597 additions and 236 deletions
--- a/src/calibre/ebooks/epub/init.py
+++ b/src/calibre/ebooks/epub/init.py
@ -8,6 +8,7 @@ Conversion to EPUB.
 '''
 import sys
 from calibre.utils.config import Config, StringConfig
 from calibre.ebooks.html import config as common_config
 def config(defaults=None):
    desc = _('Options to control the conversion to EPUB')
@ -15,23 +16,12 @@ def config(defaults=None):
        c = Config('epub', desc)
    else:
        c = StringConfig(defaults, desc)
-        
+    
    c.update(common_config())
    c.remove_opt('output')
    c.add_opt('output', ['-o', '--output'], default=None,
             help=_('The output EPUB file. If not specified, it is derived from the input file name.'))
    c.add_opt('encoding', ['--encoding'], default=None, 
              help=_('Character encoding for HTML files. Default is to auto detect.'))
    metadata = c.add_group('metadata', _('Set metadata of the generated ebook'))
    metadata('title', ['-t', '--title'], default=None,
             help=_('Set the title. Default is to autodetect.'))
    metadata('authors', ['-a', '--authors'], default=_('Unknown'),
             help=_('The author(s) of the ebook, as a comma separated list.'))
    traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
    traversal('breadth_first', ['--breadth-first'], default=False,
              help=_('Traverse links in HTML files breadth first. Normally, they are traversed depth first'))
    traversal('max_levels', ['--max-levels'], default=sys.getrecursionlimit(), group='traversal',
              help=_('Maximum levels of recursion when following links in HTML files. Must be non-negative. 0 implies that no links in the root HTML file are followed.'))
    structure = c.add_group('structure detection', _('Control auto-detection of document structure.'))
    structure('chapter', ['--chapter'], default="//*[re:match(name(), 'h[1-2]') and re:test(., 'chapter|book|section', 'i')]",
@ -46,8 +36,5 @@ help on using this feature.
              help=_('Don\'t add detected chapters to the Table of Contents'))
    structure('no_links_in_toc', ['--no-links-in-toc'], default=False,
              help=_('Don\'t add links in the root HTML file to the Table of Contents'))
    debug = c.add_group('debug', _('Options useful for debugging'))
    debug('verbose', ['-v', '--verbose'], default=0, action='count',
          help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))
    return c
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@ -2,44 +2,22 @@ from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
-import os, sys, logging, re, shutil, tempfile
+import os, sys, re, shutil
 from lxml import html
 from lxml.etree import XPath
 get_text = XPath("//text()")
-from calibre import LoggingInterface
+from calibre.ebooks.html import Parser, get_text, merge_metadata, get_filelist
 from calibre.ebooks.html import PreProcessor
 from calibre.ebooks.epub import config as common_config
 from calibre.ebooks.epub.traverse import traverse, opf_traverse
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ebooks.metadata.meta import get_metadata
 from calibre.ebooks.metadata.opf import OPFReader
 from calibre.ptempfile import PersistentTemporaryDirectory
-class HTMLProcessor(PreProcessor, LoggingInterface):
+class HTMLProcessor(Parser):
    ENCODING_PATS = [re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
                     re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>', re.IGNORECASE)]
    def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles):
-        LoggingInterface.__init__(self, logging.getLogger('html2epub'))
+        Parser.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, 
-        self.setup_cli_handler(opts.verbose)
+                        name='html2epub')
        self.htmlfile = htmlfile
        self.opts = opts
        self.tdir = tdir
        self.resource_map = resource_map
        self.resource_dir = os.path.join(tdir, 'resources')
        self.htmlfiles = htmlfiles
        self.parse_html()
        self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
        if opts.verbose > 2:
            self.debug_tree('parsed')
-        
+        self.detected_chapters = self.opts.chapter(self.root)
        self.extract_css()
        if opts.verbose > 2:
@ -49,130 +27,6 @@ class HTMLProcessor(PreProcessor, LoggingInterface):
        self.split()
    def debug_tree(self, name):
        '''
        Dump source tree for later debugging.
        '''
        tdir = tempfile.gettempdir()
        if not os.path.exists(tdir):
            os.makedirs(tdir)
        with open(os.path.join(tdir, 'html2epub-%s-%s.html'%\
                    (os.path.basename(self.htmlfile.path), name)), 'wb') as f:
            f.write(html.tostring(self.root, encoding='utf-8'))
            self.log_debug(_('Written processed HTML to ')+f.name)
    def parse_html(self):
        ''' Create lxml ElementTree from HTML '''
        self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
        src = open(self.htmlfile.path, 'rb').read().decode(self.htmlfile.encoding, 'replace')
        src = self.preprocess(src)
        # lxml chokes on unicode input when it contains encoding declarations
        for pat in self.ENCODING_PATS: 
            src = pat.sub('', src)
        try:
            self.root = html.document_fromstring(src)
        except:
            if self.opts.verbose:
                self.log_exception('lxml based parsing failed')
            self.root = html.soupparser.fromstring()
        self.head = self.body = None
        head = self.root.xpath('//head')
        if head:
            self.head = head[0]
        body = self.root.xpath('//body')
        if body:
            self.body = body[0]
        self.detected_chapters = self.opts.chapter(self.root)
    def rewrite_links(self, olink):
        '''
        Make all links in document relative so that they work in the EPUB container.
        Also copies any resources (like images, stylesheets, scripts, etc.) into
        the local tree.
        '''
        if not isinstance(olink, unicode):
            olink = olink.decode(self.htmlfile.encoding)
        link = self.htmlfile.resolve(olink)
        if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
            return olink
        if link.path in self.htmlfiles:
            return os.path.basename(link.path)
        if link.path in self.resource_map.keys():
            return self.resource_map[link.path]
        name = os.path.basename(link.path)
        name, ext = os.path.splitext(name)
        name += ('_%d'%len(self.resource_map)) + ext
        shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
        name = 'resources/' + name
        self.resource_map[link.path] = name 
        return name
    def extract_css(self):
        '''
        Remove all CSS information from the document and store in self.raw_css. 
        This includes <font> tags.
        '''
        css = []
        for link in self.root.xpath('//link'):
            if 'css' in link.get('type', 'text/css').lower():
                file = self.htmlfile.resolve(link.get('href', ''))
                if os.path.exists(file) and os.path.isfile(file):
                    css.append(open(file, 'rb').read().decode('utf-8'))
                link.getparent().remove(link)
        for style in self.root.xpath('//style'):
            if 'css' in style.get('type', 'text/css').lower():
                css.append('\n'.join(get_text(style)))
                style.getparent().remove(style)
        font_id = 1
        for font in self.root.xpath('//font'):
            try:
                size = int(font.attrib.pop('size', '3'))
            except:
                size = 3
            setting = 'font-size: %d%%;'%int((float(size)/3) * 100)
            face = font.attrib.pop('face', None)
            if face is not None:
                setting += 'font-face:%s;'%face
            color = font.attrib.pop('color', None)
            if color is not None:
                setting += 'color:%s'%color
            id = 'calibre_font_id_%d'%font_id
            font.set('id', 'calibre_font_id_%d'%font_id)
            font_id += 1
            css.append('#%s { %s }'%(id, setting))
        css_counter = 1
        for elem in self.root.xpath('//*[@style]'):
            if 'id' not in elem.keys():
                elem.set('id', 'calibre_css_id_%d'%css_counter)
                css_counter += 1
            css.append('#%s {%s}'%(elem.get('id'), elem.get('style')))
            elem.attrib.pop('style')
        chapter_counter = 1
        for chapter in self.detected_chapters:
            if chapter.tag.lower() == 'a':
                if 'name' in chapter.keys():
                    chapter.attrib['id'] = id = chapter.get('name')
                elif 'id' in chapter.keys():
                    id = chapter.get('id')
                else:
                    id = 'calibre_detected_chapter_%d'%chapter_counter
                    chapter_counter += 1
                    chapter.set('id', id)
            else:
                if 'id' not in chapter.keys():
                    id = 'calibre_detected_chapter_%d'%chapter_counter
                    chapter_counter += 1
                    chapter.set('id', id)
            css.append('#%s {%s}'%(id, 'page-break-before:always'))
        self.raw_css = '\n\n'.join(css)
        # TODO: Figure out what to do about CSS imports from linked stylesheets 
    def collect_font_statistics(self):
        '''
        Collect font statistics to figure out the base font size used in this
@ -191,8 +45,8 @@ class HTMLProcessor(PreProcessor, LoggingInterface):
        pass
-def config():
+def config(defaults=None):
-    c = common_config()
+    c = common_config(defaults=defaults)
    return c
 def option_parser():
@ -203,11 +57,6 @@ def option_parser():
 Convert a HTML file to an EPUB ebook. Follows links in the HTML file. 
 '''))
 def search_for_opf(dir):
    for f in os.listdir(dir):
        if f.lower().endswith('.opf'):
            return OPFReader(open(os.path.join(dir, f), 'rb'), dir)
 def parse_content(filelist, opts):
    tdir = PersistentTemporaryDirectory('_html2epub')
    os.makedirs(os.path.join(tdir, 'content', 'resources'))
@ -221,39 +70,17 @@ def convert(htmlfile, opts, notification=None):
    if opts.output is None:
        opts.output = os.path.splitext(os.path.basename(htmlfile))[0] + '.epub'
    opts.output = os.path.abspath(opts.output)
-    opf = search_for_opf(os.path.dirname(htmlfile))
+    opf, filelist = get_filelist(htmlfile, opts)
-    if opf:
+    mi = merge_metadata(htmlfile, opf, opts)
        mi = MetaInformation(opf)
    else:
        mi =  get_metadata(open(htmlfile, 'rb'), 'html')
    if opts.title:
        mi.title = opts.title
    if opts.authors != _('Unknown'):
        opts.authors   = opts.authors.split(',')
        opts.authors = [a.strip() for a in opts.authors]
        mi.authors = opts.authors
    if not mi.title:
        mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
    if not mi.authors:
        mi.authors = [_('Unknown')]
    opts.chapter = XPath(opts.chapter, 
                    namespaces={'re':'http://exslt.org/regular-expressions'})
-    
+    resource_map = parse_content(filelist, opts)
-    filelist = None
+    resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]
-    print 'Building file list...'
+    if opf.cover and os.access(opf.cover, os.R_OK):
-    if opf is not None:
+        shutil.copyfile(opf.cover, os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)))
-        filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
+        cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))
-    if not filelist:
+        shutil.copyfile(opf.cover, cpath)
-        filelist = traverse(htmlfile, verbose=opts.verbose, encoding=opts.encoding)\
+        resources.append(cpath)
                    [0 if opts.breadth_first else 1]
    if opts.verbose:
        print '\tFound files...'
        for f in filelist:
            print '\t\t', f
    parse_content(filelist, opts)
 def main(args=sys.argv):
    parser = option_parser()
@ -266,5 +93,4 @@ def main(args=sys.argv):
    return 0
 if __name__ == '__main__':
-    sys.exit(main())
+    sys.exit(main())
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@ -1,8 +1,220 @@
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
-import re
+import sys, re, os, shutil, logging, tempfile
 from urlparse import urlparse
 from urllib import unquote
 from lxml import html
 from lxml.etree import XPath
 get_text = XPath("//text()")
 from calibre import LoggingInterface, unicode_path
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.utils.config import Config, StringConfig
 from calibre.ebooks.metadata.opf import OPFReader, OPFCreator
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ebooks.metadata.meta import get_metadata
 from calibre.ptempfile import PersistentTemporaryDirectory
 from calibre.utils.zipfile import ZipFile
 class Link(object):
    '''
    Represents a link in a HTML file.

    Links are compared by the local path they resolve to, and a link also
    compares equal to a bare path string (see :meth:`__eq__`).
    '''
    @classmethod
    def url_to_local_path(cls, url, base):
        '''
        Return the local path referred to by a parsed URL.

        :param url:  The result of calling `urlparse` on the link.
        :param base: Directory that relative paths are resolved against.
        :return:     `url.path` unchanged if it is absolute, otherwise the
                     absolute path obtained by joining it onto `base`.
        '''
        path = url.path
        if os.path.isabs(path):
            return path
        return os.path.abspath(os.path.join(base, path))
    def __init__(self, url, base):
        '''
        :param url:  The url this link points to. Must be an unquoted unicode string.
        :param base: The base directory that relative URLs are with respect to.
                     Must be a unicode string.
        '''
        assert isinstance(url, unicode) and isinstance(base, unicode)
        self.url         = url
        self.parsed_url  = urlparse(unquote(self.url))
        # Links without a scheme, or with the file scheme, point at this machine
        self.is_local    = self.parsed_url.scheme in ('', 'file')
        # A local link with an empty path (e.g. "#anchor") stays inside the document
        self.is_internal = self.is_local and not bool(self.parsed_url.path)
        self.path        = None
        self.fragment    = self.parsed_url.fragment
        if self.is_local and not self.is_internal:
            self.path = self.url_to_local_path(self.parsed_url, base)
    def __hash__(self):
        # NOTE(review): links without a local path hash by URL but all compare
        # equal to each other (None == None in __eq__), so == and hash()
        # disagree for them -- confirm the intended semantics before keeping
        # such links in sets or dicts.
        if self.path is None:
            return hash(self.url)
        return hash(self.path)
    def __eq__(self, other):
        # `other` may be another object with a `path` attribute or a bare path
        return self.path == getattr(other, 'path', other)
    def __ne__(self, other):
        # Python 2 does not derive != from ==; without this, != would fall
        # back to identity comparison and contradict __eq__.
        return not self.__eq__(other)
    def __str__(self):
        return u'Link: %s --> %s'%(self.url, self.path)
 class IgnoreFile(Exception):
    '''
    Raised for a file that could not be read and should be left out.

    The numeric error code is kept in `errno`; `doesnt_exist` is True when
    that code is 2.
    '''
    def __init__(self, msg, errno):
        super(IgnoreFile, self).__init__(msg)
        self.errno = errno
        self.doesnt_exist = (errno == 2)
 class HTMLFile(object):
    '''
    Contains basic information about an HTML file. This
    includes a list of links to other files as well as
    the encoding of each file. Also tries to detect if the file is not a HTML
    file in which case :member:`is_binary` is set to True.
    The encoding of the file is available as :member:`encoding`.
    '''
    HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
    LINK_PAT = re.compile(
    r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s]+))',
    re.DOTALL|re.IGNORECASE)
    def __init__(self, path_to_html_file, level, encoding, verbose):
        '''
        :param level: The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML.
        '''
        self.path  = unicode_path(path_to_html_file, abs=True)
        self.base  = os.path.dirname(self.path)
        self.level = level
        self.links = []
        try:
            with open(self.path, 'rb') as f:
                src = f.read()
        except IOError, err:
            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
            if level == 0:
                raise IOError(msg)
            raise IgnoreFile(msg, err.errno)
        self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
        if not self.is_binary:
            if encoding is None:
                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
                self.encoding = encoding
            src = src.decode(encoding, 'replace')
            self.find_links(src)
    def __eq__(self, other):
        return self.path == getattr(other, 'path', other)
    def __str__(self):
        return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)
    def __repr__(self):
        return str(self)
    def find_links(self, src):
        for match in self.LINK_PAT.finditer(src):
            url = None
            for i in ('url1', 'url2', 'url3'):
                url = match.group(i)
                if url:
                    break
            link = self.resolve(url)
            if link not in self.links:
                self.links.append(link)
    def resolve(self, url):
        return Link(url, self.base)
 def depth_first(root, flat, visited=None):
    '''
    Generator that yields `root` followed by the files reachable from it via
    links, in depth first order. Each file is yielded once.

    :param root:    The file to start from. Must have a `links` attribute.
    :param flat:    List of all known files. A link is mapped to its file with
                    `flat.index(link)`; links with no entry in `flat` are skipped.
    :param visited: Set of files that have already been yielded. Used by the
                    recursion; callers normally leave it as `None`.
    '''
    # A fresh set per top level call. A mutable default would be shared by
    # all calls, making every traversal after the first yield only its root.
    if visited is None:
        visited = set()
    yield root
    visited.add(root)
    for link in root.links:
        if link.path is not None and link not in visited:
            try:
                index = flat.index(link)
            except ValueError: # Can happen if max_levels is used
                continue
            hf = flat[index]
            if hf not in visited:
                yield hf
                visited.add(hf)
                # The nested generator yields a file before recording it in
                # `visited`, so the check below is what filters out its root.
                for hf in depth_first(hf, flat, visited):
                    if hf not in visited:
                        yield hf
                        visited.add(hf)
 def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
    '''
    Recursively traverse all links in the HTML file.
    :param max_levels: Maximum levels of recursion. Must be non-negative. 0 
                       implies that no links in the root HTML file are followed.
    :param encoding:   Specify character encoding of HTML files. If `None` it is
                       auto-detected.
    :return:           A pair of lists (breadth_first, depth_first). Each list contains
                       :class:`HTMLFile` objects.
    '''
    assert max_levels >= 0
    level = 0
    flat =  [HTMLFile(path_to_html_file, level, encoding, verbose)]
    next_level = list(flat)
    while level < max_levels and len(next_level) > 0:
        level += 1
        nl = []
        for hf in next_level:
            rejects = []
            for link in hf.links:
                if link.path is None or link.path in flat:
                    continue
                try:
                    nf = HTMLFile(link.path, level, encoding, verbose)
                    nl.append(nf)
                    flat.append(nf)
                except IgnoreFile, err:
                    rejects.append(link)
                    if not err.doesnt_exist or verbose > 1:
                        print str(err)
            for link in rejects:
                hf.links.remove(link)
        next_level = list(nl)
    return flat, list(depth_first(flat[0], flat))
 def opf_traverse(opf_reader, verbose=0, encoding=None):
    '''
    Return a list of :class:`HTMLFile` objects in the order specified by the
    `<spine>` element of the OPF.

    :param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance.
    :param verbose:    Passed on to :class:`HTMLFile`.
    :param encoding:   Specify character encoding of HTML files. If `None` it is
                       auto-detected.
    :raises ValueError: If the OPF has no spine.
    '''
    if not opf_reader.spine:
        raise ValueError('OPF does not have a spine')
    flat = []
    for path in opf_reader.spine.items():
        # Normalise before the duplicate check: the list holds absolute
        # paths, so comparing the raw spine entry against it would let the
        # same file in twice.
        path = os.path.abspath(path)
        if path not in flat:
            flat.append(path)
    return [HTMLFile(path, 0, encoding, verbose) for path in flat]
 class PreProcessor(object):
@ -70,5 +282,289 @@ class PreProcessor(object):
        for rule in self.PREPROCESS + rules:
            html = rule[0].sub(rule[1], html)
-        return html    
+        return html
 class Parser(PreProcessor, LoggingInterface):
    '''
    Parses a single HTML file into an lxml tree, rewrites its links so that
    they are relative to the output directory and copies the resources it
    links to into `<tdir>/resources`.

    Inherits from LoggingInterface because the logging methods it provides
    (`setup_cli_handler`, `log_info`, ...) are called on `self`.
    '''
    ENCODING_PATS = [re.compile(r'<[^<>]+encoding=[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE),
                     re.compile(r'<meta.*?content=[\'"].*?charset=([^\s\'"]+).*?[\'"].*?>', re.IGNORECASE)]
    def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'):
        '''
        :param htmlfile:     The file to parse. Its `path` and `encoding`
                             attributes and its `resolve` method are used.
        :param opts:         Options object; `opts.verbose` is read.
        :param tdir:         Output directory for the content.
        :param resource_map: Dictionary mapping source paths of copied
                             resources to their new relative names. Updated in place.
        :param htmlfiles:    Collection of all HTML files being processed. Links
                             to these are rewritten to their base name.
        :param name:         Name of the logger, also used in debug file names.
        '''
        LoggingInterface.__init__(self, logging.getLogger(name))
        self.setup_cli_handler(opts.verbose)
        self.name = name # Needed by debug_tree
        self.htmlfile = htmlfile
        self.opts = opts
        self.tdir = tdir
        self.resource_map = resource_map
        self.htmlfiles = htmlfiles
        self.resource_dir = os.path.join(tdir, 'resources')
        self.parse_html()
        self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
    def parse_html(self):
        ''' Create lxml ElementTree from HTML '''
        self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
        with open(self.htmlfile.path, 'rb') as f:
            src = f.read().decode(self.htmlfile.encoding, 'replace')
        src = self.preprocess(src)
        # lxml chokes on unicode input when it contains encoding declarations
        for pat in self.ENCODING_PATS:
            src = pat.sub('', src)
        try:
            self.root = html.document_fromstring(src)
        except Exception:
            if self.opts.verbose:
                self.log_exception('lxml based parsing failed')
            # Fall back to the BeautifulSoup based parser. It is a submodule
            # that has to be imported explicitly and it needs the source.
            from lxml.html import soupparser
            self.root = soupparser.fromstring(src)
        self.head = self.body = None
        head = self.root.xpath('//head')
        if head:
            self.head = head[0]
        body = self.root.xpath('//body')
        if body:
            self.body = body[0]
    def debug_tree(self, name):
        '''
        Dump source tree for later debugging.
        '''
        tdir = tempfile.gettempdir()
        if not os.path.exists(tdir):
            os.makedirs(tdir)
        with open(os.path.join(tdir, '%s-%s-%s.html'%\
                    (self.name, os.path.basename(self.htmlfile.path), name)), 'wb') as f:
            f.write(html.tostring(self.root, encoding='utf-8'))
            self.log_debug(_('Written processed HTML to ')+f.name)
    def rewrite_links(self, olink):
        '''
        Make all links in document relative so that they work in the EPUB container.
        Also copies any resources (like images, stylesheets, scripts, etc.) into
        the local tree.

        :param olink: The original link.
        :return: The rewritten link, or `olink` itself if it does not point to
                 an existing local file.
        '''
        if not isinstance(olink, unicode):
            olink = olink.decode(self.htmlfile.encoding)
        link = self.htmlfile.resolve(olink)
        if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
            return olink
        if link.path in self.htmlfiles:
            return os.path.basename(link.path)
        if link.path in self.resource_map:
            return self.resource_map[link.path]
        name = os.path.basename(link.path)
        name, ext = os.path.splitext(name)
        # The running count keeps names unique when base names clash
        name += ('_%d'%len(self.resource_map)) + ext
        shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
        name = 'resources/' + name
        self.resource_map[link.path] = name
        return name
    def extract_css(self):
        '''
        Remove all CSS information from the document and store in self.raw_css.
        This includes <font> tags.
        '''
        css = []
        for link in self.root.xpath('//link'):
            if 'css' in link.get('type', 'text/css').lower():
                # __init__ has already rewritten the links, so a stylesheet
                # that was copied is now referenced relative to self.tdir.
                path = os.path.join(self.tdir, link.get('href', ''))
                if os.path.isfile(path):
                    with open(path, 'rb') as f:
                        css.append(f.read().decode('utf-8', 'replace'))
                link.getparent().remove(link)
        for style in self.root.xpath('//style'):
            if 'css' in style.get('type', 'text/css').lower():
                css.append('\n'.join(get_text(style)))
                style.getparent().remove(style)
        font_id = 1
        for font in self.root.xpath('//font'):
            try:
                size = int(font.attrib.pop('size', '3'))
            except ValueError:
                size = 3
            # Size 3 is mapped to 100%
            setting = 'font-size: %d%%;'%int((float(size)/3) * 100)
            face = font.attrib.pop('face', None)
            if face is not None:
                # font-family is the CSS property; font-face is not one
                setting += 'font-family:%s;'%face
            color = font.attrib.pop('color', None)
            if color is not None:
                setting += 'color:%s'%color
            css_id = 'calibre_font_id_%d'%font_id
            font.set('id', css_id)
            font_id += 1
            css.append('#%s { %s }'%(css_id, setting))
        css_counter = 1
        for elem in self.root.xpath('//*[@style]'):
            if 'id' not in elem.keys():
                elem.set('id', 'calibre_css_id_%d'%css_counter)
                css_counter += 1
            css.append('#%s {%s}'%(elem.get('id'), elem.get('style')))
            elem.attrib.pop('style')
        chapter_counter = 1
        # This class never sets detected_chapters itself; a subclass may.
        for chapter in getattr(self, 'detected_chapters', []):
            if chapter.tag.lower() == 'a' and 'name' in chapter.keys():
                css_id = chapter.get('name')
                chapter.set('id', css_id)
            elif 'id' in chapter.keys():
                # Also covers non-anchor elements that already carry an id
                css_id = chapter.get('id')
            else:
                css_id = 'calibre_detected_chapter_%d'%chapter_counter
                chapter_counter += 1
                chapter.set('id', css_id)
            css.append('#%s {%s}'%(css_id, 'page-break-before:always'))
        self.raw_css = '\n\n'.join(css)
        # TODO: Figure out what to do about CSS imports from linked stylesheets
 def config(defaults=None):
    '''
    Build the set of options understood by the HTML traversal code.

    :param defaults: If `None` a `Config` named 'html' is created, otherwise a
                     `StringConfig` is created from `defaults`.
    :return: The populated configuration object.
    '''
    desc = _('Options to control the traversal of HTML')
    c = Config('html', desc) if defaults is None else StringConfig(defaults, desc)
    c.add_opt('output', ['-o', '--output'], default=None,
             help=_('The output directory. Default is the current directory.'))
    c.add_opt('encoding', ['--encoding'], default=None,
              help=_('Character encoding for HTML files. Default is to auto detect.'))
    # Each add_group call returns a callable that adds an option to that group
    add_traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
    add_traversal('breadth_first', ['--breadth-first'], default=False,
              help=_('Traverse links in HTML files breadth first. Normally, they are traversed depth first'))
    add_traversal('max_levels', ['--max-levels'], default=sys.getrecursionlimit(), group='traversal',
              help=_('Maximum levels of recursion when following links in HTML files. Must be non-negative. 0 implies that no links in the root HTML file are followed.'))
    add_metadata = c.add_group('metadata', _('Set metadata of the generated ebook'))
    add_metadata('title', ['-t', '--title'], default=None,
             help=_('Set the title. Default is to autodetect.'))
    add_metadata('authors', ['-a', '--authors'], default=_('Unknown'),
             help=_('The author(s) of the ebook, as a comma separated list.'))
    add_debug = c.add_group('debug', _('Options useful for debugging'))
    add_debug('verbose', ['-v', '--verbose'], default=0, action='count',
          help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))
    return c
 def option_parser():
    ''' Return the command line option parser built from :func:`config`. '''
    parser_config = config()
    usage = _('''\
 %prog [options] file.html
 Follow all links in an HTML file and collect them into the specified directory.
 Also collects any references resources like images, stylesheets, scripts, etc. 
 ''')
    return parser_config.option_parser(usage=usage)
 def safe_option_parser():
    # NOTE(review): option_parser() as defined in this file accepts no
    # parameters, so this call looks like it raises TypeError. What `safe`
    # is meant to control is not visible here -- confirm and fix.
    return option_parser(safe=True)
 def search_for_opf(dir):
    '''
    Look for an OPF file in the directory `dir`.

    :return: An `OPFReader` for the first entry of the directory listing whose
             name ends in .opf (case insensitive), or `None` if there is none.
    '''
    candidates = [f for f in os.listdir(dir) if f.lower().endswith('.opf')]
    if candidates:
        return OPFReader(open(os.path.join(dir, candidates[0]), 'rb'), dir)
    return None
 def get_filelist(htmlfile, opts):
    print 'Building file list...'
    opf = search_for_opf(os.path.dirname(htmlfile))
    if opf is not None:
        filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
    if not filelist:
        filelist = traverse(htmlfile, verbose=opts.verbose, encoding=opts.encoding)\
                    [0 if opts.breadth_first else 1]
    if opts.verbose:
        print '\tFound files...'
        for f in filelist:
            print '\t\t', f
    return opf, filelist
 def parse_content(filelist, opts):
    '''
    Run :class:`Parser` over every file in `filelist`, with
    `<opts.output>/content` as the output directory.

    `opts.output` is replaced by its absolute form, defaulting to the current
    directory, and the `content/resources` directory is created below it if
    it does not exist.

    :return: Dictionary shared by all parsers, mapping resources to their new names.
    '''
    opts.output = os.path.abspath(opts.output or '.')
    content_dir = os.path.join(opts.output, 'content')
    resource_dir = os.path.join(content_dir, 'resources')
    if not os.path.exists(resource_dir):
        os.makedirs(resource_dir)
    resource_map = {}
    for htmlfile in filelist:
        Parser(htmlfile, opts, content_dir, resource_map, filelist)
    return resource_map
 def merge_metadata(htmlfile, opf, opts):
    '''
    Combine metadata from the OPF (or, without one, from the HTML file itself)
    with the title and authors given in `opts`.

    :param htmlfile: Path to the root HTML file. Its base name is the fallback title.
    :param opf:      Metadata source used when it is true, otherwise `htmlfile` is read.
    :param opts:     Options object. `opts.authors` is replaced by a list of
                     names if it was given as a comma separated string.
    :return:         The merged metadata object.
    '''
    if opf:
        mi = MetaInformation(opf)
    else:
        mi =  get_metadata(open(htmlfile, 'rb'), 'html')
    if opts.title:
        mi.title = opts.title
    if opts.authors != _('Unknown'):
        # Already a list if this function has run on the same opts before
        if isinstance(opts.authors, basestring):
            opts.authors = opts.authors.split(',')
        opts.authors = [a.strip() for a in opts.authors]
        mi.authors = opts.authors
    if not mi.title:
        mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
    if not mi.authors:
        mi.authors = [_('Unknown')]
    # Callers assign the result, so it has to be returned
    return mi
 def create_metadata(basepath, mi, filelist, resources):
    '''
    Create an `OPFCreator` from `basepath` and `mi`, with a manifest listing
    the HTML files followed by the resources, and a spine listing the HTML files.

    :param filelist:  Objects whose `path` attribute is the path of an HTML file.
    :param resources: Paths of the resources.
    :return: The `OPFCreator` instance.
    '''
    opf = OPFCreator(basepath, mi)
    html_paths = [f.path for f in filelist]
    manifest = [(path, None) for path in html_paths]
    manifest.extend((res, None) for res in resources)
    opf.create_manifest(manifest)
    opf.create_spine(html_paths)
    return opf
 def create_dir(htmlfile, opts):
    opf, filelist = get_filelist(htmlfile, opts)
    mi = merge_metadata(htmlfile, opf, opts)
    resources = [os.path.join(opts.output, 'content', f) for f in parse_content(filelist, opts).values()]
    if opf.cover and os.access(opf.cover, os.R_OK):
        cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover))
        shutil.copyfile(opf.cover, cpath)
        resources.append(cpath)
    mi = create_metadata(opts.output, mi, filelist, resources)
    with open(os.path.join(opts.output, 'metadata.opf'), 'wb') as f:
        mi.render(f)
    print 'Open ebook created in', opts.output
 def create_oebzip(htmlfile, opts):
    tdir = PersistentTemporaryDirectory('_create_oebzip')
    if opts.output is None:
        opts.output = os.path.join(os.path.splitext(htmlfile)[0]+'.oeb.zip')
    ofile = opts.output
    opts.output = tdir
    create_dir(htmlfile, opts)
    zf = ZipFile(ofile, 'w')
    zf.add_dir(opts.output)
    print 'Output saved to', ofile
 def main(args=sys.argv):
    parser = option_parser()
    opts, args = parser.parse_args(args)
    if len(args) < 2:
        parser.print_help()
        print _('You must specify an input HTML file')
        return 1
    htmlfile = args[1]
    create_dir(htmlfile, opts)
    return 0
 # When run as a script, use the return value of main() as the exit status
 if __name__ == '__main__':
    sys.exit(main())
--- a/src/calibre/linux.py
+++ b/src/calibre/linux.py
@ -25,6 +25,7 @@ entry_points = {
                             'epub-meta = calibre.ebooks.metadata.epub:main',
                             'txt2lrf   = calibre.ebooks.lrf.txt.convert_from:main',
                             'html2lrf  = calibre.ebooks.lrf.html.convert_from:main',
                             'html2oeb  = calibre.ebooks.html:main',
                             'html2epub = calibre.ebooks.epub.from_html:main',
                             'markdown-calibre  = calibre.ebooks.markdown.markdown:main',
                             'lit2lrf   = calibre.ebooks.lrf.lit.convert_from:main',
@ -168,6 +169,8 @@ def setup_completion(fatal_errors):
        from calibre.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
        from calibre.ebooks.metadata.epub import option_parser as epub_meta
        from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop
        from calibre.ebooks.epub.from_html import option_parser as html2epub
        from calibre.ebooks.html import option_parser as html2oeb 
        f = open_file('/etc/bash_completion.d/libprs500')
        f.close()
@ -203,6 +206,8 @@ def setup_completion(fatal_errors):
        f.write(opts_and_exts('comic2lrf', comicop, ['cbz', 'cbr']))
        f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
        f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
        f.write(opts_and_exts('html2epub', html2epub, ['html', 'htm', 'xhtm', 'xhtml']))
        f.write(opts_and_exts('html2oeb', html2oeb, ['html', 'htm', 'xhtm', 'xhtml']))
        f.write('''
 _prs500_ls()
 {
--- a/src/calibre/utils/config.py
+++ b/src/calibre/utils/config.py
@ -169,7 +169,7 @@ class Option(object):
        self.metavar  = metavar
    def __eq__(self, other):
        '''
        Compare by option name.

        When ``other`` has no ``name`` attribute it is itself compared with
        ``self.name``, so an option also equals its bare name. This lets
        callers test or remove options from a list using just the name.
        '''
        return self.name == getattr(other, 'name', other)
 class OptionValues(object):
@ -202,6 +202,19 @@ class OptionSet(object):
        self.groups[name] = description
        self.group_list.append(name)
        return partial(self.add_opt, group=name)
    def update(self, other):
        '''
        Merge the groups and options of ``other`` into this option set.

        Group descriptions from ``other`` overwrite descriptions of groups
        with the same name. An option from ``other`` replaces any option here
        that compares equal to it and is moved to the end of the list.

        :param other: The option set to merge from; it is not modified.
        '''
        for name in other.groups.keys():
            self.groups[name] = other.groups[name]
        # add_group() records every group in group_list as well as in groups.
        # Keep the two in step, preserving the order used by ``other``.
        for name in other.group_list:
            if name not in self.group_list:
                self.group_list.append(name)
        for pref in other.preferences:
            if pref in self.preferences:
                self.preferences.remove(pref)
            self.preferences.append(pref)
    def remove_opt(self, name):
        '''
        Drop the first option that compares equal to ``name``.

        Does nothing when no such option is present.
        '''
        try:
            self.preferences.remove(name)
        except ValueError:
            # list.remove() signals "not present" this way; nothing to drop
            pass
    def add_opt(self, name, switches=[], help=None, type=None, choices=None, 
                 group=None, default=None, action=None, metavar=None):
@ -306,25 +319,40 @@ class OptionSet(object):
        groups = [self.render_group(name, self.groups.get(name, ''), opts) \
                                        for name in [None] + self.group_list]
        return src + '\n\n'.join(groups)
 class ConfigInterface(object):
    '''
    Behaviour shared by the configuration classes.

    Wraps an OptionSet and re-exports its option management methods.
    option_parser() calls ``self.parse()``, which is not defined here, so
    subclasses must provide it.
    '''

    def __init__(self, description):
        '''
        :param description: Description passed on to the underlying OptionSet.
        '''
        self.option_set       = OptionSet(description=description)
        # Expose the option set's bound methods directly on the config object
        self.add_opt          = self.option_set.add_opt
        self.add_group        = self.option_set.add_group
        self.remove_opt       = self.option_set.remove_opt

    def update(self, other):
        '''
        Merge the option set of another configuration into this one.
        '''
        self.option_set.update(other.option_set)

    def option_parser(self, usage='', gui_mode=False):
        '''
        Return the option set's parser, passing the result of
        ``self.parse()`` as the user defaults.
        '''
        return self.option_set.option_parser(user_defaults=self.parse(),
                                             usage=usage, gui_mode=gui_mode)
 class Config(ConfigInterface):
    '''
    A file based configuration.
    '''
    def __init__(self, basename, description=''):
        '''
        :param basename: Name of the config file inside ``config_dir``,
                         without the ``.py`` extension.
        :param description: Description handed to ConfigInterface.
        '''
        ConfigInterface.__init__(self, description)
        filename = basename + '.py'
        self.config_file_path = os.path.join(config_dir, filename)
    def parse(self):
        '''
        Read the config file and parse it with the option set.

        A config file that does not exist is treated as empty rather than as
        an error.

        :return: Whatever ``self.option_set.parse_string()`` returns.
        :raises IOError: If the config file could not be locked.
        '''
        src = ''
        if os.path.exists(self.config_file_path):
            try:
                with ExclusiveFile(self.config_file_path) as f:
                    src = f.read()
            except LockError:
                raise IOError('Could not lock config file: %s'%self.config_file_path)
        return self.option_set.parse_string(src)
    def as_string(self):
@ -352,18 +380,15 @@ class Config(object):
        except LockError:
            raise IOError('Could not lock config file: %s'%self.config_file_path)
class StringConfig(ConfigInterface):
    '''
    A string based configuration
    '''

    def __init__(self, src, description=''):
        '''
        :param src: The configuration source text that parse() will read.
        :param description: Description handed to ConfigInterface.
        '''
        # The base class creates the option set and binds its methods.
        # Creating a second OptionSet here would leave those bound methods
        # attached to the discarded one.
        ConfigInterface.__init__(self, description)
        self.src = src

    def parse(self):
        '''
        Parse the stored source string with the option set.
        '''
        return self.option_set.parse_string(self.src)
--- a/src/calibre/utils/zipfile.py
+++ b/src/calibre/utils/zipfile.py
@ -1034,10 +1034,11 @@ class ZipFile:
            os.makedirs(upperdirs)
        source = self.open(member, pwd=pwd)
-        target = open(targetpath, "wb")
+        if not os.path.exists(targetpath): # Could be a previously automatically created directory
-        shutil.copyfileobj(source, target)
+            target = open(targetpath, "wb")
-        source.close()
+            shutil.copyfileobj(source, target)
-        target.close()
+            source.close()
            target.close()
        return targetpath
@ -1067,6 +1068,8 @@ class ZipFile:
    def write(self, filename, arcname=None, compress_type=None):
        """Put the bytes from filename into the archive under the name
        arcname."""
        if isinstance(filename, unicode):
            filename = filename.encode('utf-8')
        if not self.fp:
            raise RuntimeError(
                  "Attempt to write to ZIP archive that was already closed")
@ -1133,15 +1136,17 @@ class ZipFile:
        self.filelist.append(zinfo)
        self.NameToInfo[zinfo.filename] = zinfo
-    def writestr(self, zinfo_or_arcname, bytes):
+    def writestr(self, zinfo_or_arcname, bytes, permissions=0600):
        """Write a file into the archive.  The contents is the string
        'bytes'.  'zinfo_or_arcname' is either a ZipInfo instance or
        the name of the file in the archive."""
        if not isinstance(zinfo_or_arcname, ZipInfo):
            if isinstance(zinfo_or_arcname, unicode):
                zinfo_or_arcname = zinfo_or_arcname.encode('utf-8')
            zinfo = ZipInfo(filename=zinfo_or_arcname,
                            date_time=time.localtime(time.time())[:6])
            zinfo.compress_type = self.compression
-            zinfo.external_attr = 0600 << 16
+            zinfo.external_attr = permissions << 16
        else:
            zinfo = zinfo_or_arcname
@ -1171,6 +1176,23 @@ class ZipFile:
                  zinfo.file_size))
        self.filelist.append(zinfo)
        self.NameToInfo[zinfo.filename] = zinfo
    def add_dir(self, path, prefix=''):
        if prefix:
            self.writestr(prefix+'/', '', 0700)
        cwd = os.path.abspath(os.getcwd())
        try:
            os.chdir(path)
            fp = (prefix + ('/' if prefix else '')).replace('//', '/')
            for f in os.listdir('.'):
                arcname = fp + f
                if os.path.isdir(f):
                    self.add_dir(f, prefix=arcname)
                else:
                    self.write(f, arcname) 
        finally:
            os.chdir(cwd)
    def __del__(self):
        """Call the "close()" method in case the user forgot."""