calibre/src/calibre/ebooks/html.py

from __future__ import with_statement
__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'

'''
Code to recursively parse HTML files and create an open ebook in a specified
directory or zip file. All the action starts in :function:`create_dir`.
'''

import sys, re, os, shutil, logging, tempfile, cStringIO, operator, functools
from urlparse import urlparse, urlunparse
from urllib import unquote

from lxml import etree
from lxml.html import HtmlElementClassLookup, HTMLParser as _HTMLParser, \
                      fromstring as _fromstring, tostring as _tostring, \
                      soupparser, HtmlElement
from lxml.etree import XPath
get_text = XPath("//text()")

from calibre import unicode_path, entity_to_unicode
from calibre.ebooks.chardet import xml_to_unicode, ENCODING_PATS
from calibre.utils.config import Config, StringConfig
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
from calibre.utils.zipfile import ZipFile
from cssutils import CSSParser

class HTMLElement(HtmlElement):
    '''
    HTML element that keeps font size bookkeeping in two XML attributes,
    ``specified_font_size`` and ``computed_font_size``, and exposes them as
    Python properties.
    '''

    @dynamic_property
    def specified_font_size(self):

        def read_size(self):
            stored = self.get('specified_font_size', '')
            if stored:
                if stored.startswith('f'):
                    # An 'f' prefix marks a multiplier: hand back a callable
                    # that scales its argument by the stored factor
                    return functools.partial(operator.mul, float(stored[1:]))
                return float(stored)
            # Nothing stored: the identity function
            return lambda x: x

        def write_size(self, val):
            if callable(val):
                # Callables are stored as the factor they apply to 1
                encoded = 'f' + repr(val(1))
            else:
                encoded = repr(val)
            self.set('specified_font_size', encoded)

        return property(fget=read_size, fset=write_size)

    @dynamic_property
    def computed_font_size(self):

        def read_size(self):
            stored = self.get('computed_font_size', '')
            return None if stored == '' else float(stored)

        def write_size(self, val):
            self.set('computed_font_size', repr(val))

        return property(fget=read_size, fset=write_size)

    def remove_font_size_information(self):
        ''' Drop both font size attributes from this element and all its descendants '''
        for node in self.iter():
            for kind in ('computed', 'specified'):
                node.attrib.pop('%s_font_size'%kind, None)

    def getpath(self):
        ''' Return the path of this element as reported by its root tree '''
        tree = self.getroottree()
        return tree.getpath(self)

class Lookup(HtmlElementClassLookup):
    '''
    Element class lookup that maps every element node to :class:`HTMLElement`
    and defers to the default lxml.html lookup for all other node types.
    '''

    def lookup(self, node_type, document, namespace, name):
        if node_type != 'element':
            return HtmlElementClassLookup.lookup(self, node_type, document,
                                                 namespace, name)
        return HTMLElement

class HTMLParser(_HTMLParser):
    '''
    lxml.html parser whose element class lookup is replaced by :class:`Lookup`.
    '''

    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        lookup = Lookup()
        self.set_element_class_lookup(lookup)

# Module-level parser instance used by :func:`fromstring`
parser = HTMLParser()

def fromstring(raw, **kw):
    '''
    Parse `raw` with lxml.html's ``fromstring``, using the module-level
    :data:`parser`. Extra keyword arguments are passed through unchanged.
    '''
    root = _fromstring(raw, parser=parser, **kw)
    return root

def tostring(root, pretty_print=False):
    '''
    Serialize `root` with lxml.html's ``tostring`` as UTF-8 encoded XML,
    asking for a Content-Type meta tag to be included.
    '''
    options = dict(encoding='utf-8', method='xml',
                   include_meta_content_type=True,
                   pretty_print=pretty_print)
    return _tostring(root, **options)

class Link(object):
    '''
    A single link found in an HTML file, together with the local filesystem
    path it resolves to (if any).
    '''

    @classmethod
    def url_to_local_path(cls, url, base):
        '''
        Convert a parsed URL into a local path. Scheme, host and fragment are
        dropped, the rest is unquoted and, if relative, resolved against `base`.
        '''
        stripped = urlunparse(('', '', url.path, url.params, url.query, ''))
        local = unquote(stripped)
        if not os.path.isabs(local):
            local = os.path.abspath(os.path.join(base, local))
        return local

    def __init__(self, url, base):
        '''
        :param url:  The url this link points to. Must be an unquoted unicode string.
        :param base: The base directory that relative URLs are with respect to.
                     Must be a unicode string.
        '''
        assert isinstance(url, unicode) and isinstance(base, unicode)
        parsed = urlparse(url)
        local = parsed.scheme in ('', 'file')
        # A local link without a path component points into the same document
        internal = local and not bool(parsed.path)
        self.url         = url
        self.parsed_url  = parsed
        self.is_local    = local
        self.is_internal = internal
        self.path        = None
        self.fragment    = unquote(parsed.fragment)
        if local and not internal:
            self.path = self.url_to_local_path(parsed, base)

    def __hash__(self):
        # Links without a local path are hashed by their URL instead
        return hash(self.url if self.path is None else self.path)

    def __eq__(self, other):
        # `other` may be an object with a ``path`` attribute or a bare path
        other_path = getattr(other, 'path', other)
        return self.path == other_path

    def __str__(self):
        return u'Link: %s --> %s'%(self.url, self.path)


class IgnoreFile(Exception):
    '''
    Raised for a linked file that should be left out of the traversal.
    `errno` is stored on the exception; :attr:`doesnt_exist` is True when it
    equals 2.
    '''

    def __init__(self, msg, errno):
        super(IgnoreFile, self).__init__(msg)
        self.errno = errno
        self.doesnt_exist = (errno == 2)

class HTMLFile(object):
    '''
    Contains basic information about an HTML file. This
    includes a list of links to other files as well as
    the encoding of each file. Also tries to detect if the file is not a HTML
    file in which case :member:`is_binary` is set to True.

    The encoding of the file is available as :member:`encoding`.
    '''

    HTML_PAT  = re.compile(r'<\s*html', re.IGNORECASE)
    TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
    LINK_PAT  = re.compile(
    r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
    re.DOTALL|re.IGNORECASE)

    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
        '''
        :param level: The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML.
        :param referrer: The :class:`HTMLFile` that first refers to this file.
        '''
        self.path     = unicode_path(path_to_html_file, abs=True)
        self.title    = os.path.splitext(os.path.basename(self.path))[0]
        self.base     = os.path.dirname(self.path)
        self.level    = level
        self.referrer = referrer
        self.links    = []

        try:
            with open(self.path, 'rb') as f:
                src = f.read()
        except IOError, err:
            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
            if level == 0:
                raise IOError(msg)
            raise IgnoreFile(msg, err.errno)

        self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
        if not self.is_binary:
            if encoding is None:
                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
                self.encoding = encoding
            else:
                self.encoding = encoding

            src = src.decode(encoding, 'replace')
            match = self.TITLE_PAT.search(src)
            self.title = match.group(1) if match is not None else self.title
            self.find_links(src)


    def __eq__(self, other):
        return self.path == getattr(other, 'path', other)

    def __str__(self):
        return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)

    def __repr__(self):
        return str(self)


    def find_links(self, src):
        for match in self.LINK_PAT.finditer(src):
            url = None
            for i in ('url1', 'url2', 'url3'):
                url = match.group(i)
                if url:
                    break
            link = self.resolve(url)
            if link not in self.links:
                self.links.append(link)

    def resolve(self, url):
        return Link(url, self.base)


def depth_first(root, flat, visited=None):
    '''
    Generate the files reachable from `root` in depth first order.

    :param root:    The file to start from. It is always yielded first.
    :param flat:    List of all known files. A link is followed only if an
                    equal entry can be found in this list.
    :param visited: Set of files that have already been yielded. Callers
                    normally leave this as `None`; it is passed along
                    explicitly by the recursive calls.
    '''
    if visited is None:
        # Create the set per call. A mutable default would be shared between
        # calls, so a second traversal in the same process would treat every
        # file seen by the first one as already visited.
        visited = set()
    yield root
    visited.add(root)
    for link in root.links:
        if link.path is not None and link not in visited:
            try:
                index = flat.index(link)
            except ValueError: # Can happen if max_levels is used
                continue
            hf = flat[index]
            if hf not in visited:
                yield hf
                visited.add(hf)
                # The recursive generator yields `hf` again as its own root;
                # the visited check filters that out
                for hf in depth_first(hf, flat, visited):
                    if hf not in visited:
                        yield hf
                        visited.add(hf)


def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
    '''
    Recursively traverse all links in the HTML file.

    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
                       implies that no links in the root HTML file are followed.
    :param encoding:   Specify character encoding of HTML files. If `None` it is
                       auto-detected.
    :return:           A pair of lists (breadth_first, depth_first). Each list contains
                       :class:`HTMLFile` objects.
    '''
    assert max_levels >= 0
    level = 0
    flat =  [HTMLFile(path_to_html_file, level, encoding, verbose)]
    next_level = list(flat)
    while level < max_levels and len(next_level) > 0:
        level += 1
        nl = []
        for hf in next_level:
            rejects = []
            for link in hf.links:
                if link.path is None or link.path in flat:
                    continue
                try:
                    nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
                    if nf.is_binary:
                        raise IgnoreFile('%s is a binary file'%nf.path, -1)
                    nl.append(nf)
                    flat.append(nf)
                except IgnoreFile, err:
                    rejects.append(link)
                    if not err.doesnt_exist or verbose > 1:
                        print repr(err)
            for link in rejects:
                hf.links.remove(link)

        next_level = list(nl)
    orec = sys.getrecursionlimit()
    sys.setrecursionlimit(500000)
    try:
        return flat, list(depth_first(flat[0], flat))
    finally:
        sys.setrecursionlimit(orec)


def opf_traverse(opf_reader, verbose=0, encoding=None):
    '''
    Return a list of :class:`HTMLFile` objects in the order specified by the
    `<spine>` element of the OPF.

    :param opf_reader: An :class:`calibre.ebooks.metadata.opf.OPFReader` instance.
    :param encoding:   Specify character encoding of HTML files. If `None` it is
                       auto-detected.
    '''
    if not opf_reader.spine:
        raise ValueError('OPF does not have a spine')
    flat = []
    for path in opf_reader.spine.items():
        path = os.path.abspath(path)
        if path not in flat:
            flat.append(os.path.abspath(path))
    for item in opf_reader.manifest:
        if 'html' in item.mime_type:
            path = os.path.abspath(item.path)
            if path not in flat:
                flat.append(path)
    for i, path in enumerate(flat):
        if not os.path.exists(path):
            path = path.replace('&', '%26')
            if os.path.exists(path):
                flat[i] = path
                for item in opf_reader.itermanifest():
                    item.set('href', item.get('href').replace('&', '%26'))
    ans = []
    for path in flat:
        if os.path.exists(path):
            ans.append(HTMLFile(path, 0, encoding, verbose))
        else:
            print 'WARNING: OPF spine item %s does not exist'%path
    ans = [f for f in ans if not f.is_binary]
    return ans


# Replacement callable used by the entity conversion rule in
# PreProcessor.PREPROCESS. It is entity_to_unicode with a fixed `exceptions`
# list naming the five predefined XML entities.
# NOTE(review): presumably the listed entities are left unconverted; confirm
# against calibre.entity_to_unicode.
convert_entities = functools.partial(entity_to_unicode, exceptions=['quot', 'apos', 'lt', 'gt', 'amp'])
# Matches a complete <span>...</span> element, across lines and in any case
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)

def sanitize_head(match):
    '''
    Regex replacement callback for a ``<head>`` element. Group 1 of `match`
    is taken as the head's contents; all ``<span>`` elements are removed from
    it and the result is wrapped in a plain ``<head>`` tag.
    '''
    contents = _span_pat.sub('', match.group(1))
    return ''.join(('<head>\n', contents, '\n</head>'))

class PreProcessor(object):
    '''
    Regular expression based clean-up of raw HTML markup, meant to be applied
    before the markup is parsed.

    Every rule is a ``(compiled_pattern, replacement)`` pair that is applied
    as ``pattern.sub(replacement, html)``. The rules in :attr:`PREPROCESS`
    are always applied. The rules in :attr:`BOOK_DESIGNER` or
    :attr:`PDFTOHTML` are applied in addition when the matching generator is
    detected, see :meth:`preprocess`.
    '''
    PREPROCESS = [
                  # Some HTML generators (Frontpage, for example) put all
                  # sorts of markup into <head>. This messes up lxml
                  (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
                   sanitize_head),
                  # Convert all entities, since lxml doesn't handle them well
                  (re.compile(r'&(\S+?);'), convert_entities),
                  # Remove the <![if/endif tags inserted by MS Word
                  (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
                   lambda match: ''),
                  ]

    # Fix pdftohtml markup
    PDFTOHTML  = [
                  # Replace <hr> tags with line breaks
                  (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
                  # Remove page numbers
                  (re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
                  # Remove <br> and replace <br><br> with <p>
                  (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
                  # A <br> is kept only when the line before it starts with a
                  # tag or is shorter than 40 characters
                  (re.compile(r'(.*)<br.*?>', re.IGNORECASE),
                   lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40
                                else match.group(1)),
                  # Remove hyphenation
                  (re.compile(r'-\n\r?'), lambda match: ''),

                  # Remove gray background
                  (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>')

                  ]

    # Fix Book Designer markup
    BOOK_DESIGNER = [
                     # HR
                     (re.compile('<hr>', re.IGNORECASE),
                      lambda match : '<span style="page-break-after:always"> </span>'),
                     # Create header tags. The heading text is captured as
                     # group 3, which the replacement reads. Without that
                     # capture the pattern has only two groups and
                     # match.group(3) raises IndexError on every match.
                     (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
                      lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
                     (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
                      lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
                     (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                      lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
                     (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                      lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
                     ]

    def is_baen(self, src):
        ''' True if `src` has a Publisher meta tag whose content mentions Baen '''
        return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
                          re.IGNORECASE).search(src) is not None

    def is_book_designer(self, raw):
        ''' True if `raw` contains an upper case H2 tag with id=BookTitle '''
        return re.search('<H2[^><]*id=BookTitle', raw) is not None

    def is_pdftohtml(self, src):
        ''' True if the first 1000 characters carry calibre's pdftohtml marker '''
        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]

    def preprocess(self, html):
        '''
        Apply the clean-up rules to `html` and return the result.

        If the instance has an ``opts`` attribute whose ``profile`` defines a
        truthy ``remove_special_chars`` pattern, matches of that pattern are
        removed first. Then :attr:`PREPROCESS` is applied, followed by the
        generator specific rules, if any. Baen files get no extra rules.
        '''
        opts = getattr(self, 'opts', False)
        if opts and hasattr(opts, 'profile') and getattr(opts.profile, 'remove_special_chars', False):
            html = opts.profile.remove_special_chars.sub('', html)
        if self.is_baen(html):
            rules = []
        elif self.is_book_designer(html):
            rules = self.BOOK_DESIGNER
        elif self.is_pdftohtml(html):
            rules = self.PDFTOHTML
        else:
            rules = []
        for pattern, replacement in self.PREPROCESS + rules:
            html = pattern.sub(replacement, html)
        return html

class Parser(PreProcessor):
    '''
    Parses a single :class:`HTMLFile` into an lxml tree (:attr:`root`,
    :attr:`head`, :attr:`body`), rewrites the links in it so that they work
    inside the output directory and saves the result as XHTML.

    NOTE(review): ``setup_cli_handler`` and the ``log_*`` methods used below
    are not defined by this class or by :class:`PreProcessor` in the visible
    source. Presumably a logging mix-in provides them; confirm.
    '''
#    SELF_CLOSING_TAGS = 'hr|br|link|img|meta|input|area|base|basefont'
#    SELF_CLOSING_RULES = [re.compile(p[0]%SELF_CLOSING_TAGS, re.IGNORECASE) for p in
#                          [
#                           (r'<(?P<tag>%s)(?P<attrs>(\s+[^<>]*){0,1})(?<!/)>',
#                            '<\g<tag>\g<attrs> />'),
#                           (),
#                           ]
#                          ]

    def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, name='htmlparser'):
        '''
        :param htmlfile:     The :class:`HTMLFile` to parse.
        :param opts:         Options object. ``verbose`` and ``pretty_print``
                             are read by this class.
        :param tdir:         Directory the processed file is saved into.
        :param resource_map: Mapping of original resource path to the name it
                             was copied to. Shared and updated by
                             :meth:`rewrite_links`.
        :param htmlfiles:    All HTML files that are part of the book.
        :param name:         Not used by this method.
        '''
        self.setup_cli_handler(opts.verbose)
        self.htmlfile = htmlfile
        self.opts = opts
        self.tdir = tdir
        self.resource_map = resource_map
        self.htmlfiles = htmlfiles
        self.resource_dir = os.path.join(tdir, 'resources')
        save_counter = 1
        self.htmlfile_map = {}
        self.level = self.htmlfile.level
        # Map every input file to a unique .xhtml output name. A separate
        # variable is used so that the `name` argument is not clobbered.
        for f in self.htmlfiles:
            fname = os.path.splitext(os.path.basename(f.path))[0] + '.xhtml'
            if fname in self.htmlfile_map.values():
                base, ext = os.path.splitext(fname)
                fname = base + '_cr_%d'%save_counter + ext
                save_counter += 1
            self.htmlfile_map[f.path] = fname

        self.parse_html()
        # Handle <image> tags inside embedded <svg>
        # At least one source of EPUB files (Penguin) uses xlink:href
        # without declaring the xlink namespace
        for image in self.root.xpath('//image'):
            for attr in image.attrib.keys():
                if attr.endswith(':href'):
                    nhref = self.rewrite_links(image.get(attr))
                    image.set(attr, nhref)

        self.root.rewrite_links(self.rewrite_links, resolve_base_href=False)
        for bad in ('xmlns', 'lang', 'xml:lang'): # lxml also adds these attributes for XHTML documents, leading to duplicates
            if self.root.get(bad, None) is not None:
                self.root.attrib.pop(bad)


    def save_path(self):
        ''' Path the processed version of this file is saved to '''
        return os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path])

    def save(self, strip_comments=False):
        '''
        Save processed HTML into the content directory.
        Should be called after all HTML processing is finished.

        :param strip_comments: If True, HTML comments are removed from the output.
        :return: The name of the file that was written.
        '''
        self.root.set('xmlns', 'http://www.w3.org/1999/xhtml')
        self.root.set('xmlns:xlink', 'http://www.w3.org/1999/xlink')
        for svg in self.root.xpath('//svg'):
            svg.set('xmlns', 'http://www.w3.org/2000/svg')

        ans = tostring(self.root, pretty_print=self.opts.pretty_print)
        # Only the first 1000 bytes are searched for the <head> tag
        ans = re.compile(r'<head>', re.IGNORECASE).sub(
            '<head>\n\t<meta http-equiv="Content-Type" '
            'content="text/html; charset=utf-8" />\n', ans[:1000])+ans[1000:]
        if strip_comments:
            ans = re.compile(r'<!--.*?-->', re.DOTALL).sub('', ans)
        with open(self.save_path(), 'wb') as f:
            f.write(ans)
            return f.name


    def parse_html(self):
        '''
        Create lxml ElementTree from HTML

        Sets :attr:`root`, :attr:`head` and :attr:`body`.

        :raises ValueError: If the file was detected as not being HTML.
        '''
        self.log_info('\tParsing '+os.sep.join(self.htmlfile.path.split(os.sep)[-3:]))
        if self.htmlfile.is_binary:
            raise ValueError('Not a valid HTML file: '+self.htmlfile.path)
        # Close the file deterministically instead of relying on garbage collection
        with open(self.htmlfile.path, 'rb') as f:
            raw = f.read()
        src = raw.decode(self.htmlfile.encoding, 'replace').strip()
        src = src.replace('\x00', '').replace('\r', ' ')
        src = self.preprocess(src)
        # lxml chokes on unicode input when it contains encoding declarations
        for pat in ENCODING_PATS:
            src = pat.sub('', src)
        src = src[src.find('<'):]
        # Remove unclosed <style> tag as that messes up lxml's parsing
        src = re.sub(r'<style>\s*</head>', '', src)
        try:
            self.root = fromstring(src)
        except Exception:
            # Fall back to the more lenient soup parser. Only Exception is
            # caught so that KeyboardInterrupt/SystemExit still propagate.
            if self.opts.verbose:
                self.log_exception('lxml based parsing failed')
            self.root = soupparser.fromstring(src, makeelement=parser.makeelement)
        head = self.root.xpath('./head')
        if head:
            head = head[0]
        else:
            # No <head>: create one and make it the first child of the root
            head = etree.SubElement(self.root, 'head')
            self.root.remove(head)
            self.root.insert(0, head)

        self.head = head
        try:
            self.body = self.root.body
        except Exception:
            # No usable body: replace the whole page with an error message
            import traceback
            err = traceback.format_exc()
            self.root = fromstring(u'<html><head/><body><p>This page was too '
                                   'severely malformed for calibre to handle. '
                                   'It has been replaced by this error message.'
                                   '</p><pre>%s</pre></body></html>'%err)
            self.head = self.root.xpath('./head')[0]
            self.body = self.root.body
        invalid_counter = 0
        # Give every named anchor an id equal to its name. If the name is
        # rejected as an id, both are replaced by a generated value.
        for a in self.root.xpath('//a[@name]'):
            try:
                a.set('id', a.get('name'))
            except Exception:
                invalid_counter += 1
                for x in ('id', 'name'):
                    a.set(x, 'calibre_invalid_id_%d'%invalid_counter)
        if not self.head.xpath('./title'):
            title = etree.SubElement(self.head, 'title')
            title.text = _('Unknown')

    def debug_tree(self, name):
        '''
        Dump source tree for later debugging.

        The tree is written to ``<basename of the HTML file>-<name>.html`` in
        the system temporary directory.
        '''
        tdir = tempfile.gettempdir()
        if not os.path.exists(tdir):
            os.makedirs(tdir)
        with open(os.path.join(tdir, '%s-%s.html'%\
                    (os.path.basename(self.htmlfile.path), name)), 'wb') as f:
            f.write(tostring(self.root))
            self.log_debug(_('Written processed HTML to ')+f.name)


    def rewrite_links(self, olink):
        '''
        Make all links in document relative so that they work in the EPUB container.
        Also copies any resources (like images, stylesheets, scripts, etc.) into
        the local tree.

        :param olink: The original link, as found in the document.
        :return: The rewritten link, or `olink` unchanged if it does not
                 point to an existing local file.
        '''
        if not isinstance(olink, unicode):
            olink = olink.decode(self.htmlfile.encoding)
        link = self.htmlfile.resolve(olink)
        frag = (('#'+link.fragment) if link.fragment else '')
        # Link to this very file: only the fragment is needed
        if link.path == self.htmlfile.path:
            return frag if frag else '#'
        if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
            return olink
        # Link to another HTML file of the book
        if link.path in self.htmlfiles:
            return self.htmlfile_map[link.path] + frag
        if re.match(r'\.(x){0,1}htm(l){0,1}', os.path.splitext(link.path)[1]) is not None:
            return olink # This happens when --max-levels is used
        # Resource that has already been copied
        if link.path in self.resource_map:
            return self.resource_map[link.path] + frag
        # New resource: copy it under a name made unique by the number of
        # resources seen so far
        rname, ext = os.path.splitext(os.path.basename(link.path))
        rname += ('_%d'%len(self.resource_map)) + ext
        shutil.copyfile(link.path, os.path.join(self.resource_dir, rname))
        rname = 'resources/' + rname
        self.resource_map[link.path] = rname
        return rname + frag


class Processor(Parser):
    '''
    This class builds on :class:`Parser` to provide additional methods
    to perform various processing/modification tasks on HTML files.
    '''

    # Compiled XPath selecting every <a> element that has an href attribute
    LINKS_PATH = XPath('//a[@href]')
    # A number (optionally negative, optionally fractional) followed by "px";
    # the number is captured as group 1
    PIXEL_PAT  = re.compile(r'([-]?\d+|[-]?\d*\.\d+)px')
    # An @page rule together with its {...} block
    PAGE_PAT   = re.compile(r'@page[^{]*?{[^}]*?}')

    def __init__(self, *args, **kwargs):
        '''
        Accepts the same arguments as :class:`Parser`. In addition a CSS
        parser that logs through the ``cssutils`` logger is created and the
        three stylesheet attributes are initialised to `None`.
        '''
        Parser.__init__(self, *args, **kwargs)
        css_log = LoggingInterface(logging.getLogger('cssutils'))
        css_log.setup_cli_handler(self.opts.verbose)
        self.css_parser = CSSParser(log=css_log.logger, loglevel=logging.ERROR)
        self.stylesheet = None
        self.font_css = None
        self.override_css = None

    def detect_chapters(self):
        '''
        Run ``opts.chapter`` on the document, store the result in
        :attr:`detected_chapters` and insert a marker element before every
        detected chapter, as selected by ``opts.chapter_mark``.
        '''
        chapters = self.opts.chapter(self.root)
        self.detected_chapters = chapters
        mark_style = self.opts.chapter_mark
        for chapter in chapters:
            pieces = [t.strip() for t in chapter.xpath('descendant::text()')]
            self.log_info('\tDetected chapter: %s', u' '.join(pieces)[:50])
            if mark_style == 'none':
                continue
            if mark_style == 'rule':
                mark = etree.Element('hr')
            elif mark_style == 'pagebreak':
                mark = etree.Element('div',
                        style='display: block; page-break-after: always')
            else: # mark_style == 'both'
                mark = etree.Element('hr',
                        style='display: block; page-break-before: always')
            chapter.addprevious(mark)

    def save(self, strip_comments=False):
        '''
        Write the stylesheets that have been set (:attr:`stylesheet`,
        :attr:`font_css`, :attr:`override_css`) into the resources directory,
        link them from ``<head>`` and then save the document itself via
        :meth:`Parser.save`.

        :param strip_comments: Passed on to :meth:`Parser.save`.
        :return: Whatever :meth:`Parser.save` returns.
        '''
        style_path = os.path.splitext(os.path.basename(self.save_path()))[0]
        for i, sheet in enumerate([self.stylesheet, self.font_css, self.override_css]):
            if sheet is not None:
                style = etree.SubElement(self.head, 'link', attrib={'type':'text/css', 'rel':'stylesheet',
                                                           'href':'resources/%s_%d.css'%(style_path, i),
                                                           'charset':'UTF-8'})
                style.tail = '\n'
                path = os.path.join(os.path.dirname(self.save_path()), *(style.get('href').split('/')))
                self.resource_map[path] = style.get('href')
                # A sheet is either an object with a cssText attribute or the
                # CSS text itself
                raw = getattr(sheet, 'cssText', sheet)
                if isinstance(raw, unicode):
                    raw = raw.encode('utf-8')
                # Close the file explicitly so that the CSS is flushed to disk
                # before the document that links to it is written
                with open(path, 'wb') as f:
                    f.write(raw)
        return Parser.save(self, strip_comments=strip_comments)

    def populate_toc(self, toc):
        '''
        Populate the Table of Contents from detected chapters and links.

        Entries are added to `toc` in three stages:

          1. Elements selected by ``opts.level1_toc`` (and, nested below
             them, ``opts.level2_toc`` and ``opts.level3_toc``). If `toc` is
             non-empty after this stage, nothing else is done.
          2. The elements in ``detected_chapters``, unless
             ``opts.no_chapters_in_toc`` is set.
          3. Up to ``opts.max_toc_links`` links of the document, unless the
             TOC already has at least ``opts.toc_threshold`` entries.

        :param toc: The TOC object to add entries to. This method uses its
                    ``flat()`` and ``add_item()`` methods and ``len()``.
        '''
        class Adder(object):
            '''
            Adds items to the TOC, handing out increasing play orders and
            reusing an existing entry when href and fragment already match.
            '''

            def __init__(self, toc):
                # Continue numbering after the highest play order in use
                self.next_play_order = max([x.play_order for x in toc.flat()])

            def __call__(self, href, fragment, text, target, type='link'):
                # Never create a second entry for the same location
                for entry in toc.flat():
                    if entry.href == href and entry.fragment == fragment:
                        return entry
                # Long titles are cut at 50 characters and get an ellipsis
                if len(text) > 50:
                    text = text[:50] + u'\u2026'
                self.next_play_order += 1
                return target.add_item(href, fragment, text, type=type,
                                       play_order=self.next_play_order)
        add_item = Adder(toc)
        name = self.htmlfile_map[self.htmlfile.path]
        href = 'content/'+name

        # Add level* TOC items
        counter = 0

        def elem_to_link(elem, href, counter):
            '''
            Return ``(text, href, fragment)`` for `elem`, or three `None`
            values if the element has no text. If `elem` is or contains a
            link, the target of the first such link is used. Otherwise the
            passed in `href` is used together with the id of the element,
            which is set to ``calibre_chapter_<counter>`` if it has none.
            '''
            text = (u''.join(elem.xpath('string()'))).strip()
            if not text:
                return None, None, None
            t = elem.xpath('descendant-or-self::a[@href]')
            if t:
                _href = 'content/' + t[0].get('href', '')
                parts = _href.split('#')
                _href = parts[0]
                frag = None if len(parts) == 1 else parts[-1]
            else:
                _href = href
                id = elem.get('id', 'calibre_chapter_%d'%counter)
                elem.set('id', id)
                frag = id
            return text, _href, frag


        if self.opts.level1_toc is not None:
            level1 = self.opts.level1_toc(self.root)
            level1_order = []
            if level1:
                # Maps level 1 elements to the TOC entries created for them
                added = {}
                for elem in level1:
                    text, _href, frag = elem_to_link(elem, href, counter)
                    counter += 1
                    if text:
                        level1_order.append(add_item(_href, frag, text, toc, type='chapter'))
                        added[elem] = level1_order[-1]
                        # Each level 1 entry gets a child named 'Top' that
                        # points to the same location
                        add_item(_href, frag, 'Top', added[elem], type='chapter')
                if self.opts.level2_toc is not None:
                    # Maps level 2 elements to the TOC entries created for them
                    added2 = {}
                    level2 = list(self.opts.level2_toc(self.root))
                    for elem in level2:
                        level1 = None
                        # Walk the document in order, remembering the most
                        # recent level 1 entry. The element is added below
                        # the entry that was current when it is reached.
                        for item in self.root.iterdescendants():
                            if item in added.keys():
                                level1 = added[item]
                            elif item == elem and level1 is not None:
                                text, _href, frag = elem_to_link(elem, href, counter)
                                counter += 1
                                if text:
                                    added2[elem] = \
                                    add_item(_href, frag, text, level1, type='chapter')
                    if self.opts.level3_toc is not None:
                        level3 = list(self.opts.level3_toc(self.root))
                        for elem in level3:
                            level2 = None
                            # Same scan as above, one level further down
                            for item in self.root.iterdescendants():
                                if item in added2.keys():
                                    level2 = added2[item]
                                elif item == elem and level2 is not None:
                                    text, _href, frag = elem_to_link(elem, href, counter)
                                    counter += 1
                                    if text:
                                        add_item(_href, frag, text, level2, type='chapter')


            if level1_order: # Fix play order
                # Renumber so that every level 1 entry is directly followed
                # by its own descendants
                next_play_order = level1_order[0].play_order
                for x in level1_order:
                    for y in x.flat():
                        y.play_order = next_play_order
                        next_play_order += 1


            if len(toc) > 0:
                # Detected TOC entries using --level* options
                # so aborting all other toc processing
                return
        # Add chapters to TOC
        if not self.opts.no_chapters_in_toc:
            # detected_chapters only exists if detect_chapters() has been called
            for elem in getattr(self, 'detected_chapters', []):
                text = (u''.join(elem.xpath('string()'))).strip()
                if text:
                    counter += 1
                    id = elem.get('id', 'calibre_chapter_%d'%counter)
                    elem.set('id', id)
                    add_item(href, id, text, toc, type='chapter')

        # Enough entries already: do not add links
        if len(list(toc.flat())) >= self.opts.toc_threshold:
            return
        # Links are added below the entry of the file that refers to this
        # one, or directly to the TOC if this file has no referrer
        referrer = toc
        if self.htmlfile.referrer is not None:
            try:
                name = self.htmlfile_map[self.htmlfile.referrer.path]
                href = 'content/'+name
                for i in toc.flat():
                    if href == i.href and i.fragment is None:
                        referrer = i
                        break
            except KeyError:
                pass
            if referrer is toc:
                # No entry for the referring file was found, so one is created.
                # NOTE(review): this lookup is not guarded by the try above
                # and raises KeyError if the referrer is not in htmlfile_map.
                # The entry also uses the title of this file with the href of
                # the referrer. Confirm both are intended.
                text = self.htmlfile.title
                name = self.htmlfile_map[self.htmlfile.referrer.path]
                href = 'content/'+name
                referrer = add_item(href, None, text, toc)

        # Add links to TOC
        if int(self.opts.max_toc_links) > 0:
            for link in list(self.LINKS_PATH(self.root))[:self.opts.max_toc_links]:
                text = (u''.join(link.xpath('string()'))).strip()
                if text:
                    href = link.get('href', '')
                    # Links that start with http:// or https:// are skipped
                    if href and not (href.startswith('http://') or href.startswith('https://')):
                        href = href.strip()
                        # A bare fragment refers to this file
                        if href.startswith('#'):
                            href = self.htmlfile_map[self.htmlfile.path] + href
                        href = 'content/'+href
                        parts = href.split('#')
                        href, fragment = parts[0], None
                        if len(parts) > 1:
                            fragment = parts[1]
                        add_item(href, fragment, text, referrer)

    @classmethod
    def preprocess_css(cls, css, dpi=96):
        def rescale(match):
            val = match.group(1)
            try:
                val = float(val)
            except ValueError:
                return ''
            return '%fpt'%(72 * val/dpi)

        css = cls.PIXEL_PAT.sub(rescale, css)
        css = cls.PAGE_PAT.sub('', css)
        return css

    def extract_css(self, parsed_sheets):
        '''
        Remove all CSS information from the document and store it as
        :class:`StyleSheet` objects.

        * Linked stylesheets are parsed at most once per file (the results are
          cached in `parsed_sheets`) and collected in
          :attr:`external_stylesheets`.
        * ``<style>`` elements are removed from the tree. Their rules are added
          to :attr:`stylesheet`, unless the element is titled ``override_css``
          in which case they are put at the start of :attr:`override_css`.
        * ``<font>`` tags become ``<span>`` tags and ``style`` attributes are
          removed. Both are replaced by generated class or id rules that are
          added to :attr:`stylesheet`.
        * :attr:`override_css` additionally receives the rules derived from the
          conversion options (margins, justification, user supplied CSS).

        :param parsed_sheets: Mapping from stylesheet path to the parsed sheet.
            It is updated in place. If a sheet cannot be parsed, the decoded
            CSS text is stored instead of a parsed sheet.
        '''
        self.external_stylesheets, self.stylesheet = [], self.css_parser.parseString('')
        self.specified_override_css = []
        for link in self.root.xpath('//link'):
            ltype = link.get('type', link.get('rel', 'text/css')).lower()
            if 'css' in ltype or 'style' in ltype:
                sheet_path = os.path.join(self.tdir, *(link.get('href', '').split('/')))
                if sheet_path and not 'http:' in sheet_path:
                    if sheet_path not in parsed_sheets:
                        try:
                            self.log_info('Processing stylesheet %s...'%sheet_path)
                            # Close the file deterministically instead of
                            # leaving the handle to the garbage collector
                            with open(sheet_path) as f:
                                css = self.preprocess_css(f.read())
                        except (IOError, OSError):
                            self.log_error('Failed to open stylesheet: %s'%sheet_path)
                        else:
                            try:
                                try:
                                    parsed_sheets[sheet_path] = self.css_parser.parseString(css)
                                except ValueError:
                                    # Retry with the CSS decoded to unicode
                                    parsed_sheets[sheet_path] = \
                                        self.css_parser.parseString(\
                                                css.decode('utf8', 'replace'))
                            except Exception:
                                # Keep the raw text so the file is not
                                # processed again
                                parsed_sheets[sheet_path] = css.decode('utf8', 'replace')
                                self.log_warning('Failed to parse stylesheet: %s'%sheet_path)
                                if self.opts.verbose > 1:
                                    self.log_exception('')
                    if sheet_path in parsed_sheets:
                        self.external_stylesheets.append(parsed_sheets[sheet_path])


        for style in self.root.xpath('//style'):
            if 'css' in style.get('type', 'text/css').lower():
                override_css = style.get('title', '') == 'override_css'
                raw = '\n'.join(style.xpath('./text()'))
                css = self.preprocess_css(raw)
                try:
                    sheet = self.css_parser.parseString(css)
                except Exception:
                    self.log_debug('Failed to parse style element')
                else:
                    for rule in sheet:
                        if override_css:
                            self.specified_override_css.append(rule)
                        else:
                            self.stylesheet.add(rule)
                style.getparent().remove(style)

        # Maps a CSS declaration string to the generated class name. It is
        # shared by the <font> and the style attribute conversions below, as
        # is the counter used to number the classes.
        cache = {}
        class_counter = 0
        for font in self.root.xpath('//font'):
            try:
                size = font.attrib.pop('size', '3')
            except Exception:
                size = '3'
            if size and size.strip() and size.strip()[0] in ('+', '-'):
                # Relative size such as +1 or -2
                size = re.search(r'[+-]{0,1}[\d\.]+', size)
                try:
                    size = float(size.group())
                except Exception:
                    size = 0
                size += 3 # Hack assumes basefont=3
            try:
                setting = 'font-size: %d%%;'%int((float(size)/3) * 100)
            except ValueError:
                setting = ''
            face = font.attrib.pop('face', None)
            if face:
                faces = []
                for face in face.split(','):
                    face = face.strip()
                    # Quote family names that contain spaces
                    if ' ' in face and not (face[0] == face[-1] == '"'):
                        face = '"%s"' % face.replace('"', r'\"')
                    faces.append(face)
                # Make sure the list ends with a generic family as fallback
                for generic in ('serif', 'sans-serif', 'monospace'):
                    if generic in faces:
                        break
                else:
                    faces.append('serif')
                family = ', '.join(faces)
                setting += 'font-family: %s;' % family
            color = font.attrib.pop('color', None)
            if color is not None:
                setting += 'color:%s'%color
            classname = cache.get(setting, None)
            if classname is None:
                classname = 'calibre_class_%d'%class_counter
                class_counter += 1
                cache[setting] = classname
            cn = font.get('class', '')
            if cn: cn += ' '
            cn += classname
            font.set('class', cn)
            font.tag = 'span'

        id_css, id_css_counter = {}, 0
        for elem in self.root.xpath('//*[@style]'):
            setting = elem.get('style')
            if elem.get('id', False) or elem.get('class', False):
                # The element already has an id or a class: attach the style
                # via an id selector, creating an id if necessary
                elem.set('id', elem.get('id', 'calibre_css_id_%d'%id_css_counter))
                id_css_counter += 1
                id_css[elem.tag+'#'+elem.get('id')] = setting
            else:
                classname = cache.get(setting, None)
                if classname is None:
                    classname = 'calibre_class_%d'%class_counter
                    class_counter += 1
                    cache[setting] = classname
                cn = elem.get('class', classname)
                elem.set('class', cn)
            elem.attrib.pop('style')

        css = '\n'.join(['.%s {%s;}'%(cn, setting) for \
                         setting, cn in cache.items()])
        css += '\n\n'
        css += '\n'.join(['%s {%s;}'%(selector, setting) for \
                         selector, setting in id_css.items()])
        sheet = self.css_parser.parseString(self.preprocess_css(css.replace(';;}', ';}')))
        for rule in sheet:
            self.stylesheet.add(rule)
        css = ''
        css += '\n\n' + 'body {margin-top: 0pt; margin-bottom: 0pt; margin-left: 0pt; margin-right: 0pt;}'
        css += '\n\n@page {margin-top: %fpt; margin-bottom: %fpt; }'%(self.opts.margin_top, self.opts.margin_bottom)
        css += '\n\nbody {margin-left: %fpt; margin-right: %fpt}'%(self.opts.margin_left, self.opts.margin_right)
        # Workaround for anchor rendering bug in ADE
        css += '\n\na { color: inherit; text-decoration: inherit; cursor: default; }\na[href] { color: blue; text-decoration: underline; cursor:pointer; }'
        if self.opts.remove_paragraph_spacing:
            css += '\n\np {text-indent: 1.5em; margin-top:0pt; margin-bottom:0pt; padding:0pt; border:0pt;}'
        if not self.opts.no_justification:
            css += '\n\nbody {text-align: justify}'
        if self.opts.override_css:
            css += '\n\n' + self.opts.override_css
        self.override_css = self.css_parser.parseString(self.preprocess_css(css))
        # Inserting in reverse at index 0 keeps the original rule order
        for rule in reversed(self.specified_override_css):
            self.override_css.insertRule(rule, index=0)


def config(defaults=None, config_name='html',
           desc=_('Options to control the traversal of HTML')):
    '''
    Create the set of options understood by the HTML conversion code.

    If `defaults` is None the options are stored in a :class:`Config` named
    `config_name`, otherwise in a :class:`StringConfig` created from
    `defaults`. The populated configuration object is returned.
    '''
    def register(add, specs):
        # Register every (name, switches, keyword arguments) triple with add
        for name, switches, kwargs in specs:
            add(name, switches, **kwargs)

    if defaults is not None:
        c = StringConfig(defaults, desc)
    else:
        c = Config(config_name, desc)

    register(c.add_opt, (
        ('output', ['-o', '--output'], dict(default=None,
            help=_('The output directory. Default is the current directory.'))),
        ('encoding', ['--encoding'], dict(default=None,
            help=_('Character encoding for HTML files. Default is to auto detect.'))),
        ('zip', ['--zip'], dict(default=False,
            help=_('Create the output in a zip file. If this option is specified, the --output should be the name of a file not a directory.'))),
    ))

    traversal = c.add_group('traversal', _('Control the following of links in HTML files.'))
    register(traversal, (
        ('breadth_first', ['--breadth-first'], dict(default=False,
            help=_('Traverse links in HTML files breadth first. Normally, they are traversed depth first'))),
        ('max_levels', ['--max-levels'], dict(default=sys.getrecursionlimit(), group='traversal',
            help=_('Maximum levels of recursion when following links in HTML files. Must be non-negative. 0 implies that no links in the root HTML file are followed.'))),
    ))

    metadata = c.add_group('metadata', _('Set metadata of the generated ebook'))
    register(metadata, (
        ('title', ['-t', '--title'], dict(default=None,
            help=_('Set the title. Default is to autodetect.'))),
        ('authors', ['-a', '--authors'], dict(default=None,
            help=_('The author(s) of the ebook, as a & separated list.'))),
        ('tags', ['--subjects'], dict(default=None,
            help=_('The subject(s) of this book, as a comma separated list.'))),
        ('publisher', ['--publisher'], dict(default=None,
            help=_('Set the publisher of this book.'))),
        ('comments', ['--comment'], dict(default=None,
            help=_('A summary of this book.'))),
        ('from_opf', ['--metadata-from'], dict(default=None,
            help=_('Load metadata from the specified OPF file'))),
    ))

    debug = c.add_group('debug', _('Options useful for debugging'))
    register(debug, (
        ('verbose', ['-v', '--verbose'], dict(default=0, action='count',
            help=_('Be more verbose while processing. Can be specified multiple times to increase verbosity.'))),
        ('pretty_print', ['--pretty-print'], dict(default=False,
            help=_('Output HTML is "pretty printed" for easier parsing by humans'))),
    ))

    return c

def option_parser():
    '''
    Return the command line option parser built from :func:`config`, with
    the usage message of this program.
    '''
    options = config()
    usage = _('''\
%prog [options] file.html|opf

Follow all links in an HTML file and collect them into the specified directory.
Also collects any resources like images, stylesheets, scripts, etc.
If an OPF file is specified instead, the list of files in its <spine> element
is used.
''')
    return options.option_parser(usage=usage)

def search_for_opf(dir):
    '''
    Return an :class:`OPF` object for the first entry listed in the directory
    `dir` whose name ends with ``.opf`` (case insensitive). Returns None if
    there is no such entry.
    '''
    candidates = [name for name in os.listdir(dir)
                  if name.lower().endswith('.opf')]
    if not candidates:
        return None
    return OPF(open(os.path.join(dir, candidates[0]), 'rb'), dir)


def get_filelist(htmlfile, opts):
    '''
    Build list of files referenced by html file or try to detect and use an
    OPF file instead.
    '''
    print 'Building file list...'
    dir = os.path.dirname(htmlfile)
    if not dir:
        dir = os.getcwd()
    opf = search_for_opf(dir)
    filelist = None
    if opf is not None:
        try:
            filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
        except:
            pass
    if not filelist:
        filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
                            verbose=opts.verbose, encoding=opts.encoding)\
                    [0 if opts.breadth_first else 1]
    if opts.verbose:
        print '\tFound files...'
        for f in filelist:
            print '\t\t', f
    return opf, filelist

def parse_content(filelist, opts):
    '''
    Parse content, rewriting links and copying resources.

    The directory ``content/resources`` is created below the output directory
    if it does not exist and every file in `filelist` is processed by a
    :class:`Parser` and saved.

    :param filelist: The HTML files to process, in order. May be empty.
    :param opts: Options object. ``opts.output`` is replaced by its absolute
        path (``'.'`` is used if it is not set).
    :return: The tuple ``(resource_map, htmlfile_map)``. `htmlfile_map` is
        taken from the last :class:`Parser` created and is an empty dict if
        `filelist` is empty.
    '''
    if not opts.output:
        opts.output = '.'
    opts.output = os.path.abspath(opts.output)
    content_dir = os.path.join(opts.output, 'content')
    rdir = os.path.join(content_dir, 'resources')
    if not os.path.exists(rdir):
        os.makedirs(rdir)
    resource_map = {}
    parser = None
    for htmlfile in filelist:
        parser = Parser(htmlfile, opts, content_dir, resource_map, filelist)
        parser.save()
    if parser is None:
        # Nothing was parsed. Previously this case raised an
        # UnboundLocalError when building the return value.
        return resource_map, {}
    return resource_map, parser.htmlfile_map

def merge_metadata(htmlfile, opf, opts):
    '''
    Merge metadata from various sources.

    The sources are, in order of increasing priority: `opf` (or, if there is
    none, the metadata read from `htmlfile`), the OPF file named by
    ``opts.from_opf`` if it is readable, and the ``title``, ``authors``,
    ``publisher``, ``tags`` and ``comments`` attributes of `opts`. An existing
    file named by ``opts.cover`` is used as cover. A missing title falls back
    to the file name of `htmlfile` and missing authors to Unknown.

    :return: The merged :class:`MetaInformation` object.
    '''
    if opf:
        mi = MetaInformation(opf)
    elif htmlfile:
        from calibre.ebooks.metadata.meta import get_metadata
        try:
            mi =  get_metadata(open(htmlfile, 'rb'), 'html')
        except Exception:
            # Unreadable metadata is not fatal, start from an empty record
            mi = MetaInformation(None, None)
    else:
        # Neither source is available. Previously mi was left unbound here
        # and the code below failed with an UnboundLocalError.
        mi = MetaInformation(None, None)
    if opts.from_opf is not None and os.access(opts.from_opf, os.R_OK):
        mi.smart_update(OPF(open(opts.from_opf, 'rb'), os.path.abspath(os.path.dirname(opts.from_opf))))
    for attr in ('title', 'authors', 'publisher', 'tags', 'comments'):
        val = getattr(opts, attr, None)
        # Unset options and the Unknown placeholder must not override
        # metadata found in the files
        if val is None or val == _('Unknown') or val == [_('Unknown')]:
            continue
        if attr =='authors':
            val = [i.strip() for i in val.split('&') if i.strip()]
        elif attr == 'tags':
            val = [i.strip() for i in val.split(',') if i.strip()]
        setattr(mi, attr, val)

    cover = getattr(opts, 'cover', False)
    if cover and os.path.exists(cover):
        mi.cover = os.path.abspath(cover)

    if not mi.title:
        if htmlfile:
            mi.title = os.path.splitext(os.path.basename(htmlfile))[0]
        else:
            mi.title = _('Unknown')
    if not mi.authors:
        mi.authors = [_('Unknown')]
    return mi

def create_metadata(basepath, mi, filelist, resources):
    '''
    Create an OPF metadata object with correct spine and manifest.

    The manifest lists every file of `filelist` (below ``content/``) as XHTML,
    every entry of `resources` without a media type and, for every file that
    has one below `basepath`, the stylesheet
    ``content/resources/<file>.css``. The spine consists of the files of
    `filelist` in order. Returns the :class:`OPFCreator` object.
    '''
    creator = OPFCreator(basepath, mi)
    creator.guide = None
    spine = ['content/'+name for name in filelist]
    manifest = [(href, 'application/xhtml+xml') for href in spine]
    manifest.extend((res, None) for res in resources)
    for name in filelist:
        stylesheet = name+'.css'
        if os.path.exists(os.path.join(basepath, 'content', 'resources', stylesheet)):
            manifest.append(('content/resources/'+stylesheet, 'text/css'))
    creator.create_manifest(manifest)
    creator.create_spine(spine)
    return creator

def rebase_toc(toc, htmlfile_map, basepath, root=True):
    '''
    Rebase a :class:`calibre.ebooks.metadata.toc.TOC` object. Maps all entries
    in the TOC to point to their new locations relative to the new OPF file.

    :param toc: The TOC node whose descendants are rebased.
    :param htmlfile_map: Mapping from the absolute path of an HTML file to its
        new name. Entries whose `abspath` is not in the mapping are left
        untouched.
    :param basepath: Stored as `base_path` on the top level node.
    :param root: True only for the top level call; the recursive calls pass
        False.
    '''
    for entry in toc:
        rebase_toc(entry, htmlfile_map, basepath, root=False)
        # Direct membership test on the mapping; going through .keys() built
        # a list and scanned it linearly for every single entry
        if entry.abspath in htmlfile_map:
            entry.href = 'content/' + htmlfile_map[entry.abspath]
    if root:
        toc.base_path = basepath

def create_dir(htmlfile, opts):
    '''
    Create a directory that contains the open ebook
    '''
    if htmlfile.lower().endswith('.opf'):
        opf = OPF(open(htmlfile, 'rb'), os.path.dirname(os.path.abspath(htmlfile)))
        filelist = opf_traverse(opf, verbose=opts.verbose, encoding=opts.encoding)
        mi = MetaInformation(opf)
    else:
        opf, filelist = get_filelist(htmlfile, opts)
        mi = merge_metadata(htmlfile, opf, opts)

    resource_map, htmlfile_map = parse_content(filelist, opts)
    resources = [os.path.join(opts.output, 'content', f) for f in resource_map.values()]

    if opf and opf.cover and os.access(opf.cover, os.R_OK):
        cpath = os.path.join(opts.output, 'content', 'resources', '_cover_'+os.path.splitext(opf.cover)[-1])
        shutil.copyfile(opf.cover, cpath)
        resources.append(cpath)
        mi.cover = cpath

    spine = [htmlfile_map[f.path] for f in filelist]
    mi = create_metadata(opts.output, mi, spine, resources)
    buf = cStringIO.StringIO()
    if mi.toc:
        rebase_toc(mi.toc, htmlfile_map, opts.output)
    with open(os.path.join(opts.output, 'metadata.opf'), 'wb') as f:
        mi.render(f, buf)
    toc = buf.getvalue()
    if toc:
        with open(os.path.join(opts.output, 'toc.ncx'), 'wb') as f:
            f.write(toc)
    print 'Open ebook created in', opts.output

def create_oebzip(htmlfile, opts):
    '''
    Create a zip file that contains the Open ebook.
    '''
    tdir = PersistentTemporaryDirectory('_create_oebzip')
    if opts.output is None:
        opts.output = os.path.join(os.path.splitext(htmlfile)[0]+'.oeb.zip')
    ofile = opts.output
    opts.output = tdir
    create_dir(htmlfile, opts)
    zf = ZipFile(ofile, 'w')
    zf.add_dir(opts.output)
    print 'Output saved to', ofile

def main(args=sys.argv):
    parser = option_parser()
    opts, args = parser.parse_args(args)
    if len(args) < 2:
        parser.print_help()
        print _('You must specify an input HTML file')
        return 1

    htmlfile = args[1]
    if opts.zip:
        create_oebzip(htmlfile, opts)
    else:
        create_dir(htmlfile, opts)

    return 0

def gui_main(htmlfile, pt=None):
    '''
    Convenience wrapper for use in recursively importing HTML files.

    :param htmlfile: The file to convert.
    :param pt: Optional temporary file object whose `name` is used as the
        output zip file. It is closed before the conversion starts. A new
        :class:`PersistentTemporaryFile` is created if it is None.
    :return: The path of the created zip file, or None if the zip contains
        fewer than two non-OPF entries with a compressed size above one byte.
    '''
    if pt is None:
        pt = PersistentTemporaryFile('_html2oeb_gui.oeb.zip')
    pt.close()
    opts = '''
pretty_print = True
max_levels = 5
output  = %s
'''%repr(pt.name)
    c = config(defaults=opts)
    opts = c.parse()
    create_oebzip(htmlfile, opts)
    zf = ZipFile(pt.name, 'r')
    try:
        nontrivial = [f for f in zf.infolist() if f.compress_size > 1 and not f.filename.endswith('.opf')]
    finally:
        # Do not leak the handle of the archive that was only inspected
        zf.close()
    if len(nontrivial) < 2:
        return None
    return pt.name


# When executed as a script, run main() and use its return value as the
# exit status of the process
if __name__ == '__main__':
    sys.exit(main())