Support for splitting HTML files to respect maximum flow size limit for EPUB on the SONY Reader.

This commit is contained in:
Kovid Goyal 2008-09-21 22:47:43 -07:00
parent 5c37760a27
commit 35c8db2dd7
14 changed files with 515 additions and 199 deletions

View File

@ -170,6 +170,19 @@ def fit_image(width, height, pwidth, pheight):
return scaled, int(width), int(height) return scaled, int(width), int(height)
class CurrentDir(object):
    '''
    Context manager that changes the process working directory to `path`
    on entry and restores the previous working directory on exit.
    '''

    def __init__(self, path):
        self.path = path  # directory to switch into on __enter__
        self.cwd = None   # previous working directory, recorded on entry

    def __enter__(self, *args):
        self.cwd = os.getcwd()
        os.chdir(self.path)
        # Return the old directory so callers can refer to it in the with block
        return self.cwd

    def __exit__(self, *args):
        # Restore whatever directory was current when we entered
        os.chdir(self.cwd)
def sanitize_file_name(name): def sanitize_file_name(name):
''' '''

View File

@ -105,5 +105,8 @@ to auto-generate a Table of Contents.
help=_('Print generated OPF file to stdout')) help=_('Print generated OPF file to stdout'))
c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug', c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug',
help=_('Print generated NCX file to stdout')) help=_('Print generated NCX file to stdout'))
c.add_opt('keep_intermediate', ['--keep-intermediate-files'], group='debug', default=False,
help=_('Keep intermediate files during processing by html2epub'))
c.add_opt('extract_to', ['--extract-to'], group='debug', default=None,
help=_('Extract the contents of the produced EPUB file to the specified directory.'))
return c return c

View File

@ -97,7 +97,9 @@ def convert(htmlfile, opts, notification=None):
opts.chapter = XPath(opts.chapter, opts.chapter = XPath(opts.chapter,
namespaces={'re':'http://exslt.org/regular-expressions'}) namespaces={'re':'http://exslt.org/regular-expressions'})
with TemporaryDirectory('_html2epub') as tdir: with TemporaryDirectory(suffix='_html2epub', keep=opts.keep_intermediate) as tdir:
if opts.keep_intermediate:
print 'Intermediate files in', tdir
resource_map, htmlfile_map, generated_toc = parse_content(filelist, opts, tdir) resource_map, htmlfile_map, generated_toc = parse_content(filelist, opts, tdir)
resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()] resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()]
@ -159,6 +161,8 @@ def convert(htmlfile, opts, notification=None):
epub = initialize_container(opts.output) epub = initialize_container(opts.output)
epub.add_dir(tdir) epub.add_dir(tdir)
print 'Output written to', opts.output print 'Output written to', opts.output
if opts.extract_to is not None:
epub.extractall(opts.extract_to)
def main(args=sys.argv): def main(args=sys.argv):

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python from __future__ import with_statement
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
@ -7,176 +7,347 @@ __docformat__ = 'restructuredtext en'
Split the flows in an epub file to conform to size limitations. Split the flows in an epub file to conform to size limitations.
''' '''
import sys, os, math, copy import os, math, copy, logging, functools
from urllib import unquote
from lxml.etree import parse, XMLParser from lxml.etree import XPath as _XPath
from lxml import etree, html
from lxml.cssselect import CSSSelector from lxml.cssselect import CSSSelector
from cssutils import CSSParser
from calibre.ebooks.metadata.opf2 import OPF from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.epub import tostring from calibre.ebooks.epub import tostring
from calibre import CurrentDir, LoggingInterface
PARSER = XMLParser(recover=True) XPath = functools.partial(_XPath, namespaces={'re':'http://exslt.org/regular-expressions'})
content = functools.partial(os.path.join, 'content')
SPLIT_ATTR = 'cs'
SPLIT_POINT_ATTR = 'csp'
class SplitError(ValueError): class SplitError(ValueError):
def __init__(self, path): def __init__(self, path, root):
ValueError.__init__(self, _('Could not find reasonable point at which to split: ')+os.path.basename(path)) size = len(tostring(root))/1024.
ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')%
(os.path.basename(path), size))
def split_tree(tree, split_point, before, opts, filepath):
trees = set([])
tree2 = copy.deepcopy(tree)
path = tree.getpath(split_point)
root, root2 = tree.getroot(), tree2.getroot()
body, body2 = root.xpath('//body')[0], root2.xpath('//body')[0]
split_point2 = root2.xpath(path)[0]
# Tree 1
hit_split_point = False
for elem in body.iterdescendants():
if elem is split_point:
hit_split_point = True
if before:
elem.text = u''
elem.tail = u''
elem.set('calibre_split', '1')
continue
if hit_split_point:
elem.text = u''
elem.tail = u''
elem.set('calibre_split', '1' if hit_split_point else '0')
# Tree 2 class Splitter(LoggingInterface):
hit_split_point = False
for elem in body2.iterdescendants():
if elem is split_point2:
hit_split_point = True
if not before:
elem.text = u''
elem.tail = u''
elem.set('calibre_split', '1')
continue
if not hit_split_point:
elem.text = u''
elem.tail = u''
elem.set('calibre_split', '0' if hit_split_point else '1')
for t, r in [(tree, root), (tree2, root2)]: def __init__(self, path, opts, always_remove=False):
if len(tostring(r)) < opts.profile.flow_size: LoggingInterface.__init__(self, logging.getLogger('htmlsplit'))
trees.append(t) self.setup_cli_handler(opts.verbose)
self.path = path
self.always_remove = always_remove
self.base = os.path.splitext(path)[0] + '_split_%d.html'
self.opts = opts
self.log_info('\tSplitting %s (%d KB)', path, os.stat(content(path)).st_size/1024.)
root = html.fromstring(open(content(path)).read())
css = XPath('//link[@type = "text/css" and @rel = "stylesheet"]')(root)
if css:
cssp = os.path.join('content', *(css[0].get('href').split('/')))
self.log_debug('\t\tParsing stylesheet...')
stylesheet = CSSParser().parseString(open(cssp, 'rb').read())
else: else:
new_split_point, before = find_split_point(t) stylesheet = None
if new_split_point is None: self.page_breaks = []
raise SplitError(filepath) if stylesheet is not None:
trees.extend(split_tree(t, new_split_point, before, opts, filepath)) self.find_page_breaks(stylesheet, root)
return trees self.trees = self.split(root.getroottree())
self.commit()
self.log_info('\t\tSplit into %d parts.', len(self.trees))
if self.opts.verbose:
for f in self.files:
self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
self.trees = None
def split(self, tree):
'''
Split ``tree`` into a *before* and *after* tree, preserving tag structure,
but not duplicating any text. All tags that have had their text and tail
removed have the attribute ``calibre_split`` set to 1.
'''
self.log_debug('\t\tSplitting...')
root = tree.getroot()
split_point, before = self.find_split_point(root)
if split_point is None:
if not self.always_remove:
self.log_warn(_('\t\tToo much markup. Re-splitting without structure preservation. This may cause incorrect rendering.'))
raise SplitError(self.path, root)
tree2 = copy.deepcopy(tree)
root2 = tree2.getroot()
body, body2 = root.body, root2.body
trees = []
path = tree.getpath(split_point)
split_point2 = root2.xpath(path)[0]
def nix_element(elem, top=True):
if self.always_remove:
parent = elem.getparent()
index = parent.index(elem)
if top:
parent.remove(elem)
else:
index = parent.index(elem)
parent[index:index+1] = list(elem.iterchildren())
else:
elem.text = u''
elem.tail = u''
elem.set(SPLIT_ATTR, '1')
if elem.tag.lower() in ['ul', 'ol', 'dl', 'table', 'hr', 'img']:
elem.set('style', 'display:none;')
def fix_split_point(sp):
sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid')
# Tree 1
hit_split_point = False
for elem in list(body.iterdescendants(etree.Element)):
if elem.get(SPLIT_ATTR, '0') == '1':
continue
if elem is split_point:
hit_split_point = True
if before:
nix_element(elem)
fix_split_point(elem)
continue
if hit_split_point:
nix_element(elem)
def find_split_point(tree): # Tree 2
root = tree.getroot() hit_split_point = False
css = root.xpath('//style[@type="text/css"]') for elem in list(body2.iterdescendants(etree.Element)):
if css: if elem.get(SPLIT_ATTR, '0') == '1':
continue
if elem is split_point2:
hit_split_point = True
if not before:
nix_element(elem, top=False)
fix_split_point(elem)
continue
if not hit_split_point:
nix_element(elem, top=False)
def pick_elem(elems): for t, r in [(tree, root), (tree2, root2)]:
if elems: size = len(tostring(r))
elems = [i for i in elems if elem.get('calibre_split', '0') != '1'] if size <= self.opts.profile.flow_size:
if elems: trees.append(t)
i = int(math.floor(len(elems)/2.)) self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)', len(trees), size/1024.)
return elems[i] else:
trees.extend(self.split(t))
def selector_element(rule): return trees
try:
selector = CSSSelector(rule.selectorText)
return pick_elem(selector(root))
except:
return None
css = css[0].text def find_page_breaks(self, stylesheet, root):
from cssutils import CSSParser '''
stylesheet = CSSParser().parseString(css) Find all elements that have either page-break-before or page-break-after set.
'''
page_break_selectors = set([])
for rule in stylesheet: for rule in stylesheet:
if rule.type != rule.STYLE_RULE: if rule.type != rule.STYLE_RULE:
continue continue
before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower() before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower()
if before and before != 'avoid':
elem = selector_element(rule)
if elem is not None:
return elem, True
after = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower() after = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower()
if after and after != 'avoid': try:
elem = selector_element(rule) if before and before != 'avoid':
if elem is not None: page_break_selectors.add((CSSSelector(rule.selectorText), True))
return elem, False except:
pass
try:
if after and after != 'avoid':
page_break_selectors.add((CSSSelector(rule.selectorText), False))
except:
pass
for path in ('//*[re:match(name(), "h[1-6]", "i")', '/body/div', '//p'): page_breaks = set([])
elems = root.xpath(path) for selector, before in page_break_selectors:
elem = pick_elem(elems) for elem in selector(root):
elem.pb_before = before
page_breaks.add(elem)
for i, elem in enumerate(root.iter()):
elem.pb_order = i
page_breaks = list(page_breaks)
page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
tree = root.getroottree()
self.page_breaks = [(XPath(tree.getpath(x)), x.pb_before) for x in page_breaks]
def find_split_point(self, root):
'''
Find the tag at which to split the tree rooted at `root`.
Search order is:
* page breaks
* Heading tags
* <div> tags
* <p> tags
We try to split in the "middle" of the file (as defined by tag counts.
'''
def pick_elem(elems):
if elems:
elems = [i for i in elems if i.get(SPLIT_POINT_ATTR, '0') != '1'\
and i.get(SPLIT_ATTR, '0') != '1']
if elems:
i = int(math.floor(len(elems)/2.))
elems[i].set(SPLIT_POINT_ATTR, '1')
return elems[i]
page_breaks = []
for x in self.page_breaks:
pb = x[0](root)
if pb:
page_breaks.append(pb[0])
elem = pick_elem(page_breaks)
if elem is not None: if elem is not None:
return elem, True i = page_breaks.index(elem)
return elem, self.page_breaks[i][1]
return None, True
def do_split(path, opts):
tree = parse(path, parser=PARSER)
split_point, before = find_split_point(tree)
if split_point is None:
raise SplitError(path)
trees = split_tree(tree, split_point, before, opts, path)
base = os.path.splitext(os.path.basename(path))[0] + '_split_%d.html'
anchor_map = {None:base%0}
files = []
for i, tree in enumerate(trees):
root = tree.getroot()
files.append(base%i)
for elem in root.xpath('//*[@id and @calibre_split = "1"]'):
anchor_map[elem.get('id')] = files[-1]
elem.attrib.pop('calibre_split')
for elem in root.xpath('//*[@calibre_split]'):
elem.attrib.pop('calibre_split')
open(os.path.join(os.path.dirname(path), files[-1]), 'wb').write(tostring(root, pretty_print=opts.pretty_print))
os.remove(path)
return path, files, anchor_map
def fix_opf(opf, orig_file, files, anchor_map):
orig = None
for item in opf.manifest:
if os.path.samefile(orig_file, item.path):
orig = item
break
opf.manifest.remove(orig)
ids = []
for f in files:
ids.append(opf.manifest.add_item(f))
index = None
for i, item in enumerate(opf.spine):
if item.id == orig.id:
index = i
break
for path in ('//*[re:match(name(), "h[1-6]", "i")]', '/html/body/div', '//p'):
elems = root.xpath(path)
elem = pick_elem(elems)
if elem is not None:
return elem, True
return None, True
def commit(self):
'''
Commit all changes caused by the split. This removes the previously
introduced ``calibre_split`` attribute and calculates an *anchor_map* for
all anchors in the original tree. Internal links are re-directed. The
original file is deleted and the split files are saved.
'''
self.anchor_map = {None:self.base%0}
self.files = []
for i, tree in enumerate(self.trees):
root = tree.getroot()
self.files.append(self.base%i)
for elem in root.xpath('//*[@id]'):
if elem.get(SPLIT_ATTR, '0') == '0':
self.anchor_map[elem.get('id')] = self.files[-1]
for elem in root.xpath('//*[@%s or @%s]'%(SPLIT_ATTR, SPLIT_POINT_ATTR)):
elem.attrib.pop(SPLIT_ATTR, None)
elem.attrib.pop(SPLIT_POINT_ATTR, '0')
for current, tree in zip(self.files, self.trees):
for a in tree.getroot().xpath('//a[@href]'):
href = a.get('href').strip()
if href.startswith('#'):
anchor = href[1:]
file = self.anchor_map[anchor]
if file != current:
a.set('href', file+href)
open(content(current), 'wb').\
write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print))
os.remove(content(self.path))
def fix_opf(self, opf):
'''
Fix references to the split file in the OPF.
'''
items = [item for item in opf.itermanifest() if item.get('href') == 'content/'+self.path]
new_items = [('content/'+f, None) for f in self.files]
id_map = {}
for item in items:
id_map[item.get('id')] = opf.replace_manifest_item(item, new_items)
for id in id_map.keys():
opf.replace_spine_items_by_idref(id, id_map[id])
for ref in opf.iterguide():
href = ref.get('href', '')
if href.startswith('content/'+self.path):
href = href.split('#')
frag = None
if len(href) > 1:
frag = href[1]
new_file = self.anchor_map[frag]
ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag)))
def fix_content_links(html_files, changes, opts):
split_files = [f.path for f in changes]
anchor_maps = [f.anchor_map for f in changes]
files = list(html_files)
for j, f in enumerate(split_files):
try:
i = files.index(f)
files[i:i+1] = changes[j].files
except ValueError:
continue
for htmlfile in files:
changed = False
root = html.fromstring(open(content(htmlfile), 'rb').read())
for a in root.xpath('//a[@href]'):
href = a.get('href')
if not href.startswith('#'):
href = href.split('#')
anchor = href[1] if len(href) > 1 else None
href = href[0]
if href in split_files:
newf = anchor_maps[split_files.index(href)][anchor]
frag = ('#'+anchor) if anchor else ''
a.set('href', newf+frag)
changed = True
if changed:
open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print))
def fix_ncx(path, changes):
split_files = [f.path for f in changes]
anchor_maps = [f.anchor_map for f in changes]
tree = etree.parse(path)
changed = False
for content in tree.getroot().xpath('//x:content[@src]', namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}):
href = content.get('src')
if not href.startswith('#'):
href = href.split('#')
anchor = href[1] if len(href) > 1 else None
href = href[0].split('/')[-1]
if href in split_files:
newf = anchor_maps[split_files.index(href)][anchor]
frag = ('#'+anchor) if anchor else ''
content.set('src', 'content/'+newf+frag)
changed = True
if changed:
open(path, 'wb').write(etree.tostring(tree.getroot(), encoding='UTF-8', xml_declaration=True))
def split(pathtoopf, opts): def split(pathtoopf, opts):
return
pathtoopf = os.path.abspath(pathtoopf) pathtoopf = os.path.abspath(pathtoopf)
opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf)) with CurrentDir(os.path.dirname(pathtoopf)):
html_files = [] opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
for item in opf.manifest: html_files = []
if 'html' in item.mime_type.lower(): for item in opf.itermanifest():
html_files.append(item.path) if 'html' in item.get('media-type', '').lower():
changes = [] html_files.append(unquote(item.get('href')).split('/')[-1])
for f in html_files: changes = []
if os.stat(f).st_size > opts.profile.flow_size: for f in html_files:
fix_opf(opf, *do_split(f, opts)) if os.stat(content(f)).st_size > opts.profile.flow_size:
if changes: try:
pass changes.append(Splitter(f, opts))
except SplitError:
changes.append(Splitter(f, opts, always_remove=True))
changes[-1].fix_opf(opf)
open(pathtoopf, 'wb').write(opf.render())
fix_content_links(html_files, changes, opts)
for item in opf.itermanifest():
if item.get('media-type', '') == 'application/x-dtbncx+xml':
fix_ncx(item.get('href'), changes)
def main(args=sys.argv): break
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@ -228,8 +228,14 @@ def opf_traverse(opf_reader, verbose=0, encoding=None):
raise ValueError('OPF does not have a spine') raise ValueError('OPF does not have a spine')
flat = [] flat = []
for path in opf_reader.spine.items(): for path in opf_reader.spine.items():
path = os.path.abspath(path)
if path not in flat: if path not in flat:
flat.append(os.path.abspath(path)) flat.append(os.path.abspath(path))
for item in opf_reader.manifest:
if 'html' in item.mime_type:
path = os.path.abspath(item.path)
if path not in flat:
flat.append(path)
flat = [HTMLFile(path, 0, encoding, verbose) for path in flat] flat = [HTMLFile(path, 0, encoding, verbose) for path in flat]
return flat return flat
@ -329,14 +335,15 @@ class Parser(PreProcessor, LoggingInterface):
if self.root.get(bad, None) is not None: if self.root.get(bad, None) is not None:
self.root.attrib.pop(bad) self.root.attrib.pop(bad)
def save_path(self):
    # Absolute path this processed HTML file will be written to: the
    # working temp dir joined with the (possibly renamed) output name
    # that was assigned to this input file in htmlfile_map.
    return os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path])
def save(self): def save(self):
''' '''
Save processed HTML into the content directory. Save processed HTML into the content directory.
Should be called after all HTML processing is finished. Should be called after all HTML processing is finished.
''' '''
with open(os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]), 'wb') as f: with open(self.save_path(), 'wb') as f:
ans = tostring(self.root, pretty_print=self.opts.pretty_print) ans = tostring(self.root, pretty_print=self.opts.pretty_print)
ans = re.compile(r'<html>', re.IGNORECASE).sub('<html xmlns="http://www.w3.org/1999/xhtml">', ans) ans = re.compile(r'<html>', re.IGNORECASE).sub('<html xmlns="http://www.w3.org/1999/xhtml">', ans)
ans = re.compile(r'<head[^<>]*?>', re.IGNORECASE).sub('<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n', ans) ans = re.compile(r'<head[^<>]*?>', re.IGNORECASE).sub('<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n', ans)
@ -390,21 +397,26 @@ class Parser(PreProcessor, LoggingInterface):
if not isinstance(olink, unicode): if not isinstance(olink, unicode):
olink = olink.decode(self.htmlfile.encoding) olink = olink.decode(self.htmlfile.encoding)
link = self.htmlfile.resolve(olink) link = self.htmlfile.resolve(olink)
frag = (('#'+link.fragment) if link.fragment else '')
if link.path == self.htmlfile.path:
return frag if frag else '#'
if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path): if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
return olink return olink
if link.path in self.htmlfiles: if link.path in self.htmlfiles:
return self.htmlfile_map[link.path] return self.htmlfile_map[link.path] + frag
if re.match(r'\.(x){0,1}htm(l){0,1}', os.path.splitext(link.path)[1]) is not None: if re.match(r'\.(x){0,1}htm(l){0,1}', os.path.splitext(link.path)[1]) is not None:
return olink # This happens when --max-levels is used return olink # This happens when --max-levels is used
if link.path in self.resource_map.keys(): if link.path in self.resource_map.keys():
return self.resource_map[link.path] return self.resource_map[link.path] + frag
name = os.path.basename(link.path) name = os.path.basename(link.path)
name, ext = os.path.splitext(name) name, ext = os.path.splitext(name)
name += ('_%d'%len(self.resource_map)) + ext name += ('_%d'%len(self.resource_map)) + ext
shutil.copyfile(link.path, os.path.join(self.resource_dir, name)) shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
name = 'resources/' + name name = 'resources/' + name
self.resource_map[link.path] = name self.resource_map[link.path] = name
return name return name + frag
class Processor(Parser): class Processor(Parser):
''' '''
@ -438,9 +450,12 @@ class Processor(Parser):
def save(self): def save(self):
head = self.head if self.head is not None else self.body head = self.head if self.head is not None else self.body
style = etree.SubElement(head, 'style', attrib={'type':'text/css'}) style_path = os.path.basename(self.save_path())+'.css'
style.text='\n'+self.css style = etree.SubElement(head, 'link', attrib={'type':'text/css', 'rel':'stylesheet',
'href':'resources/'+style_path})
style.tail = '\n\n' style.tail = '\n\n'
style_path = os.path.join(os.path.dirname(self.save_path()), 'resources', style_path)
open(style_path, 'wb').write(self.css.encode('utf-8'))
return Parser.save(self) return Parser.save(self)
def populate_toc(self, toc): def populate_toc(self, toc):
@ -530,6 +545,8 @@ class Processor(Parser):
css.append('\n'.join(style.xpath('./text()'))) css.append('\n'.join(style.xpath('./text()')))
style.getparent().remove(style) style.getparent().remove(style)
cache = {}
class_counter = 0
for font in self.root.xpath('//font'): for font in self.root.xpath('//font'):
try: try:
size = int(font.attrib.pop('size', '3')) size = int(font.attrib.pop('size', '3'))
@ -542,16 +559,33 @@ class Processor(Parser):
color = font.attrib.pop('color', None) color = font.attrib.pop('color', None)
if color is not None: if color is not None:
setting += 'color:%s'%color setting += 'color:%s'%color
id = get_id(font, counter) classname = cache.get(setting, None)
counter += 1 if classname is None:
css.append('#%s { %s }'%(id, setting)) classname = 'calibre_class_%d'%class_counter
class_counter += 1
cache[setting] = classname
cn = font.get('class', '')
if cn: cn += ' '
cn += classname
font.set('class', cn)
for elem in self.root.xpath('//*[@style]'): for elem in self.root.xpath('//*[@style]'):
id = get_id(elem, counter) setting = elem.get('style')
counter += 1 classname = cache.get(setting, None)
css.append('#%s {%s}'%(id, elem.get('style'))) if classname is None:
classname = 'calibre_class_%d'%class_counter
class_counter += 1
cache[setting] = classname
cn = elem.get('class', '')
if cn: cn += ' '
cn += classname
elem.set('class', cn)
elem.attrib.pop('style') elem.attrib.pop('style')
for setting, cn in cache.items():
css.append('.%s {%s}'%(cn, setting))
self.raw_css = '\n\n'.join(css) self.raw_css = '\n\n'.join(css)
self.css = unicode(self.raw_css) self.css = unicode(self.raw_css)
if self.opts.override_css: if self.opts.override_css:
@ -688,6 +722,9 @@ def create_metadata(basepath, mi, filelist, resources):
''' '''
mi = OPFCreator(basepath, mi) mi = OPFCreator(basepath, mi)
entries = [('content/'+f, 'application/xhtml+xml') for f in filelist] + [(f, None) for f in resources] entries = [('content/'+f, 'application/xhtml+xml') for f in filelist] + [(f, None) for f in resources]
for f in filelist:
if os.path.exists(os.path.join(basepath, 'content', 'resources', f+'.css')):
entries.append(('content/resources/'+f+'.css', 'text/css'))
mi.create_manifest(entries) mi.create_manifest(entries)
mi.create_spine(['content/'+f for f in filelist]) mi.create_spine(['content/'+f for f in filelist])
return mi return mi

View File

@ -143,7 +143,8 @@ class ResourceCollection(object):
self._resources.remove(resource) self._resources.remove(resource)
def replace(self, start, end, items): def replace(self, start, end, items):
pass 'Same as list[start:end] = items'
self._resources[start:end] = items
@staticmethod @staticmethod
def from_directory_contents(top, topdown=True): def from_directory_contents(top, topdown=True):

View File

@ -156,6 +156,19 @@ class Spine(ResourceCollection):
self.manifest = manifest self.manifest = manifest
def replace(self, start, end, ids):
    '''
    Replace the items between start (inclusive) and end (not inclusive)
    with the items identified by ids. ids can be a list of any length.

    :param start: Index of the first spine item to replace.
    :param end:   Index one past the last spine item to replace.
    :param ids:   Manifest ids of the replacement items.
    :raises ValueError: If an id is not present in the manifest.
    '''
    items = []
    for id in ids:
        path = self.manifest.path_for_id(id)
        if path is None:
            # Interpolate the offending id (the original format string
            # was never applied, so the message always showed '%s')
            raise ValueError('id %s not in manifest'%id)
        # Bind id as a default argument: a plain `lambda x: id` late-binds
        # and every Item would report the id of the *last* loop iteration.
        items.append(Spine.Item(lambda x, id=id: id, path, is_path=True))
    # replace() is an instance method on the base class; it must be
    # invoked with self (the original call omitted it -> TypeError).
    ResourceCollection.replace(self, start, end, items)
def linear_items(self): def linear_items(self):
for r in self: for r in self:
if r.is_linear: if r.is_linear:
@ -297,6 +310,55 @@ class OPF(object):
def get_text(self, elem): def get_text(self, elem):
return u''.join(self.TEXT(elem)) return u''.join(self.TEXT(elem))
def itermanifest(self):
    # Iterate over all <item> elements in the OPF <manifest>
    # (manifest_path is presumably a precompiled XPath — it is not
    # visible here; confirm in the class definition).
    return self.manifest_path(self.tree)
def create_manifest_item(self, href, media_type):
    '''
    Create a new manifest <item> element with a unique auto-generated id.

    :param href: Value of the href attribute of the new item.
    :param media_type: MIME type of the item; falsy values default to
                       ``application/xhtml+xml``.
    :return: A new element, not yet attached to the manifest.
    '''
    # Use a set for O(1) membership tests while probing for a free id;
    # the original list made the probe loop quadratic in manifest size.
    ids = set(i.get('id', None) for i in self.itermanifest())
    id = None
    for c in xrange(1, sys.maxint):
        id = 'id%d'%c
        if id not in ids:
            break
    if not media_type:
        media_type = 'application/xhtml+xml'
    ans = etree.Element('{%s}item'%self.NAMESPACES['opf'],
            attrib={'id':id, 'href':href, 'media-type':media_type})
    ans.tail = '\n\t\t'  # keep the serialized manifest readable
    return ans
def replace_manifest_item(self, item, items):
    '''
    Replace the manifest element `item` with several new items.

    :param item:  An existing <item> element from the manifest.
    :param items: List of (href, media_type) tuples for the replacements.
    :return: The ids of the newly created manifest items; each id is the
             original item's id with a ``.N`` suffix (N starting at 1).
    '''
    items = [self.create_manifest_item(*i) for i in items]
    for i, item2 in enumerate(items):
        item2.set('id', item.get('id')+'.%d'%(i+1))
    manifest = item.getparent()
    index = manifest.index(item)
    # Splice the new items into the manifest at the old item's position,
    # removing the old item in the same operation.
    manifest[index:index+1] = items
    return [i.get('id') for i in items]
def iterspine(self):
    # Iterate over all <itemref> elements in the OPF <spine>
    # (spine_path is presumably a precompiled XPath — not visible here).
    return self.spine_path(self.tree)
def create_spine_item(self, idref):
    # Build a spine <itemref> element pointing at the manifest item `idref`.
    ans = etree.Element('{%s}itemref'%self.NAMESPACES['opf'], idref=idref)
    ans.tail = '\n\t\t'  # keep the serialized spine readable
    return ans
def replace_spine_items_by_idref(self, idref, new_idrefs):
    '''
    Replace every spine <itemref> whose idref equals `idref` with new
    <itemref> elements, one per id in `new_idrefs`.
    '''
    items = list(map(self.create_spine_item, new_idrefs))
    spine = self.XPath('/opf:package/*[re:match(name(), "spine", "i")]')(self.tree)[0]
    old = [i for i in self.iterspine() if i.get('idref', None) == idref]
    for x in old:
        i = spine.index(x)
        # NOTE(review): if `idref` occurs more than once, the *same*
        # element objects in `items` are re-inserted on each iteration
        # (lxml moves elements rather than copying) — confirm spines
        # never contain duplicate idrefs.
        spine[i:i+1] = items
def iterguide(self):
    # Iterate over all <reference> elements in the OPF <guide>
    # (guide_path is presumably a precompiled XPath — not visible here).
    return self.guide_path(self.tree)
def render(self):
    '''Serialize the OPF tree to pretty-printed UTF-8 XML with an XML declaration.'''
    return etree.tostring(self.tree, encoding='UTF-8', xml_declaration=True,
                          pretty_print=True)
@apply @apply
def authors(): def authors():

View File

@ -24,6 +24,8 @@ class TOC(list):
base_path=os.getcwd()): base_path=os.getcwd()):
self.href = href self.href = href
self.fragment = fragment self.fragment = fragment
if not self.fragment:
self.fragment = None
self.text = text self.text = text
self.parent = parent self.parent = parent
self.base_path = base_path self.base_path = base_path
@ -153,8 +155,20 @@ class TOC(list):
continue continue
purl = urlparse(unquote(a['href'])) purl = urlparse(unquote(a['href']))
href, fragment = purl[2], purl[5] href, fragment = purl[2], purl[5]
if not fragment:
fragment = None
else:
fragment = fragment.strip()
href = href.strip()
txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)]) txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
self.add_item(href, fragment, txt) add = True
for i in self.flat():
if i.href == href and i.fragment == fragment:
add = False
break
if add:
self.add_item(href, fragment, txt)
def render(self, stream, uid): def render(self, stream, uid):
from calibre.resources import ncx_template from calibre.resources import ncx_template

View File

@ -3,7 +3,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
''' '''
Miscellaneous widgets used in the GUI Miscellaneous widgets used in the GUI
''' '''
import re, os import re, os, traceback
from PyQt4.QtGui import QListView, QIcon, QFont, QLabel, QListWidget, \ from PyQt4.QtGui import QListView, QIcon, QFont, QLabel, QListWidget, \
QListWidgetItem, QTextCharFormat, QApplication, \ QListWidgetItem, QTextCharFormat, QApplication, \
QSyntaxHighlighter, QCursor, QColor, QWidget, QDialog, \ QSyntaxHighlighter, QCursor, QColor, QWidget, QDialog, \
@ -254,7 +254,12 @@ class FontFamilyModel(QAbstractListModel):
def __init__(self, *args): def __init__(self, *args):
QAbstractListModel.__init__(self, *args) QAbstractListModel.__init__(self, *args)
self.families = find_font_families() try:
self.families = find_font_families()
except:
self.families = []
print 'WARNING: Could not load fonts'
traceback.print_exc()
self.families.sort() self.families.sort()
self.families[:0] = ['None'] self.families[:0] = ['None']

View File

@ -278,7 +278,7 @@ def download_tarball():
def main(args=sys.argv): def main(args=sys.argv):
defdir = '/opt/calibre' defdir = '/opt/calibre'
destdir = raw_input('Enter the installation directory for calibre [%s]: '%defdir).strip() destdir = raw_input('Enter the installation directory for calibre (Its contents will be deleted!)[%s]: '%defdir).strip()
if not destdir: if not destdir:
destdir = defdir destdir = defdir
if os.path.exists(destdir): if os.path.exists(destdir):

View File

@ -147,6 +147,7 @@ def cli_docs(app):
info(bold('creating docs for %s...'%cmd)) info(bold('creating docs for %s...'%cmd))
open(os.path.join('cli', cmd+'.rst'), 'wb').write(raw) open(os.path.join('cli', cmd+'.rst'), 'wb').write(raw)
def auto_member(dirname, arguments, options, content, lineno, def auto_member(dirname, arguments, options, content, lineno,
content_offset, block_text, state, state_machine): content_offset, block_text, state, state_machine):
name = arguments[0] name = arguments[0]
@ -196,8 +197,7 @@ def auto_member(dirname, arguments, options, content, lineno,
node = nodes.paragraph() node = nodes.paragraph()
state.nested_parse(result, content_offset, node) state.nested_parse(result, content_offset, node)
return node return list(node)

View File

@ -134,6 +134,7 @@ There can be several causes for this:
* **Any windows version**: If this happens during an initial run of calibre, try deleting the folder you chose for your ebooks and restarting calibre. * **Any windows version**: If this happens during an initial run of calibre, try deleting the folder you chose for your ebooks and restarting calibre.
* **Windows Vista**: If the folder :file:`C:\\Users\\Your User Name\\AppData\\Local\\VirtualStore\\Program Files\\calibre` exists, delete it. Uninstall |app|. Reboot. Re-install. * **Windows Vista**: If the folder :file:`C:\\Users\\Your User Name\\AppData\\Local\\VirtualStore\\Program Files\\calibre` exists, delete it. Uninstall |app|. Reboot. Re-install.
* **Any windows version**: Search your computer for a folder named :file:`_ipython`. Delete it and try again. * **Any windows version**: Search your computer for a folder named :file:`_ipython`. Delete it and try again.
* **Any windows version**: Try disabling any antivirus program you have running and see if that fixes it. Also try disabling any firewall software that prevents connections to the local computer.
If it still wont launch, start a command prompt (press the windows key and R; then type :command:`cmd.exe` in the Run dialog that appears). At the command prompt type the following command and press Enter:: If it still wont launch, start a command prompt (press the windows key and R; then type :command:`cmd.exe` in the Run dialog that appears). At the command prompt type the following command and press Enter::

View File

@ -57,19 +57,21 @@ def PersistentTemporaryDirectory(suffix='', prefix='', dir=None):
atexit.register(shutil.rmtree, tdir, True) atexit.register(shutil.rmtree, tdir, True)
return tdir return tdir
class TemporaryDirectory(str): class TemporaryDirectory(object):
''' '''
A temporary directory to be used ina with statement. A temporary directory to be used in a with statement.
''' '''
def __init__(self, suffix='', prefix='', dir=None): def __init__(self, suffix='', prefix='', dir=None, keep=False):
self.suffix = suffix self.suffix = suffix
self.prefix = prefix self.prefix = prefix
self.dir = dir self.dir = dir
self.keep = keep
def __enter__(self): def __enter__(self):
self.tdir = tempfile.mkdtemp(self.suffix, __appname__+"_"+ __version__+"_" +self.prefix, self.dir) self.tdir = tempfile.mkdtemp(self.suffix, __appname__+"_"+ __version__+"_" +self.prefix, self.dir)
return self.tdir return self.tdir
def __exit__(self, *args): def __exit__(self, *args):
shutil.rmtree(self.tdir) if not self.keep:
shutil.rmtree(self.tdir)

View File

@ -53,6 +53,9 @@ def import_from_launchpad(url):
open(out, 'wb').write(tf.extractfile(next).read()) open(out, 'wb').write(tf.extractfile(next).read())
next = tf.next() next = tf.next()
check_for_critical_bugs() check_for_critical_bugs()
path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
print path
subprocess.check_call('python setup.py translations'.split(), dir=path)
return 0 return 0
def check_for_critical_bugs(): def check_for_critical_bugs():