Support for splitting HTML files to respect maximum flow size limit for EPUB on the SONY Reader.
Commit: 35c8db2dd7
Parent: 5c37760a27
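In outline, the change splits any HTML flow whose on-disk size exceeds the reader profile's flow size before the EPUB is assembled. A minimal sketch of that size check, as an editorial illustration only (names follow the diff below; the 300 KB figure is an invented placeholder, the real limit comes from opts.profile.flow_size):

import os

FLOW_SIZE = 300 * 1024   # hypothetical per-file limit; the real value comes from the device profile

def needs_split(path, flow_size=FLOW_SIZE):
    # A flow (one HTML file inside the EPUB) is only split when it exceeds the limit.
    return os.stat(path).st_size > flow_size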
@@ -170,6 +170,19 @@ def fit_image(width, height, pwidth, pheight):
    return scaled, int(width), int(height)

class CurrentDir(object):

    def __init__(self, path):
        self.path = path
        self.cwd = None

    def __enter__(self, *args):
        self.cwd = os.getcwd()
        os.chdir(self.path)
        return self.cwd

    def __exit__(self, *args):
        os.chdir(self.cwd)

def sanitize_file_name(name):
    '''
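CurrentDir above is a plain context manager, so callers can hop into a book's directory and be returned to the previous working directory when the block ends. A small usage sketch (the from calibre import matches the import added later in this commit; the directory name is made up):

import os
from calibre import CurrentDir

def list_book_files(book_dir):
    # Work inside book_dir for the duration of the block; the previous cwd is restored on exit.
    with CurrentDir(book_dir):
        return sorted(os.listdir('.'))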
@@ -105,5 +105,8 @@ to auto-generate a Table of Contents.
              help=_('Print generated OPF file to stdout'))
    c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug',
              help=_('Print generated NCX file to stdout'))
    c.add_opt('keep_intermediate', ['--keep-intermediate-files'], group='debug', default=False,
              help=_('Keep intermediate files during processing by html2epub'))
    c.add_opt('extract_to', ['--extract-to'], group='debug', default=None,
              help=_('Extract the contents of the produced EPUB file to the specified directory.'))
    return c
@@ -97,7 +97,9 @@ def convert(htmlfile, opts, notification=None):
    opts.chapter = XPath(opts.chapter,
                         namespaces={'re':'http://exslt.org/regular-expressions'})

    with TemporaryDirectory('_html2epub') as tdir:
    with TemporaryDirectory(suffix='_html2epub', keep=opts.keep_intermediate) as tdir:
        if opts.keep_intermediate:
            print 'Intermediate files in', tdir
        resource_map, htmlfile_map, generated_toc = parse_content(filelist, opts, tdir)
        resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()]

@@ -159,6 +161,8 @@ def convert(htmlfile, opts, notification=None):
        epub = initialize_container(opts.output)
        epub.add_dir(tdir)
        print 'Output written to', opts.output
        if opts.extract_to is not None:
            epub.extractall(opts.extract_to)


def main(args=sys.argv):
@@ -1,4 +1,4 @@
#!/usr/bin/env python
from __future__ import with_statement
__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
@@ -7,108 +7,212 @@ __docformat__ = 'restructuredtext en'
Split the flows in an epub file to conform to size limitations.
'''

import sys, os, math, copy
import os, math, copy, logging, functools
from urllib import unquote

from lxml.etree import parse, XMLParser
from lxml.etree import XPath as _XPath
from lxml import etree, html
from lxml.cssselect import CSSSelector
from cssutils import CSSParser

from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.epub import tostring
from calibre import CurrentDir, LoggingInterface

PARSER = XMLParser(recover=True)
XPath = functools.partial(_XPath, namespaces={'re':'http://exslt.org/regular-expressions'})
content = functools.partial(os.path.join, 'content')

SPLIT_ATTR = 'cs'
SPLIT_POINT_ATTR = 'csp'

class SplitError(ValueError):

    def __init__(self, path):
        ValueError.__init__(self, _('Could not find reasonable point at which to split: ')+os.path.basename(path))
    def __init__(self, path, root):
        size = len(tostring(root))/1024.
        ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')%
                            (os.path.basename(path), size))

def split_tree(tree, split_point, before, opts, filepath):
    trees = set([])

class Splitter(LoggingInterface):

    def __init__(self, path, opts, always_remove=False):
        LoggingInterface.__init__(self, logging.getLogger('htmlsplit'))
        self.setup_cli_handler(opts.verbose)
        self.path = path
        self.always_remove = always_remove
        self.base = os.path.splitext(path)[0] + '_split_%d.html'
        self.opts = opts
        self.log_info('\tSplitting %s (%d KB)', path, os.stat(content(path)).st_size/1024.)
        root = html.fromstring(open(content(path)).read())

        css = XPath('//link[@type = "text/css" and @rel = "stylesheet"]')(root)
        if css:
            cssp = os.path.join('content', *(css[0].get('href').split('/')))
            self.log_debug('\t\tParsing stylesheet...')
            stylesheet = CSSParser().parseString(open(cssp, 'rb').read())
        else:
            stylesheet = None
        self.page_breaks = []
        if stylesheet is not None:
            self.find_page_breaks(stylesheet, root)

        self.trees = self.split(root.getroottree())
        self.commit()
        self.log_info('\t\tSplit into %d parts.', len(self.trees))
        if self.opts.verbose:
            for f in self.files:
                self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
        self.trees = None
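The XPath helper bound with functools.partial above pre-wires the EXSLT regular-expressions namespace, which is what lets selectors like the heading match in find_split_point further down use re:match directly. A standalone illustration (the sample markup is invented):

import functools
from lxml import html
from lxml.etree import XPath as _XPath

XPath = functools.partial(_XPath, namespaces={'re': 'http://exslt.org/regular-expressions'})

root = html.fromstring('<body><h2>Title</h2><p>text</p></body>')
# Matches any heading tag h1-h6, case-insensitively, via the EXSLT regexp extension.
headings = XPath('//*[re:match(name(), "h[1-6]", "i")]')(root)
print([h.tag for h in headings])   # ['h2']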
    def split(self, tree):
        '''
        Split ``tree`` into a *before* and *after* tree, preserving tag structure,
        but not duplicating any text. All tags that have had their text and tail
        removed have the attribute ``calibre_split`` set to 1.
        '''
        self.log_debug('\t\tSplitting...')
        root = tree.getroot()
        split_point, before = self.find_split_point(root)
        if split_point is None:
            if not self.always_remove:
                self.log_warn(_('\t\tToo much markup. Re-splitting without structure preservation. This may cause incorrect rendering.'))
            raise SplitError(self.path, root)
        tree2 = copy.deepcopy(tree)
        root2 = tree2.getroot()
        body, body2 = root.body, root2.body
        trees = []
        path = tree.getpath(split_point)
        root, root2 = tree.getroot(), tree2.getroot()
        body, body2 = root.xpath('//body')[0], root2.xpath('//body')[0]
        split_point2 = root2.xpath(path)[0]

        def nix_element(elem, top=True):
            if self.always_remove:
                parent = elem.getparent()
                index = parent.index(elem)
                if top:
                    parent.remove(elem)
                else:
                    index = parent.index(elem)
                    parent[index:index+1] = list(elem.iterchildren())
            else:
                elem.text = u''
                elem.tail = u''
                elem.set(SPLIT_ATTR, '1')
                if elem.tag.lower() in ['ul', 'ol', 'dl', 'table', 'hr', 'img']:
                    elem.set('style', 'display:none;')

        def fix_split_point(sp):
            sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid')

        # Tree 1
        hit_split_point = False
        for elem in body.iterdescendants():
        for elem in list(body.iterdescendants(etree.Element)):
            if elem.get(SPLIT_ATTR, '0') == '1':
                continue
            if elem is split_point:
                hit_split_point = True
                if before:
                    elem.text = u''
                    elem.tail = u''
                    elem.set('calibre_split', '1')
                    nix_element(elem)
                fix_split_point(elem)
                continue
            if hit_split_point:
                elem.text = u''
                elem.tail = u''
            elem.set('calibre_split', '1' if hit_split_point else '0')
                nix_element(elem)

        # Tree 2
        hit_split_point = False
        for elem in body2.iterdescendants():
        for elem in list(body2.iterdescendants(etree.Element)):
            if elem.get(SPLIT_ATTR, '0') == '1':
                continue
            if elem is split_point2:
                hit_split_point = True
                if not before:
                    elem.text = u''
                    elem.tail = u''
                    elem.set('calibre_split', '1')
                    nix_element(elem, top=False)
                fix_split_point(elem)
                continue
            if not hit_split_point:
                elem.text = u''
                elem.tail = u''
            elem.set('calibre_split', '0' if hit_split_point else '1')
                nix_element(elem, top=False)

        for t, r in [(tree, root), (tree2, root2)]:
            if len(tostring(r)) < opts.profile.flow_size:
            size = len(tostring(r))
            if size <= self.opts.profile.flow_size:
                trees.append(t)
                self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)', len(trees), size/1024.)
            else:
                new_split_point, before = find_split_point(t)
                if new_split_point is None:
                    raise SplitError(filepath)
                trees.extend(split_tree(t, new_split_point, before, opts, filepath))
                trees.extend(self.split(t))

        return trees
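The method above works by deep-copying the tree and then blanking element text on opposite sides of the chosen split point in each copy, recursing while a copy still serializes larger than the flow limit. A toy version of the duplicate-and-blank idea (not the calibre code; the markup is invented):

import copy
from lxml import etree

body = etree.fromstring('<body><p>one</p><p>two</p><p>three</p></body>')
split_point = body[1]            # split at the middle <p>
body2 = copy.deepcopy(body)
split_point2 = body2[1]

def blank(elem):
    elem.text, elem.tail = '', ''

# First copy keeps everything before the split point.
hit = False
for elem in body.iter('p'):
    if elem is split_point:
        hit = True
    if hit:
        blank(elem)

# Second copy keeps the split point and everything after it.
hit = False
for elem in body2.iter('p'):
    if elem is split_point2:
        hit = True
        continue
    if not hit:
        blank(elem)

parts = [etree.tostring(body), etree.tostring(body2)]
# parts[0] now renders only "one"; parts[1] renders "two" and "three".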
def find_split_point(tree):
    root = tree.getroot()
    css = root.xpath('//style[@type="text/css"]')
    if css:

        def pick_elem(elems):
            if elems:
                elems = [i for i in elems if elem.get('calibre_split', '0') != '1']
                if elems:
                    i = int(math.floor(len(elems)/2.))
                    return elems[i]

        def selector_element(rule):
            try:
                selector = CSSSelector(rule.selectorText)
                return pick_elem(selector(root))
            except:
                return None

        css = css[0].text
        from cssutils import CSSParser
        stylesheet = CSSParser().parseString(css)
    def find_page_breaks(self, stylesheet, root):
        '''
        Find all elements that have either page-break-before or page-break-after set.
        '''
        page_break_selectors = set([])
        for rule in stylesheet:
            if rule.type != rule.STYLE_RULE:
                continue
            before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower()
            if before and before != 'avoid':
                elem = selector_element(rule)
                if elem is not None:
                    return elem, True
            after = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower()
            try:
                if before and before != 'avoid':
                    page_break_selectors.add((CSSSelector(rule.selectorText), True))
            except:
                pass
            try:
                if after and after != 'avoid':
                    elem = selector_element(rule)
                    if elem is not None:
                        return elem, False
                    page_break_selectors.add((CSSSelector(rule.selectorText), False))
            except:
                pass

        for path in ('//*[re:match(name(), "h[1-6]", "i")', '/body/div', '//p'):
        page_breaks = set([])
        for selector, before in page_break_selectors:
            for elem in selector(root):
                elem.pb_before = before
                page_breaks.add(elem)

        for i, elem in enumerate(root.iter()):
            elem.pb_order = i

        page_breaks = list(page_breaks)
        page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
        tree = root.getroottree()
        self.page_breaks = [(XPath(tree.getpath(x)), x.pb_before) for x in page_breaks]
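find_page_breaks boils down to: parse the stylesheet with cssutils, keep the selectors of rules that force a page break, and later apply them to the document with lxml's CSSSelector. A self-contained sketch of that pattern (the stylesheet and markup here are invented):

from cssutils import CSSParser
from lxml import html
from lxml.cssselect import CSSSelector

raw_css = 'h1 { page-break-before: always } .note { page-break-after: avoid }'
root = html.fromstring('<body><h1>Chapter</h1><p class="note">text</p></body>')

selectors = []
for rule in CSSParser().parseString(raw_css):
    if rule.type != rule.STYLE_RULE:
        continue
    before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '')
    if before.strip().lower() not in ('', 'avoid'):
        selectors.append((CSSSelector(rule.selectorText), True))

page_breaks = [elem for sel, is_before in selectors for elem in sel(root)]
print([e.tag for e in page_breaks])   # ['h1']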
    def find_split_point(self, root):
        '''
        Find the tag at which to split the tree rooted at `root`.
        Search order is:
            * page breaks
            * Heading tags
            * <div> tags
            * <p> tags

        We try to split in the "middle" of the file (as defined by tag counts).
        '''
        def pick_elem(elems):
            if elems:
                elems = [i for i in elems if i.get(SPLIT_POINT_ATTR, '0') != '1'\
                         and i.get(SPLIT_ATTR, '0') != '1']
                if elems:
                    i = int(math.floor(len(elems)/2.))
                    elems[i].set(SPLIT_POINT_ATTR, '1')
                    return elems[i]

        page_breaks = []
        for x in self.page_breaks:
            pb = x[0](root)
            if pb:
                page_breaks.append(pb[0])

        elem = pick_elem(page_breaks)
        if elem is not None:
            i = page_breaks.index(elem)
            return elem, self.page_breaks[i][1]

        for path in ('//*[re:match(name(), "h[1-6]", "i")]', '/html/body/div', '//p'):
            elems = root.xpath(path)
            elem = pick_elem(elems)
            if elem is not None:
@@ -116,67 +220,134 @@ def find_split_point(tree):

    return None, True

def do_split(path, opts):
    tree = parse(path, parser=PARSER)
    split_point, before = find_split_point(tree)
    if split_point is None:
        raise SplitError(path)
    trees = split_tree(tree, split_point, before, opts, path)
    base = os.path.splitext(os.path.basename(path))[0] + '_split_%d.html'
    anchor_map = {None:base%0}
    files = []
    for i, tree in enumerate(trees):
    def commit(self):
        '''
        Commit all changes caused by the split. This removes the previously
        introduced ``calibre_split`` attribute and calculates an *anchor_map* for
        all anchors in the original tree. Internal links are re-directed. The
        original file is deleted and the split files are saved.
        '''
        self.anchor_map = {None:self.base%0}
        self.files = []

        for i, tree in enumerate(self.trees):
            root = tree.getroot()
            files.append(base%i)
            for elem in root.xpath('//*[@id and @calibre_split = "1"]'):
                anchor_map[elem.get('id')] = files[-1]
                elem.attrib.pop('calibre_split')
            for elem in root.xpath('//*[@calibre_split]'):
                elem.attrib.pop('calibre_split')
            open(os.path.join(os.path.dirname(path), files[-1]), 'wb').write(tostring(root, pretty_print=opts.pretty_print))
    os.remove(path)
    return path, files, anchor_map
            self.files.append(self.base%i)
            for elem in root.xpath('//*[@id]'):
                if elem.get(SPLIT_ATTR, '0') == '0':
                    self.anchor_map[elem.get('id')] = self.files[-1]
            for elem in root.xpath('//*[@%s or @%s]'%(SPLIT_ATTR, SPLIT_POINT_ATTR)):
                elem.attrib.pop(SPLIT_ATTR, None)
                elem.attrib.pop(SPLIT_POINT_ATTR, '0')

def fix_opf(opf, orig_file, files, anchor_map):
    orig = None
    for item in opf.manifest:
        if os.path.samefile(orig_file, item.path):
            orig = item
            break
    opf.manifest.remove(orig)
    ids = []
    for f in files:
        ids.append(opf.manifest.add_item(f))
    index = None
    for i, item in enumerate(opf.spine):
        if item.id == orig.id:
            index = i
            break
        for current, tree in zip(self.files, self.trees):
            for a in tree.getroot().xpath('//a[@href]'):
                href = a.get('href').strip()
                if href.startswith('#'):
                    anchor = href[1:]
                    file = self.anchor_map[anchor]
                    if file != current:
                        a.set('href', file+href)
            open(content(current), 'wb').\
                write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print))

        os.remove(content(self.path))
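commit() builds anchor_map so that every id ends up pointing at the part that actually contains it, and internal '#...' links are then rewritten against that map. A toy illustration of the redirect step (file names are invented):

anchor_map = {None: 'chapter_split_0.html', 'sec2': 'chapter_split_1.html'}

def redirect(href, current_file):
    # Rewrite an internal link so it targets the split part that now holds the anchor.
    if not href.startswith('#'):
        return href
    target = anchor_map.get(href[1:], anchor_map[None])
    return href if target == current_file else target + href

print(redirect('#sec2', 'chapter_split_0.html'))   # chapter_split_1.html#sec2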
    def fix_opf(self, opf):
        '''
        Fix references to the split file in the OPF.
        '''
        items = [item for item in opf.itermanifest() if item.get('href') == 'content/'+self.path]
        new_items = [('content/'+f, None) for f in self.files]
        id_map = {}
        for item in items:
            id_map[item.get('id')] = opf.replace_manifest_item(item, new_items)

        for id in id_map.keys():
            opf.replace_spine_items_by_idref(id, id_map[id])

        for ref in opf.iterguide():
            href = ref.get('href', '')
            if href.startswith('content/'+self.path):
                href = href.split('#')
                frag = None
                if len(href) > 1:
                    frag = href[1]
                new_file = self.anchor_map[frag]
                ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag)))
def fix_content_links(html_files, changes, opts):
    split_files = [f.path for f in changes]
    anchor_maps = [f.anchor_map for f in changes]
    files = list(html_files)
    for j, f in enumerate(split_files):
        try:
            i = files.index(f)
            files[i:i+1] = changes[j].files
        except ValueError:
            continue

    for htmlfile in files:
        changed = False
        root = html.fromstring(open(content(htmlfile), 'rb').read())
        for a in root.xpath('//a[@href]'):
            href = a.get('href')
            if not href.startswith('#'):
                href = href.split('#')
                anchor = href[1] if len(href) > 1 else None
                href = href[0]
                if href in split_files:
                    newf = anchor_maps[split_files.index(href)][anchor]
                    frag = ('#'+anchor) if anchor else ''
                    a.set('href', newf+frag)
                    changed = True

        if changed:
            open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print))
def fix_ncx(path, changes):
    split_files = [f.path for f in changes]
    anchor_maps = [f.anchor_map for f in changes]
    tree = etree.parse(path)
    changed = False
    for content in tree.getroot().xpath('//x:content[@src]', namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}):
        href = content.get('src')
        if not href.startswith('#'):
            href = href.split('#')
            anchor = href[1] if len(href) > 1 else None
            href = href[0].split('/')[-1]
            if href in split_files:
                newf = anchor_maps[split_files.index(href)][anchor]
                frag = ('#'+anchor) if anchor else ''
                content.set('src', 'content/'+newf+frag)
                changed = True
    if changed:
        open(path, 'wb').write(etree.tostring(tree.getroot(), encoding='UTF-8', xml_declaration=True))
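fix_ncx depends on one detail that is easy to miss: NCX <content> elements live in the DAISY namespace, so the lookup needs an explicit prefix mapping. A minimal sketch of that namespaced rewrite (the NCX snippet and target file name are invented):

from lxml import etree

NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/'
ncx = etree.fromstring(
    '<ncx xmlns="%s"><navMap><navPoint><content src="a.html#sec2"/></navPoint></navMap></ncx>' % NCX_NS)

for content in ncx.xpath('//x:content[@src]', namespaces={'x': NCX_NS}):
    src, frag = (content.get('src').split('#') + [None])[:2]
    if src == 'a.html':   # pretend a.html was split and sec2 landed in part 1
        content.set('src', 'content/a_split_1.html' + ('#' + frag if frag else ''))

print(etree.tostring(ncx))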
def split(pathtoopf, opts):
    return
    pathtoopf = os.path.abspath(pathtoopf)
    with CurrentDir(os.path.dirname(pathtoopf)):
        opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
        html_files = []
        for item in opf.manifest:
            if 'html' in item.mime_type.lower():
                html_files.append(item.path)
        for item in opf.itermanifest():
            if 'html' in item.get('media-type', '').lower():
                html_files.append(unquote(item.get('href')).split('/')[-1])
        changes = []
        for f in html_files:
            if os.stat(f).st_size > opts.profile.flow_size:
                fix_opf(opf, *do_split(f, opts))
        if changes:
            pass
            if os.stat(content(f)).st_size > opts.profile.flow_size:
                try:
                    changes.append(Splitter(f, opts))
                except SplitError:
                    changes.append(Splitter(f, opts, always_remove=True))
                changes[-1].fix_opf(opf)

        open(pathtoopf, 'wb').write(opf.render())
        fix_content_links(html_files, changes, opts)


def main(args=sys.argv):
    return 0

if __name__ == '__main__':
    sys.exit(main())
        for item in opf.itermanifest():
            if item.get('media-type', '') == 'application/x-dtbncx+xml':
                fix_ncx(item.get('href'), changes)
                break
@@ -228,8 +228,14 @@ def opf_traverse(opf_reader, verbose=0, encoding=None):
        raise ValueError('OPF does not have a spine')
    flat = []
    for path in opf_reader.spine.items():
        path = os.path.abspath(path)
        if path not in flat:
            flat.append(os.path.abspath(path))
    for item in opf_reader.manifest:
        if 'html' in item.mime_type:
            path = os.path.abspath(item.path)
            if path not in flat:
                flat.append(path)
    flat = [HTMLFile(path, 0, encoding, verbose) for path in flat]
    return flat
@@ -329,14 +335,15 @@ class Parser(PreProcessor, LoggingInterface):
            if self.root.get(bad, None) is not None:
                self.root.attrib.pop(bad)


    def save_path(self):
        return os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path])

    def save(self):
        '''
        Save processed HTML into the content directory.
        Should be called after all HTML processing is finished.
        '''
        with open(os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]), 'wb') as f:
        with open(self.save_path(), 'wb') as f:
            ans = tostring(self.root, pretty_print=self.opts.pretty_print)
            ans = re.compile(r'<html>', re.IGNORECASE).sub('<html xmlns="http://www.w3.org/1999/xhtml">', ans)
            ans = re.compile(r'<head[^<>]*?>', re.IGNORECASE).sub('<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n', ans)
@@ -390,21 +397,26 @@ class Parser(PreProcessor, LoggingInterface):
        if not isinstance(olink, unicode):
            olink = olink.decode(self.htmlfile.encoding)
        link = self.htmlfile.resolve(olink)
        frag = (('#'+link.fragment) if link.fragment else '')
        if link.path == self.htmlfile.path:
            return frag if frag else '#'
        if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
            return olink
        if link.path in self.htmlfiles:
            return self.htmlfile_map[link.path]
            return self.htmlfile_map[link.path] + frag
        if re.match(r'\.(x){0,1}htm(l){0,1}', os.path.splitext(link.path)[1]) is not None:
            return olink # This happens when --max-levels is used
        if link.path in self.resource_map.keys():
            return self.resource_map[link.path]
            return self.resource_map[link.path] + frag
        name = os.path.basename(link.path)
        name, ext = os.path.splitext(name)
        name += ('_%d'%len(self.resource_map)) + ext
        shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
        name = 'resources/' + name
        self.resource_map[link.path] = name
        return name
        return name + frag


class Processor(Parser):
    '''
@@ -438,9 +450,12 @@ class Processor(Parser):

    def save(self):
        head = self.head if self.head is not None else self.body
        style = etree.SubElement(head, 'style', attrib={'type':'text/css'})
        style.text='\n'+self.css
        style_path = os.path.basename(self.save_path())+'.css'
        style = etree.SubElement(head, 'link', attrib={'type':'text/css', 'rel':'stylesheet',
                                                       'href':'resources/'+style_path})
        style.tail = '\n\n'
        style_path = os.path.join(os.path.dirname(self.save_path()), 'resources', style_path)
        open(style_path, 'wb').write(self.css.encode('utf-8'))
        return Parser.save(self)

    def populate_toc(self, toc):
@@ -530,6 +545,8 @@ class Processor(Parser):
                css.append('\n'.join(style.xpath('./text()')))
                style.getparent().remove(style)

        cache = {}
        class_counter = 0
        for font in self.root.xpath('//font'):
            try:
                size = int(font.attrib.pop('size', '3'))
@@ -542,16 +559,33 @@ class Processor(Parser):
            color = font.attrib.pop('color', None)
            if color is not None:
                setting += 'color:%s'%color
            id = get_id(font, counter)
            counter += 1
            css.append('#%s { %s }'%(id, setting))
            classname = cache.get(setting, None)
            if classname is None:
                classname = 'calibre_class_%d'%class_counter
                class_counter += 1
                cache[setting] = classname
            cn = font.get('class', '')
            if cn: cn += ' '
            cn += classname
            font.set('class', cn)

        for elem in self.root.xpath('//*[@style]'):
            id = get_id(elem, counter)
            counter += 1
            css.append('#%s {%s}'%(id, elem.get('style')))
            setting = elem.get('style')
            classname = cache.get(setting, None)
            if classname is None:
                classname = 'calibre_class_%d'%class_counter
                class_counter += 1
                cache[setting] = classname
            cn = elem.get('class', '')
            if cn: cn += ' '
            cn += classname
            elem.set('class', cn)
            elem.attrib.pop('style')

        for setting, cn in cache.items():
            css.append('.%s {%s}'%(cn, setting))


        self.raw_css = '\n\n'.join(css)
        self.css = unicode(self.raw_css)
        if self.opts.override_css:
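The change above replaces the old one-#id-rule-per-element scheme with a cache that collapses identical inline style strings into shared classes. A self-contained sketch of that pattern (markup invented):

from lxml import html

root = html.fromstring('<div><p style="color:red">a</p><p style="color:red">b</p></div>')
cache, css = {}, []

for elem in root.xpath('//*[@style]'):
    setting = elem.attrib.pop('style')
    # Identical style strings share one generated class name.
    classname = cache.setdefault(setting, 'calibre_class_%d' % len(cache))
    elem.set('class', (elem.get('class', '') + ' ' + classname).strip())

for setting, cn in cache.items():
    css.append('.%s {%s}' % (cn, setting))

print('\n'.join(css))   # one rule, shared by both paragraphs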
@@ -688,6 +722,9 @@ def create_metadata(basepath, mi, filelist, resources):
    '''
    mi = OPFCreator(basepath, mi)
    entries = [('content/'+f, 'application/xhtml+xml') for f in filelist] + [(f, None) for f in resources]
    for f in filelist:
        if os.path.exists(os.path.join(basepath, 'content', 'resources', f+'.css')):
            entries.append(('content/resources/'+f+'.css', 'text/css'))
    mi.create_manifest(entries)
    mi.create_spine(['content/'+f for f in filelist])
    return mi
@@ -143,7 +143,8 @@ class ResourceCollection(object):
        self._resources.remove(resource)

    def replace(self, start, end, items):
        pass
        'Same as list[start:end] = items'
        self._resources[start:end] = items

    @staticmethod
    def from_directory_contents(top, topdown=True):
@@ -156,6 +156,19 @@ class Spine(ResourceCollection):
            self.manifest = manifest


    def replace(self, start, end, ids):
        '''
        Replace the items between start (inclusive) and end (not inclusive) with
        the items identified by ids. ids can be a list of any length.
        '''
        items = []
        for id in ids:
            path = self.manifest.path_for_id(id)
            if path is None:
                raise ValueError('id %s not in manifest')
            items.append(Spine.Item(lambda x: id, path, is_path=True))
        ResourceCollection.replace(start, end, items)

    def linear_items(self):
        for r in self:
            if r.is_linear:
@@ -297,6 +310,55 @@ class OPF(object):
    def get_text(self, elem):
        return u''.join(self.TEXT(elem))

    def itermanifest(self):
        return self.manifest_path(self.tree)

    def create_manifest_item(self, href, media_type):
        ids = [i.get('id', None) for i in self.itermanifest()]
        id = None
        for c in xrange(1, sys.maxint):
            id = 'id%d'%c
            if id not in ids:
                break
        if not media_type:
            media_type = 'application/xhtml+xml'
        ans = etree.Element('{%s}item'%self.NAMESPACES['opf'],
                            attrib={'id':id, 'href':href, 'media-type':media_type})
        ans.tail = '\n\t\t'
        return ans

    def replace_manifest_item(self, item, items):
        items = [self.create_manifest_item(*i) for i in items]
        for i, item2 in enumerate(items):
            item2.set('id', item.get('id')+'.%d'%(i+1))
        manifest = item.getparent()
        index = manifest.index(item)
        manifest[index:index+1] = items
        return [i.get('id') for i in items]

    def iterspine(self):
        return self.spine_path(self.tree)

    def create_spine_item(self, idref):
        ans = etree.Element('{%s}itemref'%self.NAMESPACES['opf'], idref=idref)
        ans.tail = '\n\t\t'
        return ans

    def replace_spine_items_by_idref(self, idref, new_idrefs):
        items = list(map(self.create_spine_item, new_idrefs))
        spine = self.XPath('/opf:package/*[re:match(name(), "spine", "i")]')(self.tree)[0]
        old = [i for i in self.iterspine() if i.get('idref', None) == idref]
        for x in old:
            i = spine.index(x)
            spine[i:i+1] = items

    def iterguide(self):
        return self.guide_path(self.tree)

    def render(self):
        return etree.tostring(self.tree, encoding='UTF-8', xml_declaration=True,
                              pretty_print=True)

    @apply
    def authors():
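replace_manifest_item swaps one manifest <item> for several, deriving the new ids from the old one (id.1, id.2, ...) so the spine can then be patched by idref. A hedged lxml illustration of that surgery (the OPF fragment and file names are invented):

from lxml import etree

OPF_NS = 'http://www.idpf.org/2007/opf'
manifest = etree.fromstring(
    '<manifest xmlns="%s"><item id="id1" href="content/a.html" media-type="application/xhtml+xml"/></manifest>' % OPF_NS)

old = manifest[0]
new_items = []
for i, href in enumerate(['content/a_split_0.html', 'content/a_split_1.html']):
    item = etree.Element('{%s}item' % OPF_NS,
                         attrib={'id': '%s.%d' % (old.get('id'), i + 1),
                                 'href': href,
                                 'media-type': 'application/xhtml+xml'})
    new_items.append(item)

index = manifest.index(old)
manifest[index:index + 1] = new_items   # the old item is replaced in place
print(etree.tostring(manifest, pretty_print=True))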
@@ -24,6 +24,8 @@ class TOC(list):
                 base_path=os.getcwd()):
        self.href = href
        self.fragment = fragment
        if not self.fragment:
            self.fragment = None
        self.text = text
        self.parent = parent
        self.base_path = base_path
@@ -153,7 +155,19 @@ class TOC(list):
                continue
            purl = urlparse(unquote(a['href']))
            href, fragment = purl[2], purl[5]
            if not fragment:
                fragment = None
            else:
                fragment = fragment.strip()
            href = href.strip()

            txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
            add = True
            for i in self.flat():
                if i.href == href and i.fragment == fragment:
                    add = False
                    break
            if add:
                self.add_item(href, fragment, txt)

    def render(self, stream, uid):
@@ -3,7 +3,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Miscellaneous widgets used in the GUI
'''
import re, os
import re, os, traceback
from PyQt4.QtGui import QListView, QIcon, QFont, QLabel, QListWidget, \
                        QListWidgetItem, QTextCharFormat, QApplication, \
                        QSyntaxHighlighter, QCursor, QColor, QWidget, QDialog, \
@@ -254,7 +254,12 @@ class FontFamilyModel(QAbstractListModel):

    def __init__(self, *args):
        QAbstractListModel.__init__(self, *args)
        try:
            self.families = find_font_families()
        except:
            self.families = []
            print 'WARNING: Could not load fonts'
            traceback.print_exc()
        self.families.sort()
        self.families[:0] = ['None']
@@ -278,7 +278,7 @@ def download_tarball():

def main(args=sys.argv):
    defdir = '/opt/calibre'
    destdir = raw_input('Enter the installation directory for calibre [%s]: '%defdir).strip()
    destdir = raw_input('Enter the installation directory for calibre (Its contents will be deleted!)[%s]: '%defdir).strip()
    if not destdir:
        destdir = defdir
    if os.path.exists(destdir):
@@ -147,6 +147,7 @@ def cli_docs(app):
        info(bold('creating docs for %s...'%cmd))
        open(os.path.join('cli', cmd+'.rst'), 'wb').write(raw)


def auto_member(dirname, arguments, options, content, lineno,
                content_offset, block_text, state, state_machine):
    name = arguments[0]

@@ -196,8 +197,7 @@ def auto_member(dirname, arguments, options, content, lineno,
    node = nodes.paragraph()
    state.nested_parse(result, content_offset, node)

    return node

    return list(node)
@@ -134,6 +134,7 @@ There can be several causes for this:
* **Any windows version**: If this happens during an initial run of calibre, try deleting the folder you chose for your ebooks and restarting calibre.
* **Windows Vista**: If the folder :file:`C:\\Users\\Your User Name\\AppData\\Local\\VirtualStore\\Program Files\\calibre` exists, delete it. Uninstall |app|. Reboot. Re-install.
* **Any windows version**: Search your computer for a folder named :file:`_ipython`. Delete it and try again.
* **Any windows version**: Try disabling any antivirus program you have running and see if that fixes it. Also try disabling any firewall software that prevents connections to the local computer.

If it still won't launch, start a command prompt (press the windows key and R; then type :command:`cmd.exe` in the Run dialog that appears). At the command prompt type the following command and press Enter::
@@ -57,19 +57,21 @@ def PersistentTemporaryDirectory(suffix='', prefix='', dir=None):
    atexit.register(shutil.rmtree, tdir, True)
    return tdir

class TemporaryDirectory(str):
class TemporaryDirectory(object):
    '''
    A temporary directory to be used in a with statement.
    '''
    def __init__(self, suffix='', prefix='', dir=None):
    def __init__(self, suffix='', prefix='', dir=None, keep=False):
        self.suffix = suffix
        self.prefix = prefix
        self.dir = dir
        self.keep = keep

    def __enter__(self):
        self.tdir = tempfile.mkdtemp(self.suffix, __appname__+"_"+ __version__+"_" +self.prefix, self.dir)
        return self.tdir

    def __exit__(self, *args):
        if not self.keep:
            shutil.rmtree(self.tdir)
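A usage note for the new keep flag: when keep is True the directory survives the with block, which is what --keep-intermediate-files relies on. A short sketch (the calibre.ptempfile module path is an assumption on my part):

from calibre.ptempfile import TemporaryDirectory   # module path is an assumption

with TemporaryDirectory('_demo', keep=True) as tdir:
    intermediate = tdir            # do work inside tdir here
# Because keep=True, __exit__ skips shutil.rmtree, so the directory survives for inspection.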
@@ -53,6 +53,9 @@ def import_from_launchpad(url):
            open(out, 'wb').write(tf.extractfile(next).read())
        next = tf.next()
    check_for_critical_bugs()
    path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
    print path
    subprocess.check_call('python setup.py translations'.split(), dir=path)
    return 0

def check_for_critical_bugs():