From 35c8db2dd7c4d1f819e3f66df786a7460c97ee7a Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 21 Sep 2008 22:47:43 -0700
Subject: [PATCH] Support for splitting HTML files to respect maximum flow size
 limit for EPUB on the SONY Reader.

---
 src/calibre/__init__.py                 |  13 +
 src/calibre/ebooks/epub/__init__.py     |   5 +-
 src/calibre/ebooks/epub/from_html.py    |   6 +-
 src/calibre/ebooks/epub/split.py        | 471 ++++++++++++++++--------
 src/calibre/ebooks/html.py              |  65 +++-
 src/calibre/ebooks/metadata/__init__.py |   5 +-
 src/calibre/ebooks/metadata/opf2.py     |  62 ++++
 src/calibre/ebooks/metadata/toc.py      |  16 +-
 src/calibre/gui2/widgets.py             |   9 +-
 src/calibre/linux_installer.py          |   2 +-
 src/calibre/manual/custom.py            |  44 +--
 src/calibre/manual/faq.rst              |   1 +
 src/calibre/ptempfile.py                |  12 +-
 src/calibre/translations/__init__.py    |   3 +
 14 files changed, 515 insertions(+), 199 deletions(-)

diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index 1a70a6969c..c41e33ad50 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -170,6 +170,19 @@ def fit_image(width, height, pwidth, pheight):
 
     return scaled, int(width), int(height)
 
+class CurrentDir(object):
+    'Context manager: chdir to ``path`` on entry, restore the previous cwd on exit.'
+    def __init__(self, path):
+        self.path = path
+        self.cwd = None  # directory in effect before __enter__; None until entered
+        
+    def __enter__(self, *args):
+        self.cwd = os.getcwd()
+        os.chdir(self.path)
+        return self.cwd  # the ``as`` target is the *previous* directory
+    
+    def __exit__(self, *args):
+        os.chdir(self.cwd)  # returns None, so exceptions propagate
 
 def sanitize_file_name(name):
     '''
diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py
index 7c9d1197a9..d33582ac8b 100644
--- a/src/calibre/ebooks/epub/__init__.py
+++ b/src/calibre/ebooks/epub/__init__.py
@@ -105,5 +105,8 @@ to auto-generate a Table of Contents.
               help=_('Print generated OPF file to stdout'))
     c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug',
               help=_('Print generated NCX file to stdout'))
-    
+    c.add_opt('keep_intermediate', ['--keep-intermediate-files'], group='debug', default=False,
+              help=_('Keep intermediate files during processing by html2epub'))
+    c.add_opt('extract_to', ['--extract-to'], group='debug', default=None,
+              help=_('Extract the contents of the produced EPUB file to the specified directory.'))
     return c
\ No newline at end of file
diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py
index a94a68c76b..12bec12734 100644
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@@ -97,7 +97,9 @@ def convert(htmlfile, opts, notification=None):
     opts.chapter = XPath(opts.chapter, 
                     namespaces={'re':'http://exslt.org/regular-expressions'})
     
-    with TemporaryDirectory('_html2epub') as tdir:
+    with TemporaryDirectory(suffix='_html2epub', keep=opts.keep_intermediate) as tdir:
+        if opts.keep_intermediate:
+            print 'Intermediate files in', tdir
         resource_map, htmlfile_map, generated_toc = parse_content(filelist, opts, tdir)
         resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()]
         
@@ -159,6 +161,8 @@ def convert(htmlfile, opts, notification=None):
         epub = initialize_container(opts.output)
         epub.add_dir(tdir)
         print 'Output written to', opts.output
+        if opts.extract_to is not None:
+            epub.extractall(opts.extract_to)
         
             
 def main(args=sys.argv):
diff --git a/src/calibre/ebooks/epub/split.py b/src/calibre/ebooks/epub/split.py
index 135ae626b9..c567080c8d 100644
--- a/src/calibre/ebooks/epub/split.py
+++ b/src/calibre/ebooks/epub/split.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env  python
+from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
@@ -7,176 +7,347 @@ __docformat__ = 'restructuredtext en'
 Split the flows in an epub file to conform to size limitations.
 '''
 
-import sys, os, math, copy
+import os, math, copy, logging, functools
+from urllib import unquote
 
-from lxml.etree import parse, XMLParser
+from lxml.etree import XPath as _XPath
+from lxml import etree, html
 from lxml.cssselect import CSSSelector
+from cssutils import CSSParser
 
 from calibre.ebooks.metadata.opf2 import OPF
 from calibre.ebooks.epub import tostring
+from calibre import CurrentDir, LoggingInterface
 
-PARSER = XMLParser(recover=True)
+XPath = functools.partial(_XPath, namespaces={'re':'http://exslt.org/regular-expressions'})
+content = functools.partial(os.path.join, 'content')
+
+SPLIT_ATTR       = 'cs'
+SPLIT_POINT_ATTR = 'csp'
 
 class SplitError(ValueError):
     
-    def __init__(self, path):
-        ValueError.__init__(self, _('Could not find reasonable point at which to split: ')+os.path.basename(path))
+    def __init__(self, path, root):
+        size = len(tostring(root))/1024.
+        ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')% 
+                            (os.path.basename(path), size))
 
-def split_tree(tree, split_point, before, opts, filepath):
-    trees = set([])
-    tree2 = copy.deepcopy(tree)
-    path = tree.getpath(split_point)
-    root, root2 = tree.getroot(), tree2.getroot()
-    body, body2 = root.xpath('//body')[0], root2.xpath('//body')[0]
-    split_point2 = root2.xpath(path)[0]
     
-    # Tree 1
-    hit_split_point = False
-    for elem in body.iterdescendants():
-        if elem is split_point:
-            hit_split_point = True
-            if before:
-                elem.text = u''
-                elem.tail = u''
-                elem.set('calibre_split', '1')
-            continue
-        if hit_split_point:
-            elem.text = u''
-            elem.tail = u''
-        elem.set('calibre_split', '1' if hit_split_point else '0')
-        
-    # Tree 2
-    hit_split_point = False
-    for elem in body2.iterdescendants():
-        if elem is split_point2:
-            hit_split_point = True
-            if not before:
-                elem.text = u''
-                elem.tail = u''
-                elem.set('calibre_split', '1')
-            continue
-        if not hit_split_point:
-            elem.text = u''
-            elem.tail = u''
-        elem.set('calibre_split', '0' if hit_split_point else '1')
+
+class Splitter(LoggingInterface):
     
-    for t, r in [(tree, root), (tree2, root2)]:
-        if len(tostring(r)) < opts.profile.flow_size:
-            trees.append(t)
-        else:
-            new_split_point, before = find_split_point(t)
-            if new_split_point is None:
-                raise SplitError(filepath)
-            trees.extend(split_tree(t, new_split_point, before, opts, filepath))
+    def __init__(self, path, opts, always_remove=False):
+        LoggingInterface.__init__(self, logging.getLogger('htmlsplit'))
+        self.setup_cli_handler(opts.verbose)
+        self.path = path
+        self.always_remove = always_remove
+        self.base = os.path.splitext(path)[0] + '_split_%d.html'
+        self.opts = opts
+        self.log_info('\tSplitting %s (%d KB)', path, os.stat(content(path)).st_size/1024.)
+        root = html.fromstring(open(content(path)).read())
             
-    return trees
-    
+        css = XPath('//link[@type = "text/css" and @rel = "stylesheet"]')(root)
+        if css:
+            cssp = os.path.join('content', *(css[0].get('href').split('/')))
+            self.log_debug('\t\tParsing stylesheet...') 
+            stylesheet = CSSParser().parseString(open(cssp, 'rb').read())
+        else:
+            stylesheet = None
+        self.page_breaks = []
+        if stylesheet is not None:
+            self.find_page_breaks(stylesheet, root)
+        
+        self.trees = self.split(root.getroottree())
+        self.commit()
+        self.log_info('\t\tSplit into %d parts.', len(self.trees))
+        if self.opts.verbose:
+            for f in self.files:
+                self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
+        self.trees = None
+        
+    def split(self, tree):
+        '''
+        Split ``tree`` in two at the element chosen by ``find_split_point`` and
+        recurse on any half larger than ``opts.profile.flow_size``; returns the list
+        of trees. Tags emptied in place get ``SPLIT_ATTR``. Raises ``SplitError``.
+        '''
+        self.log_debug('\t\tSplitting...')
+        root = tree.getroot()
+        split_point, before = self.find_split_point(root)
+        if split_point is None:
+            if not self.always_remove:
+                self.log_warn(_('\t\tToo much markup. Re-splitting without structure preservation. This may cause incorrect rendering.'))
+            raise SplitError(self.path, root)
+        tree2 = copy.deepcopy(tree)
+        root2 = tree2.getroot()
+        body, body2 = root.body, root2.body
+        trees = []
+        path = tree.getpath(split_point)
+        split_point2 = root2.xpath(path)[0]
+
+        def nix_element(elem, top=True):
+            if self.always_remove:
+                parent = elem.getparent()
+                # top=True drops the whole element; otherwise its children are kept
+                if top:
+                    parent.remove(elem)
+                else:
+                    index = parent.index(elem)
+                    parent[index:index+1] = list(elem.iterchildren())
+
+            else:
+                elem.text = u''
+                elem.tail = u''
+                elem.set(SPLIT_ATTR, '1')
+                if elem.tag.lower() in ['ul', 'ol', 'dl', 'table', 'hr', 'img']:
+                    elem.set('style', 'display:none;')
+
+        def fix_split_point(sp):
+            sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid')
+
+        # Tree 1: nix everything after the split point (and the point itself if before)
+        hit_split_point = False
+        for elem in list(body.iterdescendants(etree.Element)):
+            if elem.get(SPLIT_ATTR, '0') == '1':
+                continue
+            if elem is split_point:
+                hit_split_point = True
+                if before:
+                    nix_element(elem)
+                fix_split_point(elem)
+                continue
+            if hit_split_point:
+                nix_element(elem)
+
+
+        # Tree 2: nix everything up to the split point (and the point itself if not before)
+        hit_split_point = False
+        for elem in list(body2.iterdescendants(etree.Element)):
+            if elem.get(SPLIT_ATTR, '0') == '1':
+                continue
+            if elem is split_point2:
+                hit_split_point = True
+                if not before:
+                    nix_element(elem, top=False)
+                fix_split_point(elem)
+                continue
+            if not hit_split_point:
+                nix_element(elem, top=False)
+
+        for t, r in [(tree, root), (tree2, root2)]:
+            size = len(tostring(r))
+            if size <= self.opts.profile.flow_size:
+                trees.append(t)
+                self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)', len(trees), size/1024.)
+            else:
+                trees.extend(self.split(t))
+
+        return trees
 
-def find_split_point(tree):
-    root = tree.getroot()
-    css = root.xpath('//style[@type="text/css"]')
-    if css:
-        
-        def pick_elem(elems):
-            if elems:
-                elems = [i for i in elems if elem.get('calibre_split', '0') != '1']
-                if elems:
-                    i = int(math.floor(len(elems)/2.))
-                    return elems[i]
-        
-        def selector_element(rule):
-            try:
-                selector = CSSSelector(rule.selectorText)
-                return pick_elem(selector(root))
-            except:
-                return None
-        
-        css = css[0].text
-        from cssutils import CSSParser
-        stylesheet = CSSParser().parseString(css)
+    def find_page_breaks(self, stylesheet, root):
+        '''
+        Find all elements that have either page-break-before or page-break-after set.
+        '''
+        page_break_selectors = set([])
         for rule in stylesheet:
             if rule.type != rule.STYLE_RULE:
                 continue
             before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower()
-            if before and before != 'avoid':
-                elem = selector_element(rule)
-                if elem is not None:
-                    return elem, True
             after  = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower()
-            if after and after != 'avoid':
-                elem = selector_element(rule)
-                if elem is not None:
-                    return elem, False
-                
-    for path in ('//*[re:match(name(), "h[1-6]", "i")', '/body/div', '//p'):
-        elems = root.xpath(path)
-        elem = pick_elem(elems)
-        if elem is not None:
-            return elem, True
-        
-    return None, True
-
-def do_split(path, opts):
-    tree = parse(path, parser=PARSER)
-    split_point, before = find_split_point(tree)
-    if split_point is None:
-        raise SplitError(path)
-    trees = split_tree(tree, split_point, before, opts, path)
-    base = os.path.splitext(os.path.basename(path))[0] + '_split_%d.html'
-    anchor_map = {None:base%0}
-    files = []
-    for i, tree in enumerate(trees):
-        root = tree.getroot()
-        files.append(base%i)
-        for elem in root.xpath('//*[@id and @calibre_split = "1"]'):
-            anchor_map[elem.get('id')] = files[-1]
-            elem.attrib.pop('calibre_split')
-        for elem in root.xpath('//*[@calibre_split]'):
-            elem.attrib.pop('calibre_split')
-        open(os.path.join(os.path.dirname(path), files[-1]), 'wb').write(tostring(root, pretty_print=opts.pretty_print))
-    os.remove(path)
-    return path, files, anchor_map
-
-def fix_opf(opf, orig_file, files, anchor_map):
-    orig = None
-    for item in opf.manifest:
-        if os.path.samefile(orig_file, item.path):
-            orig = item
-            break
-    opf.manifest.remove(orig)
-    ids = []
-    for f in files:
-        ids.append(opf.manifest.add_item(f))
-    index = None
-    for i, item in enumerate(opf.spine):
-        if item.id == orig.id:
-            index = i
-            break
-        
-    
+            try:
+                if before and before != 'avoid':
+                    page_break_selectors.add((CSSSelector(rule.selectorText), True))
+            except:
+                pass
+            try:
+                if after and after != 'avoid':
+                    page_break_selectors.add((CSSSelector(rule.selectorText), False))
+            except:
+                pass
             
- 
-def split(pathtoopf, opts):
-    return
-    pathtoopf = os.path.abspath(pathtoopf)
-    opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
-    html_files = []
-    for item in opf.manifest:
-        if 'html' in item.mime_type.lower():
-            html_files.append(item.path)
-    changes = []
-    for f in html_files:
-        if os.stat(f).st_size > opts.profile.flow_size:
-            fix_opf(opf, *do_split(f, opts))
-    if changes:
-        pass
-        
-             
+        page_breaks = set([])
+        for selector, before in page_break_selectors:
+            for elem in selector(root):
+                elem.pb_before = before
+                page_breaks.add(elem)
+                
+        for i, elem in enumerate(root.iter()):
+            elem.pb_order = i
+            
+        page_breaks = list(page_breaks)
+        page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
+        tree = root.getroottree()
+        self.page_breaks = [(XPath(tree.getpath(x)), x.pb_before) for x in page_breaks]
         
+    def find_split_point(self, root):
+        '''
+        Find the tag at which to split the tree rooted at `root`. Search order is:
+            * page breaks
+            * Heading tags
+            * <div> tags
+            * <p> tags
+
+        We try to split in the "middle" of the candidates (as defined by tag counts).
+        Returns ``(element, before)``, or ``(None, True)`` if no candidate is left.
+        '''
+        def pick_elem(elems):
+            if elems:
+                elems = [i for i in elems if i.get(SPLIT_POINT_ATTR, '0') != '1'\
+                          and i.get(SPLIT_ATTR, '0') != '1']
+                if elems:
+                    i = int(math.floor(len(elems)/2.))
+                    elems[i].set(SPLIT_POINT_ATTR, '1')  # never pick this element again
+                    return elems[i]
     
+        page_breaks = []
+        before_flags = []
+        for xpath, before in self.page_breaks:
+            pb = xpath(root)
+            if pb:
+                page_breaks.append(pb[0])
+                before_flags.append(before)
+
+        elem = pick_elem(page_breaks)
+        if elem is not None:
+            # page_breaks skips non-matching entries, so index its own flag list
+            return elem, before_flags[page_breaks.index(elem)]
+
+        for path in ('//*[re:match(name(), "h[1-6]", "i")]', '/html/body/div', '//p'):
+            elems = root.xpath(path)
+            elem = pick_elem(elems)
+            if elem is not None:
+                return elem, True
+
+        return None, True
+    
+    def commit(self):
+        '''
+        Commit all changes caused by the split. This removes the previously
+        introduced ``SPLIT_ATTR``/``SPLIT_POINT_ATTR`` markers and builds an
+        *anchor_map* (id -> split file) from the ids not marked as emptied.
+        Internal links are re-directed, the parts are saved and the original deleted.
+        '''
+        self.anchor_map = {None:self.base%0}
+        self.files = []
+
+        for i, tree in enumerate(self.trees):
+            root = tree.getroot()
+            self.files.append(self.base%i)
+            for elem in root.xpath('//*[@id]'):
+                if elem.get(SPLIT_ATTR, '0') == '0':
+                    self.anchor_map[elem.get('id')] = self.files[-1]
+            for elem in root.xpath('//*[@%s or @%s]'%(SPLIT_ATTR, SPLIT_POINT_ATTR)):
+                elem.attrib.pop(SPLIT_ATTR, None)
+                elem.attrib.pop(SPLIT_POINT_ATTR, '0')
+
+        for current, tree in zip(self.files, self.trees):
+            for a in tree.getroot().xpath('//a[@href]'):
+                href = a.get('href').strip()
+                if href.startswith('#'):
+                    # Unknown anchors (e.g. a bare "#") are left pointing at this part
+                    target = self.anchor_map.get(href[1:], current)
+                    if target != current:
+                        a.set('href', target+href)
+            with open(content(current), 'wb') as f:
+                f.write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print))
+
+        os.remove(content(self.path))
 
-def main(args=sys.argv):
-    return 0
 
-if __name__ == '__main__':
-    sys.exit(main())
\ No newline at end of file
+    def fix_opf(self, opf):
+        '''
+        Fix references to the split file in the OPF: its manifest item and spine
+        itemrefs are replaced by one entry per part and guide hrefs are re-pointed.
+        '''
+        items = [item for item in opf.itermanifest() if item.get('href') == 'content/'+self.path]
+        new_items = [('content/'+f, None) for f in self.files]
+        id_map = {}
+        for item in items:
+            id_map[item.get('id')] = opf.replace_manifest_item(item, new_items)
+
+        for old_id, new_ids in id_map.items():
+            opf.replace_spine_items_by_idref(old_id, new_ids)
+
+        for ref in opf.iterguide():
+            href = ref.get('href', '')
+            if href.startswith('content/'+self.path):
+                href = href.split('#')
+                frag = href[1] if len(href) > 1 else None
+                # A fragment that no part defines falls back to the first part
+                new_file = self.anchor_map.get(frag, self.anchor_map[None])
+                ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag)))
+
+          
+                
+def fix_content_links(html_files, changes, opts):
+    'Re-point links in every HTML file at the split part that now holds their target.'
+    anchor_maps = dict((f.path, f.anchor_map) for f in changes)
+    files = list(html_files)
+    for change in changes:
+        if change.path in files:
+            i = files.index(change.path)
+            files[i:i+1] = change.files
+
+    for htmlfile in files:
+        changed = False
+        with open(content(htmlfile), 'rb') as stream:
+            root = html.fromstring(stream.read())
+        for a in root.xpath('//a[@href]'):
+            href = a.get('href')
+            if not href.startswith('#'):
+                href = href.split('#')
+                anchor = href[1] if len(href) > 1 else None
+                amap = anchor_maps.get(href[0])
+                if amap is not None:
+                    newf = amap.get(anchor, amap[None])  # unknown fragment -> first part
+                    frag = ('#'+anchor) if anchor else ''
+                    a.set('href', newf+frag)
+                    changed = True
+
+        if changed:
+            with open(content(htmlfile), 'wb') as stream:
+                stream.write(tostring(root, pretty_print=opts.pretty_print))
+
+def fix_ncx(path, changes):
+    'Re-point NCX ``content`` entries at the split part that now holds their target.'
+    anchor_maps = dict((f.path, f.anchor_map) for f in changes)
+    tree = etree.parse(path)
+    changed = False
+    for node in tree.getroot().xpath('//x:content[@src]', namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}):
+        href = node.get('src')
+        if not href.startswith('#'):
+            href = href.split('#')
+            anchor = href[1] if len(href) > 1 else None
+            amap = anchor_maps.get(href[0].split('/')[-1])
+            if amap is not None:
+                newf = amap.get(anchor, amap[None])  # unknown fragment -> first part
+                frag = ('#'+anchor) if anchor else ''
+                node.set('src', 'content/'+newf+frag)
+                changed = True
+    if changed:
+        open(path, 'wb').write(etree.tostring(tree.getroot(), encoding='UTF-8', xml_declaration=True))
+       
+def split(pathtoopf, opts):
+    'Split every HTML flow larger than ``opts.profile.flow_size``, then fix OPF, links and NCX.'
+    pathtoopf = os.path.abspath(pathtoopf)
+    with CurrentDir(os.path.dirname(pathtoopf)):
+        opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
+        html_files = []
+        for item in opf.itermanifest():
+            if 'html' in item.get('media-type', '').lower():
+                html_files.append(unquote(item.get('href')).split('/')[-1])
+        changes = []
+        for f in html_files:
+            if os.stat(content(f)).st_size > opts.profile.flow_size:
+                try:
+                    changes.append(Splitter(f, opts))
+                except SplitError:
+                    # Retry, removing emptied tags instead of preserving structure
+                    changes.append(Splitter(f, opts, always_remove=True))
+                changes[-1].fix_opf(opf)
+        
+        open(pathtoopf, 'wb').write(opf.render())
+        fix_content_links(html_files, changes, opts)
+        for item in opf.itermanifest():
+            if item.get('media-type', '') == 'application/x-dtbncx+xml':
+                fix_ncx(item.get('href'), changes)
+                break 
diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py
index 7532d43cf8..c8ad45de8c 100644
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -228,8 +228,14 @@ def opf_traverse(opf_reader, verbose=0, encoding=None):
         raise ValueError('OPF does not have a spine')
     flat = []
     for path in opf_reader.spine.items():
+        path = os.path.abspath(path)
         if path not in flat:
             flat.append(os.path.abspath(path))
+    for item in opf_reader.manifest:
+        if 'html' in item.mime_type:
+            path = os.path.abspath(item.path)
+            if path not in flat:
+                flat.append(path)
     flat = [HTMLFile(path, 0, encoding, verbose) for path in flat]
     return flat
             
@@ -329,14 +335,15 @@ class Parser(PreProcessor, LoggingInterface):
             if self.root.get(bad, None) is not None:
                 self.root.attrib.pop(bad)
         
-        
-        
+    def save_path(self):    
+        return os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path])
+    
     def save(self):
         '''
         Save processed HTML into the content directory.
         Should be called after all HTML processing is finished.
         '''
-        with open(os.path.join(self.tdir, self.htmlfile_map[self.htmlfile.path]), 'wb') as f:
+        with open(self.save_path(), 'wb') as f:
             ans = tostring(self.root, pretty_print=self.opts.pretty_print)
             ans = re.compile(r'<html>', re.IGNORECASE).sub('<html xmlns="http://www.w3.org/1999/xhtml">', ans)
             ans = re.compile(r'<head[^<>]*?>', re.IGNORECASE).sub('<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n', ans)
@@ -390,21 +397,26 @@ class Parser(PreProcessor, LoggingInterface):
         if not isinstance(olink, unicode):
             olink = olink.decode(self.htmlfile.encoding)
         link = self.htmlfile.resolve(olink)
+        frag = (('#'+link.fragment) if link.fragment else '')
+        if link.path == self.htmlfile.path:
+            return frag if frag else '#'
         if not link.path or not os.path.exists(link.path) or not os.path.isfile(link.path):
             return olink
         if link.path in self.htmlfiles:
-            return self.htmlfile_map[link.path]
+            return self.htmlfile_map[link.path] + frag 
         if re.match(r'\.(x){0,1}htm(l){0,1}', os.path.splitext(link.path)[1]) is not None:
             return olink # This happens when --max-levels is used
         if link.path in self.resource_map.keys():
-            return self.resource_map[link.path]
+            return self.resource_map[link.path] + frag
         name = os.path.basename(link.path)
         name, ext = os.path.splitext(name)
         name += ('_%d'%len(self.resource_map)) + ext
         shutil.copyfile(link.path, os.path.join(self.resource_dir, name))
         name = 'resources/' + name
         self.resource_map[link.path] = name
-        return name
+        return name + frag
+    
+        
 
 class Processor(Parser):
     '''
@@ -438,9 +450,12 @@ class Processor(Parser):
         
     def save(self):
         head = self.head if self.head is not None else self.body
-        style = etree.SubElement(head, 'style', attrib={'type':'text/css'})
-        style.text='\n'+self.css
+        style_path = os.path.basename(self.save_path())+'.css'
+        style = etree.SubElement(head, 'link', attrib={'type':'text/css', 'rel':'stylesheet', 
+                                                       'href':'resources/'+style_path})
         style.tail = '\n\n'
+        style_path = os.path.join(os.path.dirname(self.save_path()), 'resources', style_path)
+        open(style_path, 'wb').write(self.css.encode('utf-8'))
         return Parser.save(self)
     
     def populate_toc(self, toc):
@@ -530,6 +545,8 @@ class Processor(Parser):
                 css.append('\n'.join(style.xpath('./text()')))
                 style.getparent().remove(style)
         
+        cache = {}
+        class_counter = 0
         for font in self.root.xpath('//font'):
             try:
                 size = int(font.attrib.pop('size', '3'))
@@ -542,15 +559,32 @@ class Processor(Parser):
             color = font.attrib.pop('color', None)
             if color is not None:
                 setting += 'color:%s'%color
-            id = get_id(font, counter)
-            counter += 1
-            css.append('#%s { %s }'%(id, setting))
+            classname = cache.get(setting, None)
+            if classname is None:
+                classname = 'calibre_class_%d'%class_counter
+                class_counter += 1
+                cache[setting] = classname
+            cn = font.get('class', '')
+            if cn: cn += ' '
+            cn += classname
+            font.set('class', cn)
             
         for elem in self.root.xpath('//*[@style]'):
-            id = get_id(elem, counter)
-            counter += 1
-            css.append('#%s {%s}'%(id, elem.get('style')))
+            setting = elem.get('style')
+            classname = cache.get(setting, None)
+            if classname is None:
+                classname = 'calibre_class_%d'%class_counter
+                class_counter += 1
+                cache[setting] = classname
+            cn = elem.get('class', '')
+            if cn: cn += ' '
+            cn += classname
+            elem.set('class', cn)
             elem.attrib.pop('style')
+        
+        for setting, cn in cache.items():
+            css.append('.%s {%s}'%(cn, setting))
+        
             
         self.raw_css = '\n\n'.join(css)
         self.css = unicode(self.raw_css)
@@ -688,6 +722,9 @@ def create_metadata(basepath, mi, filelist, resources):
     '''
     mi = OPFCreator(basepath, mi)
     entries = [('content/'+f, 'application/xhtml+xml') for f in filelist] + [(f, None) for f in resources]
+    for f in filelist:
+        if os.path.exists(os.path.join(basepath, 'content', 'resources', f+'.css')):
+            entries.append(('content/resources/'+f+'.css', 'text/css'))
     mi.create_manifest(entries)
     mi.create_spine(['content/'+f for f in filelist])
     return mi
diff --git a/src/calibre/ebooks/metadata/__init__.py b/src/calibre/ebooks/metadata/__init__.py
index 4460fea4a0..52c7319f42 100644
--- a/src/calibre/ebooks/metadata/__init__.py
+++ b/src/calibre/ebooks/metadata/__init__.py
@@ -141,9 +141,10 @@ class ResourceCollection(object):
         
     def remove(self, resource):
         self._resources.remove(resource)
-        
+    
     def replace(self, start, end, items):
-        pass
+        'Same as list[start:end] = items'
+        self._resources[start:end] = items
         
     @staticmethod
     def from_directory_contents(top, topdown=True):
diff --git a/src/calibre/ebooks/metadata/opf2.py b/src/calibre/ebooks/metadata/opf2.py
index 91255efbf5..2dd1ba5f85 100644
--- a/src/calibre/ebooks/metadata/opf2.py
+++ b/src/calibre/ebooks/metadata/opf2.py
@@ -155,6 +155,19 @@ class Spine(ResourceCollection):
         ResourceCollection.__init__(self)
         self.manifest = manifest
             
+            
+    def replace(self, start, end, ids):
+        '''
+        Replace the items between start (inclusive) and end (not inclusive) with
+        the items identified by ids (any length). Raises ValueError on an unknown id.
+        '''
+        items = []
+        for item_id in ids:
+            path = self.manifest.path_for_id(item_id)
+            if path is None:
+                raise ValueError('id %s not in manifest'%item_id)
+            items.append(Spine.Item(lambda x, item_id=item_id: item_id, path, is_path=True))
+        ResourceCollection.replace(self, start, end, items)
                     
     def linear_items(self):
         for r in self:
@@ -297,6 +310,55 @@ class OPF(object):
     def get_text(self, elem):
         return u''.join(self.TEXT(elem))
     
+    def itermanifest(self):
+        return self.manifest_path(self.tree)
+    
+    def create_manifest_item(self, href, media_type):
+        ids = [i.get('id', None) for i in self.itermanifest()]
+        id = None
+        for c in xrange(1, sys.maxint):
+            id = 'id%d'%c
+            if id not in ids:
+                break
+        if not media_type:
+            media_type = 'application/xhtml+xml'
+        ans = etree.Element('{%s}item'%self.NAMESPACES['opf'], 
+                             attrib={'id':id, 'href':href, 'media-type':media_type})
+        ans.tail = '\n\t\t'
+        return ans
+    
+    def replace_manifest_item(self, item, items):
+        items = [self.create_manifest_item(*i) for i in items]  # each i is unpacked as (href, media_type)
+        for i, item2 in enumerate(items):
+            item2.set('id', item.get('id')+'.%d'%(i+1))  # override the generated id with '<old id>.<n>', n from 1
+        manifest = item.getparent()
+        index = manifest.index(item)
+        manifest[index:index+1] = items  # swap the single old item for the new ones in place
+        return [i.get('id') for i in items]  # ids of the new items, in order
+    
+    def iterspine(self):
+        return self.spine_path(self.tree)
+    
+    def create_spine_item(self, idref):
+        ans = etree.Element('{%s}itemref'%self.NAMESPACES['opf'], idref=idref)
+        ans.tail = '\n\t\t'
+        return ans
+    
+    def replace_spine_items_by_idref(self, idref, new_idrefs):
+        items = list(map(self.create_spine_item, new_idrefs))  # one new itemref element per new idref
+        spine = self.XPath('/opf:package/*[re:match(name(), "spine", "i")]')(self.tree)[0]  # first child of package whose name matches "spine", ignoring case
+        old = [i for i in self.iterspine() if i.get('idref', None) == idref]  # existing itemrefs that point at idref
+        for x in old:
+            i = spine.index(x)
+            spine[i:i+1] = items  # NOTE(review): the same element objects are reused for every match; lxml presumably moves rather than copies them -- confirm if idref can repeat
+    
+    def iterguide(self):
+        return self.guide_path(self.tree)
+    
+    def render(self):
+        return etree.tostring(self.tree, encoding='UTF-8', xml_declaration=True, 
+                              pretty_print=True)
+    
     @apply
     def authors():
         
diff --git a/src/calibre/ebooks/metadata/toc.py b/src/calibre/ebooks/metadata/toc.py
index 8f4d2d2ecd..5e1a51619e 100644
--- a/src/calibre/ebooks/metadata/toc.py
+++ b/src/calibre/ebooks/metadata/toc.py
@@ -24,6 +24,8 @@ class TOC(list):
                  base_path=os.getcwd()):
         self.href = href
         self.fragment = fragment
+        if not self.fragment:
+            self.fragment = None
         self.text = text
         self.parent = parent
         self.base_path = base_path
@@ -153,8 +155,20 @@ class TOC(list):
                 continue
             purl = urlparse(unquote(a['href']))
             href, fragment = purl[2], purl[5]
+            if not fragment:
+                fragment = None
+            else:
+                fragment = fragment.strip()
+            href = href.strip()
+            
             txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
-            self.add_item(href, fragment, txt)
+            add = True
+            for i in self.flat():
+                if i.href == href and i.fragment == fragment:
+                    add = False
+                    break 
+            if add:
+                self.add_item(href, fragment, txt)
 
     def render(self, stream, uid):
         from calibre.resources import ncx_template
diff --git a/src/calibre/gui2/widgets.py b/src/calibre/gui2/widgets.py
index 5a1bd756b3..5b3c2afe8f 100644
--- a/src/calibre/gui2/widgets.py
+++ b/src/calibre/gui2/widgets.py
@@ -3,7 +3,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 Miscellaneous widgets used in the GUI
 '''
-import re, os
+import re, os, traceback
 from PyQt4.QtGui import QListView, QIcon, QFont, QLabel, QListWidget, \
                         QListWidgetItem, QTextCharFormat, QApplication, \
                         QSyntaxHighlighter, QCursor, QColor, QWidget, QDialog, \
@@ -254,7 +254,12 @@ class FontFamilyModel(QAbstractListModel):
     
     def __init__(self, *args):
         QAbstractListModel.__init__(self, *args)
-        self.families = find_font_families()
+        try:
+            self.families = find_font_families()
+        except:
+            self.families = []
+            print 'WARNING: Could not load fonts'
+            traceback.print_exc()
         self.families.sort()
         self.families[:0] = ['None']
         
diff --git a/src/calibre/linux_installer.py b/src/calibre/linux_installer.py
index f494bd0b14..e07cdc192a 100644
--- a/src/calibre/linux_installer.py
+++ b/src/calibre/linux_installer.py
@@ -278,7 +278,7 @@ def download_tarball():
 
 def main(args=sys.argv):
     defdir = '/opt/calibre'
-    destdir = raw_input('Enter the installation directory for calibre [%s]: '%defdir).strip()
+    destdir = raw_input('Enter the installation directory for calibre (Its contents will be deleted!)[%s]: '%defdir).strip()
     if not destdir:
         destdir = defdir
     if os.path.exists(destdir):
diff --git a/src/calibre/manual/custom.py b/src/calibre/manual/custom.py
index 2797829d87..1b0a6334d7 100644
--- a/src/calibre/manual/custom.py
+++ b/src/calibre/manual/custom.py
@@ -58,7 +58,7 @@ CLI_CMD=r'''
 .. include:: ../global.rst
 ||
 .. _$cmd:
-|| 
+||
 #def option(opt)
 :option:`${opt.get_opt_string() + ((', '+', '.join(opt._short_opts)) if opt._short_opts else '')}`
 #end
@@ -68,7 +68,7 @@ $cmd
 .. code-block:: none
 ||
     $cmdline
-|| 
+||
 #for line in usage
 #choose
 #when len(line) > 0
@@ -106,7 +106,7 @@ def cli_docs(app):
     info(bold('creating CLI documentation...'))
     documented_cmds = []
     undocumented_cmds = []
-        
+
     for script in entry_points['console_scripts']:
         module = script[script.index('=')+1:script.index(':')].strip()
         cmd = script[:script.index('=')].strip()
@@ -115,22 +115,22 @@ def cli_docs(app):
             documented_cmds.append((cmd, getattr(module, 'option_parser')()))
         else:
             undocumented_cmds.append(cmd)
-            
+
         documented_cmds.sort(cmp=lambda x, y: cmp(x[0], y[0]))
         undocumented_cmds.sort()
-    
+
     templ = TextTemplate(CLI_INDEX)
-    raw = templ.generate(documented_commands=documented_cmds, 
+    raw = templ.generate(documented_commands=documented_cmds,
                          undocumented_commands=undocumented_cmds).render()
     raw = raw.replace('||', '\n')
     if not os.path.exists('cli'):
         os.makedirs('cli')
     if not os.path.exists(os.path.join('cli', 'global.rst')):
-        os.link('global.rst', os.path.join('cli', 'global.rst'))                  
+        os.link('global.rst', os.path.join('cli', 'global.rst'))
     if not os.path.exists(os.path.join('cli', 'cli-index.rst')):
         info(bold('creating cli-index...'))
         open(os.path.join('cli', 'cli-index.rst'), 'wb').write(raw)
-    
+
     templ = TextTemplate(CLI_CMD)
     for cmd, parser in documented_cmds:
         usage = [i for i in parser.usage.replace('%prog', cmd).splitlines()]
@@ -140,18 +140,19 @@ def cli_docs(app):
         groups = [(None, None, parser.option_list)]
         for grp in parser.option_groups:
             groups.append((grp.title, grp.description, grp.option_list))
-        
+
         raw = templ.generate(cmd=cmd, cmdline=cmdline, usage=usage, groups=groups).render()
         raw = raw.replace('||', '\n').replace('&lt;', '<').replace('&gt;', '>')
         if not os.path.exists(os.path.join('cli', cmd+'.rst')):
             info(bold('creating docs for %s...'%cmd))
             open(os.path.join('cli', cmd+'.rst'), 'wb').write(raw)
 
+
 def auto_member(dirname, arguments, options, content, lineno,
                     content_offset, block_text, state, state_machine):
     name = arguments[0]
     env = state.document.settings.env
-    
+
     mod_cls, obj = rpartition(name, '.')
     if not mod_cls and hasattr(env, 'autodoc_current_class'):
         mod_cls = env.autodoc_current_class
@@ -162,11 +163,11 @@ def auto_member(dirname, arguments, options, content, lineno,
         mod = env.autodoc_current_module
     if not mod:
         mod = env.currmodule
-    
+
     module = __import__(mod, None, None, ['foo'])
     cls = getattr(module, cls)
     lines = inspect.getsourcelines(cls)[0]
-    
+
     comment_lines = []
     for i, line in enumerate(lines):
         if re.search(r'%s\s*=\s*\S+'%obj, line) and not line.strip().startswith('#:'):
@@ -178,33 +179,32 @@ def auto_member(dirname, arguments, options, content, lineno,
             break
     comment_lines.reverse()
     docstring = '\n'.join(comment_lines)
-    
+
     if module is not None and docstring is not None:
         docstring = docstring.decode(get_module_charset(mod))
-    
+
     result = ViewList()
     result.append('.. attribute:: %s.%s'%(cls.__name__, obj), '<autodoc>')
     result.append('', '<autodoc>')
-    
+
     docstring = prepare_docstring(docstring)
     for i, line in enumerate(docstring):
         result.append('    ' + line, '<docstring of %s>' % name, i)
-    
+
     result.append('', '')
     result.append('    **Default**: ``%s``'%repr(getattr(cls, obj, None)), '<default memeber value>')
     result.append('', '')
     node = nodes.paragraph()
     state.nested_parse(result, content_offset, node)
-        
-    return node
-    
-    
 
-    
+    return list(node)
+
+
+
 
 def setup(app):
     app.add_builder(CustomBuilder)
     app.add_directive('automember', auto_member, 1, (1, 0, 1))
     app.connect('doctree-read', substitute)
     app.connect('builder-inited', cli_docs)
-    
+
diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst
index 7ee463ea18..686d06dca6 100644
--- a/src/calibre/manual/faq.rst
+++ b/src/calibre/manual/faq.rst
@@ -134,6 +134,7 @@ There can be several causes for this:
     * **Any windows version**: If this happens during an initial run of calibre, try deleting the folder you chose for your ebooks and restarting calibre.
     * **Windows Vista**: If the folder :file:`C:\\Users\\Your User Name\\AppData\\Local\\VirtualStore\\Program Files\\calibre` exists, delete it. Uninstall |app|. Reboot. Re-install.
     * **Any windows version**: Search your computer for a folder named :file:`_ipython`. Delete it and try again.
+    * **Any windows version**: Try disabling any antivirus program you have running and see if that fixes it. Also try disabling any firewall software that prevents connections to the local computer.
 
 If it still wont launch, start a command prompt (press the windows key and R; then type :command:`cmd.exe` in the Run dialog that appears). At the command prompt type the following command and press Enter::
 
diff --git a/src/calibre/ptempfile.py b/src/calibre/ptempfile.py
index fd28c87f58..ae9fa26cb7 100644
--- a/src/calibre/ptempfile.py
+++ b/src/calibre/ptempfile.py
@@ -57,19 +57,21 @@ def PersistentTemporaryDirectory(suffix='', prefix='', dir=None):
     atexit.register(shutil.rmtree, tdir, True)
     return tdir
 
-class TemporaryDirectory(str):
+class TemporaryDirectory(object):
     '''
-    A temporary directory to be used ina  with statement.
+    A temporary directory to be used in a with statement.
     '''
-    def __init__(self, suffix='', prefix='', dir=None):
+    def __init__(self, suffix='', prefix='', dir=None, keep=False):
         self.suffix = suffix
         self.prefix = prefix
-        self.dir = dir 
+        self.dir = dir
+        self.keep = keep
     
     def __enter__(self):
         self.tdir = tempfile.mkdtemp(self.suffix, __appname__+"_"+ __version__+"_" +self.prefix, self.dir)
         return self.tdir
     
     def __exit__(self, *args):
-        shutil.rmtree(self.tdir)
+        if not self.keep:
+            shutil.rmtree(self.tdir)
       
diff --git a/src/calibre/translations/__init__.py b/src/calibre/translations/__init__.py
index 086803c12d..31630d345c 100644
--- a/src/calibre/translations/__init__.py
+++ b/src/calibre/translations/__init__.py
@@ -53,6 +53,9 @@ def import_from_launchpad(url):
             open(out, 'wb').write(tf.extractfile(next).read())
         next = tf.next()
     check_for_critical_bugs()
+    path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+    print path
+    subprocess.check_call('python setup.py translations'.split(), cwd=path)  # cwd (not dir) is the Popen keyword for the working directory
     return 0
  
 def check_for_critical_bugs():