diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index 1f33e34483..61d7801cb4 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -284,7 +284,7 @@ def english_sort(x, y):
 class LoggingInterface:
 
     def __init__(self, logger):
-        self.__logger = logger
+        self.__logger = self.logger = logger
         
     def setup_cli_handler(self, verbosity):
         for handler in self.__logger.handlers:
diff --git a/src/calibre/ebooks/epub/__init__.py b/src/calibre/ebooks/epub/__init__.py
index bf151d840c..1d8b6e6b4c 100644
--- a/src/calibre/ebooks/epub/__init__.py
+++ b/src/calibre/ebooks/epub/__init__.py
@@ -7,7 +7,6 @@ __docformat__ = 'restructuredtext en'
 Conversion to EPUB.
 '''
 import sys, textwrap
-from lxml import html
 from calibre.utils.config import Config, StringConfig
 from calibre.utils.zipfile import ZipFile, ZIP_STORED
 from calibre.ebooks.html import config as common_config, tostring
@@ -16,13 +15,11 @@ class DefaultProfile(object):
     
     flow_size   = sys.maxint
     screen_size = None
-    dpi         = 100
     
 class PRS505(DefaultProfile):
     
     flow_size   = 300000
     screen_size = (600, 775)
-    dpi         = 166
         
 
 PROFILES = {
@@ -30,6 +27,13 @@ PROFILES = {
             'None'   : DefaultProfile,
             }
 
+def rules(stylesheets):
+    'Yield every style rule from the stylesheets that have a ``cssText`` attribute'
+    for sheet in (s for s in stylesheets if hasattr(s, 'cssText')):
+        for rule in sheet:
+            if rule.type == rule.STYLE_RULE:
+                yield rule
+
 def initialize_container(path_to_container, opf_name='metadata.opf'):
     '''
     Create an empty EPUB document, with a default skeleton.
@@ -95,6 +99,12 @@ to auto-generate a Table of Contents.
         help=_("Don't add auto-detected chapters to the Table of Contents."))
     toc('toc_threshold', ['--toc-threshold'], default=6,
         help=_('If fewer than this number of chapters is detected, then links are added to the Table of Contents.'))
+    toc('level1_toc', ['--level1-toc'], default=None,
+        help=_('XPath expression that specifies all tags that should be added to the Table of Contents at level one. If this is specified, it takes precedence over other forms of auto-detection.'))
+    toc('level2_toc', ['--level2-toc'], default=None,
+        help=_('XPath expression that specifies all tags that should be added to the Table of Contents at level two. Each entry is added under the previous level one entry.'))
+    toc('from_ncx', ['--from-ncx'], default=None,
+        help=_('Path to a .ncx file that contains the table of contents to use for this ebook. The NCX file should contain links relative to the directory it is placed in. See http://www.niso.org/workrooms/daisy/Z39-86-2005.html#NCX for an overview of the NCX format.'))
     toc('use_auto_toc', ['--use-auto-toc'], default=False,
         help=_('Normally, if the source file already has a Table of Contents, it is used in preference to the autodetected one. With this option, the autodetected one is always used.'))
     
@@ -107,8 +117,10 @@ to auto-generate a Table of Contents.
            help=_('Set the left margin in pts. Default is %default'))
     layout('margin_right', ['--margin-right'], default=5.0, 
            help=_('Set the right margin in pts. Default is %default'))
-    layout('base_font_size', ['--base-font-size'], default=100.0,
-           help=_('The base font size as a percentage. Default is %default. Changing this should allow you to control overall base font sizes, except for input HTML files that use absolute font sizes for their text tags.'))
+    layout('base_font_size2', ['--base-font-size'], default=12.0,
+           help=_('The base font size in pts. Default is %defaultpt. Set to 0 to disable rescaling of fonts.'))
+    layout('remove_paragraph_spacing', ['--remove-paragraph-spacing'], default=True,
+           help=_('Remove spacing between paragraphs. Will not work if the source file forces inter-paragraph spacing.'))
     
     c.add_opt('show_opf', ['--show-opf'], default=False, group='debug',
               help=_('Print generated OPF file to stdout'))
diff --git a/src/calibre/ebooks/epub/fonts.py b/src/calibre/ebooks/epub/fonts.py
new file mode 100644
index 0000000000..5d0887f2d0
--- /dev/null
+++ b/src/calibre/ebooks/epub/fonts.py
@@ -0,0 +1,300 @@
+#!/usr/bin/env  python
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
+__docformat__ = 'restructuredtext en'
+
+'''
+Font size rationalization. See :function:`relativize`.
+'''
+
+import logging, re, operator, functools, collections, unittest, copy, sys
+from xml.dom import SyntaxErr
+
+from lxml.cssselect import CSSSelector
+from lxml import etree
+from lxml.html import HtmlElement
+
+from calibre.ebooks.html import fromstring
+from calibre.ebooks.epub import rules
+from cssutils import CSSParser
+
+num           = r'[-]?\d+|[-]?\d*\.\d+'
+# The bare zero is tried last, otherwise it would match the leading 0 of e.g. 0.5em
+length        = r'(?P<num>{num})(?P<unit>%|em|ex|px|in|cm|mm|pt|pc)|(?P<zero>0)'.replace('{num}', num)
+absolute_size = r'(?P<abs>(x?x-)?(small|large)|medium)'
+relative_size = r'(?P<rel>smaller|larger)'
+font_size_pat   = re.compile('|'.join((relative_size, absolute_size, length)), re.I)
+line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num))
+
+PTU = { # number of pts in one unit of each absolute CSS length unit
+       'in' : 72.,
+       'cm' : 72/2.54,
+       'mm' : 72/25.4,
+       'pt' : 1.0,
+       'pc' : 12., # a pica is 12 pts
+       'px' : 0.75, # CSS reference pixel: 1/96 in
+       }
+DEFAULT_FONT_SIZE = 12
+
+class Rationalizer(object):
+    
+    @classmethod
+    def specificity(cls, s):
+        '''Map CSS specificity tuple to a single integer, weighting earlier entries more heavily'''
+        return sum([10**(4-i) * x for i,x in enumerate(s)]) 
+        
+    @classmethod
+    def compute_font_size(cls, elem):
+        '''
+        Store the effective font size of ``elem`` in its ``computed_font_size``,
+        first resolving as many ancestors as necessary.
+        '''
+        if elem.computed_font_size is not None:
+            return # already computed
+        specified = elem.specified_font_size
+        if not callable(specified):
+            elem.computed_font_size = specified
+            return
+        # A callable maps the parent's computed size to this element's size
+        ancestor = elem.getparent()
+        cls.compute_font_size(ancestor)
+        elem.computed_font_size = specified(ancestor.computed_font_size)
+        
+    @classmethod
+    def calculate_font_size(cls, style):
+        '''Return font size in pts from style object. For relative units returns a
+        callable that maps the parent's size to this size. None if no size is found.'''
+        fs = ''
+        match = font_size_pat.search(style.font)
+        if match:
+            fs = match.group()
+        if style.fontSize:
+            # an explicit font-size wins over a size found in the font shorthand
+            fs = style.fontSize
+        match = font_size_pat.search(fs)
+        if match is None:
+            return None
+        match = match.groupdict()
+        unit = (match.get('unit') or '').lower()
+        if unit in PTU:
+            return PTU[unit] * float(match['num'])
+        if unit in ('em', 'ex'):
+            return functools.partial(operator.mul, float(match['num']))
+        if unit == '%':
+            return functools.partial(operator.mul, float(match['num'])/100.)
+        keyword = (match.get('abs') or '').lower()
+        if keyword:
+            # medium is the default size, every keyword step away from it scales by 1.2
+            steps = 0 if keyword == 'medium' else keyword.count('x') + 1
+            return DEFAULT_FONT_SIZE * 1.2**(steps if 'large' in keyword else -steps)
+        if match.get('rel'):
+            return functools.partial(operator.mul, 1.2 if match['rel'].lower() == 'larger' else 0.8)
+        return 0. if match.get('zero') else None # a unit that cannot be converted is ignored
+        
+    @classmethod
+    def resolve_rules(cls, stylesheets):
+        'Cache on every sheet its [selector, size] pairs as ``fs_rules`` (font size) and ``lh_rules`` (line height in pts)'
+        for sheet in stylesheets:
+            if hasattr(sheet, 'fs_rules'):
+                continue # this sheet has already been resolved
+            sheet.fs_rules, sheet.lh_rules = [], []
+            for r in sheet:
+                if r.type != r.STYLE_RULE:
+                    continue
+                font_size = cls.calculate_font_size(r.style)
+                if font_size is not None:
+                    sheet.fs_rules.extend([CSSSelector(s.selectorText), font_size] for s in r.selectorList)
+                lh = line_height_pat.search(r.style.lineHeight)
+                if lh is not None and lh.group(2).lower() in PTU: # skip units PTU cannot convert to pts
+                    height = float(lh.group(1)) * PTU[lh.group(2).lower()]
+                    sheet.lh_rules.extend([CSSSelector(s.selectorText), height] for s in r.selectorList)
+    
+        
+    @classmethod
+    def apply_font_size_rules(cls, stylesheets, root):
+        'Record, as ``specified_font_size``, the font size of every element of root matched by a font size rule'
+        cls.resolve_rules(stylesheets)
+        fs_rules = (rule for sheet in stylesheets for rule in sheet.fs_rules)
+        for selector, size in fs_rules:
+            # later rules overwrite earlier ones on elements matched by both
+            for matched in selector(root):
+                matched.specified_font_size = size
+    
+    @classmethod
+    def remove_font_size_information(cls, stylesheets):
+        'Strip font-size, sizes in the font shorthand and absolute line-heights from every style rule'
+        for rule in rules(stylesheets):
+            style = rule.style
+            style.removeProperty('font-size')
+            try:
+                remainder = font_size_pat.sub('', style.font).strip()
+                if remainder: style.font = remainder
+                else: style.removeProperty('font')
+            except SyntaxErr:
+                style.removeProperty('font')
+            if line_height_pat.search(style.lineHeight) is not None:
+                style.removeProperty('line-height')
+    
+    @classmethod
+    def compute_font_sizes(cls, root, stylesheets, base=12):
+        '''
+        Compute the effective font size of every element in ``root`` and return a
+        stylesheet of id based rules expressing each size relative to the parent.
+        If ``base`` > 0, sizes are rescaled so the most common one becomes ``base``.
+        '''
+        stylesheets = [s for s in stylesheets if hasattr(s, 'cssText')]
+        cls.apply_font_size_rules(stylesheets, root)
+        
+        # Compute the effective font size of all tags
+        root.computed_font_size = DEFAULT_FONT_SIZE
+        for elem in root.iter(etree.Element):
+            cls.compute_font_size(elem)
+        
+        # These are needed below even when rescaling is disabled (base <= 0)
+        extra_css = collections.defaultdict(list)
+        body = root.xpath('//body')[0]
+        counter = 0
+        if base > 0:
+            # Calculate the "base" (i.e. most common) font size
+            font_sizes = collections.defaultdict(lambda : 0)
+            IGNORE = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
+            for elem in body.iter(etree.Element):
+                if elem.tag not in IGNORE:
+                    t = getattr(elem, 'text', '')
+                    if t: t = t.strip()
+                    if t:
+                        font_sizes[elem.computed_font_size] += len(t)
+                    
+                t = getattr(elem, 'tail', '')
+                if t: t = t.strip()
+                if t:
+                    parent = elem.getparent()
+                    if parent.tag not in IGNORE:
+                        font_sizes[parent.computed_font_size] += len(t)
+                
+            try:
+                most_common = max(font_sizes.items(), key=operator.itemgetter(1))[0]
+                scale = base/most_common if most_common > 0 else 1.
+            except ValueError:
+                scale = 1. # max() of an empty sequence: no text was counted
+            
+            # rescale absolute line-heights
+            for sheet in stylesheets:
+                for selector, lh in sheet.lh_rules:
+                    for elem in selector(root):
+                        elem.set('id', elem.get('id', 'cfs_%d'%counter))
+                        counter += 1
+                        extra_css[elem.get('id')].append('line-height:%fpt'%(lh*scale))
+            
+            # Rescale all computed font sizes
+            for elem in body.iter(etree.Element):
+                if isinstance(elem, HtmlElement):
+                    elem.computed_font_size *= scale
+        
+        # Remove all font size specifications from the last stylesheet 
+        cls.remove_font_size_information(stylesheets[-1:])
+                    
+        # Create the CSS to implement the rescaled font sizes
+        for elem in body.iter(etree.Element):
+            cfs, pcfs = map(operator.attrgetter('computed_font_size'), (elem, elem.getparent()))
+            if abs(cfs-pcfs) > 1/12. and abs(pcfs) > 1/12.:
+                elem.set('id', elem.get('id', 'cfs_%d'%counter))
+                counter += 1
+                extra_css[elem.get('id')].append('font-size: %f%%'%(100*(cfs/pcfs)))
+                
+        css = CSSParser(loglevel=logging.ERROR).parseString('')
+        for elem_id, declarations in extra_css.items():
+            css.add('#%s {%s}'%(elem_id, ';'.join(declarations)))
+        return css
+    
+    @classmethod
+    def rationalize(cls, stylesheets, root, opts):
+        'Return a stylesheet implementing the rationalized font sizes of root, or None if disabled or on failure'
+        logger     = logging.getLogger('html2epub')
+        logger.info('\t\tRationalizing fonts...')
+        extra_css = None
+        if opts.base_font_size2 > 0:
+            try:
+                extra_css = cls.compute_font_sizes(root, stylesheets, base=opts.base_font_size2)
+            except Exception: # best effort; KeyboardInterrupt and SystemExit still propagate
+                logger.warning('Failed to rationalize font sizes.')
+                if opts.verbose > 1: logger.exception('')
+            finally:
+                root.remove_font_size_information()
+        logger.debug('\t\tDone rationalizing')
+        return extra_css
+
+################################################################################
+############## Testing
+################################################################################
+
+class FontTest(unittest.TestCase):
+    
+    def setUp(self):
+        from calibre.ebooks.epub import config
+        self.opts = config(defaults='').parse()
+        self.html = '''
+        <html>
+            <head>
+                <title>Test document</title>
+            </head>
+            <body>
+                <div id="div1">
+                <!-- A comment -->
+                    <p id="p1">Some <b>text</b></p>
+                </div>
+                <p id="p2">Some other <span class="it">text</span>.</p>
+                <p id="longest">The longest piece of single font size text in this entire file. Used to test resizing.</p>
+            </body>
+        </html> 
+        '''
+        self.root = fromstring(self.html)
+        
+    def do_test(self, css, base=DEFAULT_FONT_SIZE, scale=1): # NOTE(review): scale is never used in the body
+        root1 = copy.deepcopy(self.root)
+        root1.computed_font_size = DEFAULT_FONT_SIZE
+        stylesheet = CSSParser(loglevel=logging.ERROR).parseString(css)
+        stylesheet2 = Rationalizer.compute_font_sizes(root1, [stylesheet], base) # the generated id based rules
+        root2 = copy.deepcopy(root1)
+        root2.remove_font_size_information() # recompute from scratch on the copy
+        root2.computed_font_size = DEFAULT_FONT_SIZE
+        Rationalizer.apply_font_size_rules([stylesheet2], root2)
+        for elem in root2.iter(etree.Element):
+            Rationalizer.compute_font_size(elem)
+        for e1, e2 in zip(root1.xpath('//body')[0].iter(etree.Element), root2.xpath('//body')[0].iter(etree.Element)): # sizes derived from the generated CSS alone must match
+            self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size, 
+                msg='Computed font sizes for %s not equal. Original: %f Processed: %f'%\
+                (root1.getroottree().getpath(e1), e1.computed_font_size, e2.computed_font_size))
+        return stylesheet2.cssText
+        
+    def testStripping(self):
+        'Test that any original entries are removed from the CSS'
+        css = 'p { font: bold 10px italic smaller; font-size: x-large} \na { font-size: 0 }'
+        css = CSSParser(loglevel=logging.ERROR).parseString(css)
+        Rationalizer.compute_font_sizes(copy.deepcopy(self.root), [css])
+        self.assertEqual(css.cssText.replace(' ', '').replace('\n', ''), 
+                         'p{font:bolditalic}')
+    
+    def testIdentity(self):
+        'Test that no unnecessary font size changes are made'
+        extra_css = self.do_test('div {font-size:12pt} \nspan {font-size:100%}')
+        self.assertEqual(extra_css.strip(), '')
+        
+    def testRelativization(self):
+        'Test conversion of absolute to relative sizes'
+        self.do_test('#p1 {font: 24pt} b {font: 12pt} .it {font: 48pt} #p2 {font: 100%}')
+        
+    def testResizing(self):
+        'Test resizing of fonts'
+        self.do_test('#longest {font: 24pt} .it {font:20pt; line-height:22pt}')
+        
+
+def suite(): # the font rationalization tests as a unittest suite
+    return unittest.defaultTestLoader.loadTestsFromTestCase(FontTest)
+    
+def test(): # run the suite verbosely; returns a process exit status (0 only if every test passed)
+    return int(not unittest.TextTestRunner(verbosity=2).run(suite()).wasSuccessful())
+
+if __name__ == '__main__':
+    sys.exit(test())    
+        
\ No newline at end of file
diff --git a/src/calibre/ebooks/epub/from_html.py b/src/calibre/ebooks/epub/from_html.py
index e5fe93ce27..54f91e9f0f 100644
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@@ -32,8 +32,7 @@ Conversion of HTML/OPF files follows several stages:
     * The EPUB container is created.
 '''
 
-import os, sys, re, cStringIO, logging
-from contextlib import nested
+import os, sys, cStringIO, logging
 
 from lxml.etree import XPath
 try:
@@ -41,7 +40,7 @@ try:
 except ImportError:
     import Image as PILImage
 
-from calibre.ebooks.html import Processor, get_text, merge_metadata, get_filelist,\
+from calibre.ebooks.html import Processor, merge_metadata, get_filelist,\
     opf_traverse, create_metadata, rebase_toc
 from calibre.ebooks.epub import config as common_config
 from calibre.ptempfile import TemporaryDirectory
@@ -50,21 +49,23 @@ from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.metadata.opf2 import OPF
 from calibre.ebooks.epub import initialize_container, PROFILES
 from calibre.ebooks.epub.split import split
+from calibre.ebooks.epub.fonts import Rationalizer
 from calibre.constants import preferred_encoding
 
 
-class HTMLProcessor(Processor):
+class HTMLProcessor(Processor, Rationalizer):
     
-    def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles):
+    def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, stylesheets):
         Processor.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, 
-                        name='html2epub')
+                           name='html2epub')
         if opts.verbose > 2:
             self.debug_tree('parsed')
         self.detect_chapters()
         
-        
-        self.extract_css()
-        self.relativize_font_sizes()
+        self.extract_css(stylesheets)
+        if self.opts.base_font_size2 > 0:
+            self.font_css = self.rationalize(self.external_stylesheets+[self.stylesheet], 
+                                             self.root, self.opts)
         if opts.verbose > 2:
             self.debug_tree('nocss')
             
@@ -73,19 +74,6 @@ class HTMLProcessor(Processor):
             meta.getparent().remove(meta)
         Processor.save(self)
         
-        #self.collect_font_statistics()
-        
-        
-    def collect_font_statistics(self):
-        '''
-        Collect font statistics to figure out the base font size used in this
-        HTML document.
-        '''
-        self.font_statistics = {} #: A mapping of font size (in pts) to number of characters rendered at that font size
-        for text in get_text(self.body if self.body is not None else self.root):
-            length, parent = len(re.sub(r'\s+', '', text)), text.getparent()
-            #TODO: Use cssutils on self.raw_css to figure out the font size 
-            # of this piece of text and update statistics accordingly        
     
             
 
@@ -104,21 +92,30 @@ the <spine> element of the OPF file.
 
 def parse_content(filelist, opts, tdir):
     os.makedirs(os.path.join(tdir, 'content', 'resources'))
-    resource_map = {}
+    resource_map, stylesheets = {}, {}
     toc = TOC(base_path=tdir, type='root')
+    stylesheet_map = {}
     for htmlfile in filelist:
+        logging.getLogger('html2epub').debug('Processing %s...'%htmlfile)
         hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'), 
-                           resource_map, filelist)
+                           resource_map, filelist, stylesheets)
         hp.populate_toc(toc)
         hp.save()
+        stylesheet_map[os.path.basename(hp.save_path())] = \
+            [s for s in hp.external_stylesheets + [hp.stylesheet, hp.font_css, hp.override_css] if s is not None]
     
+    logging.getLogger('html2epub').debug('Saving stylesheets...')
+    if opts.base_font_size2 > 0:
+        Rationalizer.remove_font_size_information(stylesheets.values())
+        for path, css in stylesheets.items():
+            open(path, 'wb').write(getattr(css, 'cssText', css).encode('utf-8'))
     if toc.count('chapter') > opts.toc_threshold:
         toc.purge(['file', 'link', 'unknown'])
     if toc.count('chapter') + toc.count('file') > opts.toc_threshold:
         toc.purge(['link', 'unknown'])
     toc.purge(['link'], max=opts.max_toc_links)
     
-    return resource_map, hp.htmlfile_map, toc
+    return resource_map, hp.htmlfile_map, toc, stylesheet_map
 
 def resize_cover(im, opts):
     width, height = im.size
@@ -176,7 +173,7 @@ def process_title_page(mi, filelist, htmlfilemap, opts, tdir):
         <title>Cover</title>
         <style type="text/css">@page {padding: 0pt; margin:0pt}</style>
     </head>
-    <body style="padding: 0pt; margin: 0pt;}">
+    <body style="padding: 0pt; margin: 0pt">
         <div style="text-align:center">
             <img style="text-align: center" src="%s" alt="cover" />
         </div>
@@ -212,11 +209,22 @@ def convert(htmlfile, opts, notification=None):
         mi = merge_metadata(htmlfile, opf, opts)
     opts.chapter = XPath(opts.chapter, 
                     namespaces={'re':'http://exslt.org/regular-expressions'})
+    if opts.level1_toc:
+        opts.level1_toc = XPath(opts.level1_toc, 
+                            namespaces={'re':'http://exslt.org/regular-expressions'})
+    else:
+        opts.level1_toc = None
+    if opts.level2_toc:
+        opts.level2_toc = XPath(opts.level2_toc, 
+                            namespaces={'re':'http://exslt.org/regular-expressions'})
+    else:
+        opts.level2_toc = None 
     
     with TemporaryDirectory(suffix='_html2epub', keep=opts.keep_intermediate) as tdir:
         if opts.keep_intermediate:
             print 'Intermediate files in', tdir
-        resource_map, htmlfile_map, generated_toc = parse_content(filelist, opts, tdir)
+        resource_map, htmlfile_map, generated_toc, stylesheet_map = \
+                                        parse_content(filelist, opts, tdir)
         logger = logging.getLogger('html2epub')
         resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()]
         
@@ -235,6 +243,10 @@ def convert(htmlfile, opts, notification=None):
             rebase_toc(mi.toc, htmlfile_map, tdir)
         if opts.use_auto_toc or mi.toc is None or len(list(mi.toc.flat())) < 2:
             mi.toc = generated_toc
+        if opts.from_ncx:
+            toc = TOC()
+            toc.read_ncx_toc(opts.from_ncx)
+            mi.toc = toc
         for item in mi.manifest:
             if getattr(item, 'mime_type', None) == 'text/html':
                 item.mime_type = 'application/xhtml+xml'
@@ -247,7 +259,7 @@ def convert(htmlfile, opts, notification=None):
                 f.write(toc)
             if opts.show_ncx:
                 print toc
-        split(opf_path, opts)
+        split(opf_path, opts, stylesheet_map)
         opf = OPF(opf_path, tdir)
         opf.remove_guide()
         if has_title_page:
diff --git a/src/calibre/ebooks/epub/split.py b/src/calibre/ebooks/epub/split.py
index 11df503dc4..30d3857941 100644
--- a/src/calibre/ebooks/epub/split.py
+++ b/src/calibre/ebooks/epub/split.py
@@ -12,10 +12,9 @@ import os, math, logging, functools, collections, re, copy
 from lxml.etree import XPath as _XPath
 from lxml import etree, html
 from lxml.cssselect import CSSSelector
-from cssutils import CSSParser
 
 from calibre.ebooks.metadata.opf2 import OPF
-from calibre.ebooks.epub import tostring
+from calibre.ebooks.epub import tostring, rules
 from calibre import CurrentDir, LoggingInterface
 
 XPath = functools.partial(_XPath, namespaces={'re':'http://exslt.org/regular-expressions'})
@@ -35,7 +34,7 @@ class SplitError(ValueError):
 
 class Splitter(LoggingInterface):
     
-    def __init__(self, path, opts, always_remove=False):
+    def __init__(self, path, opts, stylesheet_map, always_remove=False):
         LoggingInterface.__init__(self, logging.getLogger('htmlsplit'))
         self.setup_cli_handler(opts.verbose)
         self.path = path
@@ -46,22 +45,8 @@ class Splitter(LoggingInterface):
         self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.)
         root = html.fromstring(open(content(path)).read())
             
-        css = XPath('//link[@type = "text/css" and @rel = "stylesheet"]')(root)
-        if css:
-            cssp = os.path.join('content', *(css[0].get('href').split('/')))
-            self.log_debug('\t\tParsing stylesheet...')
-            try: 
-                stylesheet = CSSParser().parseString(open(cssp, 'rb').read())
-            except:
-                self.log_warn('Failed to parse CSS. Splitting on page-breaks is disabled')
-                if self.opts.verbose > 1:
-                    self.log_exception('')
-                stylesheet = None
-        else:
-            stylesheet = None
         self.page_breaks = []
-        if stylesheet is not None:
-            self.find_page_breaks(stylesheet, root)
+        self.find_page_breaks(stylesheet_map[self.path], root)
             
         self.trees = []
         self.split_size = 0
@@ -189,14 +174,12 @@ class Splitter(LoggingInterface):
                 self.split(t)
                 
                 
-    def find_page_breaks(self, stylesheet, root):
+    def find_page_breaks(self, stylesheets, root):
         '''
         Find all elements that have either page-break-before or page-break-after set.
         '''
         page_break_selectors = set([])
-        for rule in stylesheet:
-            if rule.type != rule.STYLE_RULE:
-                continue
+        for rule in rules(stylesheets):
             before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower()
             after  = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower()
             try:
@@ -385,7 +368,7 @@ def fix_ncx(path, changes):
     if changed:
         open(path, 'wb').write(etree.tostring(tree.getroot(), encoding='UTF-8', xml_declaration=True))
        
-def split(pathtoopf, opts):
+def split(pathtoopf, opts, stylesheet_map):
     pathtoopf = os.path.abspath(pathtoopf)
     with CurrentDir(os.path.dirname(pathtoopf)):
         opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
@@ -403,7 +386,7 @@ def split(pathtoopf, opts):
         for f in html_files:
             if os.stat(content(f)).st_size > opts.profile.flow_size:
                 try:
-                    changes.append(Splitter(f, opts, 
+                    changes.append(Splitter(f, opts, stylesheet_map,
                         always_remove=(always_remove or \
                         os.stat(content(f)).st_size > 5*opts.profile.flow_size)))
                 except (SplitError, RuntimeError):
diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py
index e22f2bbc0e..e5e6f3f37a 100644
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -8,12 +8,14 @@ Code to recursively parse HTML files and create an open ebook in a specified
 directory or zip file. All the action starts in :function:`create_dir`.
 '''
 
-import sys, re, os, shutil, logging, tempfile, cStringIO
+import sys, re, os, shutil, logging, tempfile, cStringIO, operator, functools
 from urlparse import urlparse
 from urllib import unquote
 
-from lxml import html, etree
-from lxml.html import soupparser
+from lxml import etree
+from lxml.html import HtmlElementClassLookup, HTMLParser as _HTMLParser, \
+                      fromstring as _fromstring, tostring as _tostring, \
+                      soupparser, HtmlElement
 from lxml.etree import XPath
 get_text = XPath("//text()")
 
@@ -25,9 +27,67 @@ from calibre.ebooks.metadata.meta import get_metadata
 from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
 from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
 from calibre.utils.zipfile import ZipFile
+from cssutils import CSSParser
+
+class HTMLElement(HtmlElement):
+    '''
+    HTML element that keeps its specified and computed font sizes in the
+    ``specified_font_size`` and ``computed_font_size`` XML attributes.
+    '''
+    
+    def _get_specified_font_size(self):
+        raw = self.get('specified_font_size', '')
+        if raw.startswith('f'):
+            # stored as 'f' + factor: a multiplier to apply to another size
+            return functools.partial(operator.mul, float(raw[1:]))
+        # nothing stored: the identity function, i.e. the size passed in is kept
+        return float(raw) if raw else (lambda x: x)
+    
+    def _set_specified_font_size(self, val):
+        # callables are serialized as 'f' + their value at 1
+        text = ('f'+repr(val(1))) if callable(val) else repr(val)
+        self.set('specified_font_size', text)
+    
+    specified_font_size = property(fget=_get_specified_font_size, fset=_set_specified_font_size)
+    
+    def _get_computed_font_size(self):
+        raw = self.get('computed_font_size', '')
+        return None if raw == '' else float(raw)
+    
+    def _set_computed_font_size(self, val):
+        self.set('computed_font_size', repr(val))
+    
+    computed_font_size = property(fget=_get_computed_font_size, fset=_set_computed_font_size)
+    
+    def remove_font_size_information(self):
+        'Drop both font size attributes from this element and everything below it'
+        for elem in self.iter():
+            elem.attrib.pop('computed_font_size', None)
+            elem.attrib.pop('specified_font_size', None)
+                
+    def getpath(self):
+        return self.getroottree().getpath(self)
+
+class Lookup(HtmlElementClassLookup):
+    'Element class lookup that maps every element node to :class:`HTMLElement`'
+    def lookup(self, node_type, document, namespace, name):
+        if node_type != 'element': # other node types keep the default lookup
+            return HtmlElementClassLookup.lookup(self, node_type, document, namespace, name)
+        return HTMLElement
+
+class HTMLParser(_HTMLParser):
+    'HTML parser that uses :class:`Lookup` to choose its element classes'
+    def __init__(self, **kwargs):
+        _HTMLParser.__init__(self, **kwargs)
+        self.set_element_class_lookup(Lookup())
+        
+parser = HTMLParser()
+
+def fromstring(raw, **kw): # parse raw with the module level parser, which builds HTMLElement instances
+    return _fromstring(raw, parser=parser, **kw)
 
 def tostring(root, pretty_print=False):
-    return html.tostring(root, encoding='utf-8', method='xml', 
+    return _tostring(root, encoding='utf-8', method='xml', 
                          include_meta_content_type=True, 
                          pretty_print=pretty_print)
     
@@ -372,11 +432,11 @@ class Parser(PreProcessor, LoggingInterface):
         for pat in ENCODING_PATS:
             src = pat.sub('', src)
         try:
-            self.root =  html.fromstring(src)
+            self.root = fromstring(src)
         except:
             if self.opts.verbose:
                 self.log_exception('lxml based parsing failed')
-            self.root = soupparser.fromstring(src)
+            self.root = soupparser.fromstring(src, makeelement=parser.makeelement)
         head = self.root.xpath('./head')
         if head:
             head = head[0]
@@ -402,7 +462,7 @@ class Parser(PreProcessor, LoggingInterface):
             os.makedirs(tdir)
         with open(os.path.join(tdir, '%s-%s.html'%\
                     (os.path.basename(self.htmlfile.path), name)), 'wb') as f:
-            f.write(html.tostring(self.root, encoding='utf-8'))
+            f.write(tostring(self.root, encoding='utf-8'))
             self.log_debug(_('Written processed HTML to ')+f.name)
     
             
@@ -443,19 +503,21 @@ class Processor(Parser):
     '''
     
     LINKS_PATH = XPath('//a[@href]')
+    PIXEL_PAT  = re.compile(r'([-]?\d+|[-]?\d*\.\d+)px')
+    
+    def __init__(self, *args, **kwargs):
+        Parser.__init__(self, *args, **kwargs)
+        temp = LoggingInterface(logging.getLogger('cssutils'))
+        temp.setup_cli_handler(self.opts.verbose)
+        self.css_parser = CSSParser(log=temp.logger, loglevel=logging.ERROR)
+        self.stylesheet = self.font_css = self.override_css = None
     
     def detect_chapters(self):
         self.detected_chapters = self.opts.chapter(self.root)
         for elem in self.detected_chapters:
             text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')])
             self.log_info('\tDetected chapter: %s', text[:50])
-            if self.opts.chapter_mark in ('both', 'pagebreak'):
-                style = elem.get('style', '').strip()
-                if style and not style.endswith(';'):
-                    style += '; '
-                style += 'page-break-before: always'
-                elem.set('style', style)
-            if self.opts.chapter_mark in ('both', 'rule'):
+            if self.opts.chapter_mark != 'none':
                 hr = etree.Element('hr')
                 if elem.getprevious() is None:
                     elem.getparent()[:0] = [hr]
@@ -466,16 +528,28 @@ class Processor(Parser):
                             insert = i
                             break
                     elem.getparent()[insert:insert] = [hr]
+                if self.opts.chapter_mark != 'rule':
+                    hr.set('style', 'width:0pt;page-break-before:always')
+                    if self.opts.chapter_mark == 'both':
+                        hr2 = etree.Element('hr')
+                        hr2.tail = u'\u00a0'
+                        p = hr.getparent()
+                        i = p.index(hr)
+                        p[i:i] = [hr2]
+                
                     
         
     def save(self):
-        style_path = os.path.basename(self.save_path())+'.css'
-        style = etree.SubElement(self.head, 'link', attrib={'type':'text/css', 'rel':'stylesheet', 
-                                                       'href':'resources/'+style_path,
-                                                       'charset':'UTF-8'})
-        style.tail = '\n'
-        style_path = os.path.join(os.path.dirname(self.save_path()), 'resources', style_path)
-        open(style_path, 'wb').write(self.css.encode('utf-8'))
+        style_path = os.path.splitext(os.path.basename(self.save_path()))[0]
+        for i, sheet in enumerate([self.stylesheet, self.font_css, self.override_css]):
+            if sheet is not None:
+                style = etree.SubElement(self.head, 'link', attrib={'type':'text/css', 'rel':'stylesheet', 
+                                                           'href':'resources/%s_%d.css'%(style_path, i),
+                                                           'charset':'UTF-8'})
+                style.tail = '\n'
+                path = os.path.join(os.path.dirname(self.save_path()), *(style.get('href').split('/')))
+                self.resource_map[path] = style.get('href')
+                open(path, 'wb').write(getattr(sheet, 'cssText', sheet).encode('utf-8'))
         return Parser.save(self)
     
     def populate_toc(self, toc):
@@ -491,14 +565,45 @@ class Processor(Parser):
                 text = text[:50] + u'\u2026'
             return target.add_item(href, fragment, text, type=type)
         
-        # Add chapters to TOC
+        name = self.htmlfile_map[self.htmlfile.path]
+        href = 'content/'+name
+        
+        # Add level 1 and level 2 TOC items
         counter = 0
+        if self.opts.level1_toc is not None:
+            level1 = self.opts.level1_toc(self.root)
+            if level1:
+                added = {}
+                for elem in level1:
+                    text = (u''.join(elem.xpath('string()'))).strip()
+                    if text:
+                        id = elem.get('id', 'calibre_chapter_%d'%counter)
+                        counter += 1
+                        elem.set('id', id)
+                        added[elem] = add_item(href, id, text, toc, type='chapter')
+                        add_item(href, id, 'Top', added[elem], type='chapter')
+                if self.opts.level2_toc is not None:
+                    level2 = list(self.opts.level2_toc(self.root))
+                    for elem in level2:
+                        level1 = None
+                        for item in self.root.iterdescendants():
+                            if item in added.keys():
+                                level1 = added[item]
+                            elif item == elem and level1 is not None:
+                                text = (u''.join(elem.xpath('string()'))).strip()
+                                if text:
+                                    id = elem.get('id', 'calibre_chapter_%d'%counter)
+                                    counter += 1
+                                    elem.set('id', id)
+                                    add_item(href, id, text, level1, type='chapter')
+                    
+        
+        # Add chapters to TOC
+        
         if not self.opts.no_chapters_in_toc:
             for elem in getattr(self, 'detected_chapters', []):
                 text = (u''.join(elem.xpath('string()'))).strip()
                 if text:
-                    name = self.htmlfile_map[self.htmlfile.path]
-                    href = 'content/'+name
                     counter += 1
                     id = elem.get('id', 'calibre_chapter_%d'%counter)
                     elem.set('id', id)
@@ -518,8 +623,7 @@ class Processor(Parser):
                 pass
             
         
-        name = self.htmlfile_map[self.htmlfile.path]
-        href = 'content/'+name
+        
         
         
         if referrer.href != href: # Happens for root file
@@ -541,13 +645,24 @@ class Processor(Parser):
                             name = self.htmlfile_map[self.htmlfile.referrer.path]
                         add_item(href, fragment, text, target)
                         
-                    
+    @classmethod
+    def preprocess_css(cls, css, dpi=96):
+        def rescale(match):
+            val = match.group(1)
+            try:
+                val = float(val)
+            except ValueError:
+                return ''
+            return '%fpt'%(72 * val/dpi)
         
-    def extract_css(self):
+        return cls.PIXEL_PAT.sub(rescale, css)
+        
+    def extract_css(self, parsed_sheets):
         '''
-        Remove all CSS information from the document and store in self.raw_css. 
-        This includes <font> tags.
+        Remove all CSS information from the document and store it as 
+        :class:`StyleSheet` objects.
         '''
+        
         def get_id(chapter, counter, prefix='calibre_css_'):
             new_id = '%s_%d'%(prefix, counter)
             if chapter.tag.lower() == 'a' and  'name' in chapter.keys():
@@ -562,17 +677,40 @@ class Processor(Parser):
                 chapter.set('id', id)
             return id
     
-        css = []
+        self.external_stylesheets, self.stylesheet = [], self.css_parser.parseString('')
         for link in self.root.xpath('//link'):
             if 'css' in link.get('type', 'text/css').lower():
-                file = os.path.join(self.tdir, link.get('href', ''))
-                if file and os.path.exists(file) and os.path.isfile(file):
-                    css.append(open(file, 'rb').read().decode('utf-8'))
-                link.getparent().remove(link)
-                    
+                file = os.path.join(self.tdir, *(link.get('href', '').split('/')))
+                if file and not 'http:' in file:
+                    if not parsed_sheets.has_key(file):
+                        try:
+                            self.log_info('Processing stylesheet %s...'%file)
+                            css = self.preprocess_css(open(file).read())
+                        except (IOError, OSError):
+                            self.log_error('Failed to open stylesheet: %s'%file)
+                        else:
+                            try:
+                                parsed_sheets[file] = self.css_parser.parseString(css)
+                            except:
+                                parsed_sheets[file] = css.decode('utf8', 'replace')
+                                self.log_warning('Failed to parse stylesheet: %s'%file)
+                                if self.opts.verbose > 1:
+                                    self.log_exception('')
+                    if parsed_sheets.has_key(file):
+                        self.external_stylesheets.append(parsed_sheets[file])
+                
+        
         for style in self.root.xpath('//style'):
             if 'css' in style.get('type', 'text/css').lower():
-                css.append('\n'.join(style.xpath('./text()')))
+                raw = '\n'.join(style.xpath('./text()'))
+                css = self.preprocess_css(raw)
+                try:
+                    sheet = self.css_parser.parseString(css)
+                except:
+                    self.log_debug('Failed to parse style element')
+                else:
+                    for rule in sheet:
+                        self.stylesheet.add(rule)
                 style.getparent().remove(style)
         
         cache = {}
@@ -613,57 +751,19 @@ class Processor(Parser):
             elem.set('class', cn)
             elem.attrib.pop('style')
         
-        for setting, cn in cache.items():
-            css.append('.%s {%s}'%(cn, setting))
-        
-            
-        self.raw_css = '\n\n'.join(css)
-        self.css = unicode(self.raw_css)
+        css = '\n'.join(['.%s {%s;}'%(cn, setting) for \
+                         setting, cn in cache.items()])
+        self.stylesheet = self.css_parser.parseString(self.preprocess_css(css))
+        css = ''
         if self.opts.override_css:
-            self.css += '\n\n'+self.opts.override_css
-        self.do_layout()
-        # TODO: Figure out what to do about CSS imports from linked stylesheets
-    
-    def relativize_font_sizes(self, dpi=100, base=16):
-        '''
-        Convert all absolute font sizes to percentages of ``base`` using ``dpi``
-        to convert from screen to paper units.
-        :param base: Base size in pixels. Adobe DE seems to need base size to be 16
-        irrespective of the unit of the length being converted
-        :param dpi: Dots per inch used to convert pixels to absolute lengths. Since
-        most HTML files are created on computers with monitors of DPI ~ 100, we use
-        100 by default.
-        '''
-        size_value_pat = re.compile(r'(?<!/)(?P<num>[0-9.]+)(?P<unit>cm|mm|in|pt|pc|px)', re.I)
+            css += '\n\n' + self.opts.override_css
+        css += '\n\n' + 'body {margin-top: 0pt; margin-bottom: 0pt; margin-left: 0pt; margin-right: 0pt;}'
+        css += '\n\n@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}'%(self.opts.margin_top, self.opts.margin_bottom, self.opts.margin_left, self.opts.margin_right)
+        if self.opts.remove_paragraph_spacing:
+            css += '\n\np {text-indent: 2.1em; margin-top:1pt; margin-bottom:1pt; padding:0pt; border:0pt;}'
+        self.override_css = self.css_parser.parseString(self.preprocess_css(css))
         
-        # points per unit
-        ptu = { # Convert to pt
-                  'px' : 72./dpi,
-                  'pt' : 1.0,
-                  'pc' : 1/12.,
-                  'in' : 72.,
-                  'cm' : 72/2.54,
-                  'mm' : 72/25.4,
-                  }
         
-        def relativize(match):
-            val  = float(match.group('num'))
-            unit = match.group('unit').lower()
-            val  *= ptu[unit]
-            return '%.1f%%'%((val/base) * 100)
-             
-        
-        def sub(match):
-            rule = match.group(1)
-            value = size_value_pat.sub(relativize, match.group(2))
-            return '%s : %s'%(rule, value)
-        
-        self.css = re.compile(r'(font|font-size)\s*:\s*([^;]+)', re.I).sub(sub, self.css)
-    
-    def do_layout(self):
-        self.css += '\nbody {margin-top: 0pt; margin-bottom: 0pt; margin-left: 0pt; margin-right: 0pt; font-size: %f%%}\n'%self.opts.base_font_size
-        self.css += '@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}\n'%(self.opts.margin_top, self.opts.margin_bottom, self.opts.margin_left, self.opts.margin_right)
-
 def config(defaults=None, config_name='html',
            desc=_('Options to control the traversal of HTML')):
     if defaults is None:
diff --git a/src/calibre/gui2/dialogs/epub.py b/src/calibre/gui2/dialogs/epub.py
index 9f6dbd6dc6..78a2be0f51 100644
--- a/src/calibre/gui2/dialogs/epub.py
+++ b/src/calibre/gui2/dialogs/epub.py
@@ -17,6 +17,7 @@ from calibre.ebooks.epub.from_any import SOURCE_FORMATS, config
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ptempfile import PersistentTemporaryFile
 from calibre.ebooks.metadata.opf import OPFCreator
+from lxml.etree import XPath
 
 class Config(QDialog, Ui_Dialog):
     
@@ -234,6 +235,16 @@ class Config(QDialog, Ui_Dialog):
                     self.source_format = d.format()
                 
     def accept(self):
+        for opt in ('chapter', 'level1_toc', 'level2_toc'):
+            text = unicode(getattr(self, 'opt_'+opt).text())
+            if text:
+                try:
+                    XPath(text,namespaces={'re':'http://exslt.org/regular-expressions'})
+                except Exception, err:
+                    error_dialog(self, _('Invalid XPath expression'),
+                        _('The expression %s is invalid. Error: %s')%(text, err) 
+                                 ).exec_()
+                    return
         mi = self.get_metadata()
         self.read_settings()
         self.cover_file = None
diff --git a/src/calibre/gui2/dialogs/epub.ui b/src/calibre/gui2/dialogs/epub.ui
index fe4ccdef5d..3ecc0991e8 100644
--- a/src/calibre/gui2/dialogs/epub.ui
+++ b/src/calibre/gui2/dialogs/epub.ui
@@ -77,7 +77,7 @@
      <item>
       <widget class="QStackedWidget" name="stack" >
        <property name="currentIndex" >
-        <number>1</number>
+        <number>3</number>
        </property>
        <widget class="QWidget" name="metadata_page" >
         <layout class="QGridLayout" name="gridLayout_4" >
@@ -416,29 +416,36 @@
               <string>Base &amp;font size:</string>
              </property>
              <property name="buddy" >
-              <cstring>opt_base_font_size</cstring>
+              <cstring>opt_base_font_size2</cstring>
              </property>
             </widget>
            </item>
            <item row="1" column="2" >
-            <widget class="QDoubleSpinBox" name="opt_base_font_size" >
+            <widget class="QDoubleSpinBox" name="opt_base_font_size2" >
              <property name="suffix" >
-              <string> %</string>
+              <string> pt</string>
              </property>
              <property name="decimals" >
               <number>0</number>
              </property>
              <property name="minimum" >
-              <double>10.000000000000000</double>
+              <double>0.000000000000000</double>
              </property>
              <property name="maximum" >
-              <double>500.000000000000000</double>
+              <double>30.000000000000000</double>
              </property>
              <property name="singleStep" >
-              <double>5.000000000000000</double>
+              <double>1.000000000000000</double>
              </property>
              <property name="value" >
-              <double>100.000000000000000</double>
+              <double>30.000000000000000</double>
+             </property>
+            </widget>
+           </item>
+           <item row="2" column="0" >
+            <widget class="QCheckBox" name="opt_remove_paragraph_spacing" >
+             <property name="text" >
+              <string>Remove &amp;spacing between paragraphs</string>
              </property>
             </widget>
            </item>
@@ -674,6 +681,32 @@ p, li { white-space: pre-wrap; }
               </property>
              </widget>
             </item>
+            <item row="4" column="1" >
+             <widget class="QLineEdit" name="opt_level1_toc" />
+            </item>
+            <item row="4" column="0" >
+             <widget class="QLabel" name="label_19" >
+              <property name="text" >
+               <string>Level &amp;1 TOC</string>
+              </property>
+              <property name="buddy" >
+               <cstring>opt_level1_toc</cstring>
+              </property>
+             </widget>
+            </item>
+            <item row="5" column="0" >
+             <widget class="QLabel" name="label_20" >
+              <property name="text" >
+               <string>Level &amp;2 TOC</string>
+              </property>
+              <property name="buddy" >
+               <cstring>opt_level2_toc</cstring>
+              </property>
+             </widget>
+            </item>
+            <item row="5" column="1" >
+             <widget class="QLineEdit" name="opt_level2_toc" />
+            </item>
            </layout>
           </widget>
          </item>
diff --git a/src/calibre/linux.py b/src/calibre/linux.py
index 7a820d3cfa..33796f5b15 100644
--- a/src/calibre/linux.py
+++ b/src/calibre/linux.py
@@ -295,6 +295,11 @@ complete -o nospace  -F _prs500 prs500
 ''')
         f.close()
         print 'done'
+    except TypeError, err:
+        if 'resolve_entities' in str(err):
+            print 'You need python-lxml >= 2.0.5 for calibre'
+            sys.exit(1)
+        raise
     except:
         if fatal_errors:
             raise
diff --git a/src/calibre/trac/plugins/download.py b/src/calibre/trac/plugins/download.py
index 33049e9dc8..ca5ecabed4 100644
--- a/src/calibre/trac/plugins/download.py
+++ b/src/calibre/trac/plugins/download.py
@@ -45,7 +45,7 @@ class Distribution(object):
     INSTALLERS = ('emerge -avn', 'apt-get install', 'yum install')
     AS_ROOT    = (True, False, True)
     
-    TITLEMAP = {'gentoo':'Gentoo', 'ubuntu':'Ubuntu Interpid Ibex',
+    TITLEMAP = {'gentoo':'Gentoo', 'ubuntu':'Ubuntu Intrepid Ibex',
                 'fedora':'Fedora 10', 'debian':'Debian sid', 'generic': 'Install from source'}
     
     MANUAL_MAP = {
diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py
index 003e9af318..dffb9f8c56 100644
--- a/src/calibre/web/feeds/__init__.py
+++ b/src/calibre/web/feeds/__init__.py
@@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 Contains the logic for parsing feeds.
 '''
-import time, logging, traceback
+import time, logging, traceback, copy
 from datetime import datetime
 
 from calibre.web.feeds.feedparser import parse
@@ -17,7 +17,7 @@ class Article(object):
     def __init__(self, id, title, url, summary, published, content):
         self.downloaded = False
         self.id = id
-        self.title = title
+        self.title = title.strip() if title else title
         self.url = url
         self.summary = summary
         self.content = content
@@ -38,7 +38,14 @@ Has content : %s
 
     def __str__(self):
         return repr(self)
-
+    
+    def is_same_as(self, other_article):
+        #if self.title != getattr(other_article, 'title', False):
+        #    return False
+        if self.url:
+            return self.url == getattr(other_article, 'url', False)
+        return self.content == getattr(other_article, 'content', False)
+    
 
 class Feed(object):
 
@@ -169,7 +176,72 @@ class Feed(object):
                               len(a.summary if a.summary else ''))
                 
         return length > 2000 * len(self)
+    
+    def has_article(self, article):
+        for a in self:
+            if a.is_same_as(article):
+                return True
+        return False
+    
+    def find(self, article):
+        for i, a in enumerate(self):
+            if a.is_same_as(article):
+                return i
+        return -1
+    
+    def remove(self, article):
+        i = self.find(article)  # find() yields -1 when no article matches
+        if i > -1:
+            self.articles[i:i+1] = []
 
+class FeedCollection(list):
+    '''List of the non-empty feeds in `feeds`, with duplicate articles removed.'''
+    def __init__(self, feeds):
+        list.__init__(self, [f for f in feeds if len(f.articles) > 0])
+        found_articles = set([])
+        duplicates = set([])
+        
+        def in_set(s, a):
+            for x in s:
+                if a.is_same_as(x):
+                    return x
+            return None
+        
+        # Keep the first occurrence of each article; every later occurrence is
+        # removed from its feed and recorded as (first occurrence, that feed).
+        for f in self:
+            dups = []
+            for a in f:
+                first = in_set(found_articles, a)
+                if first is not None:
+                    dups.append(a)
+                    duplicates.add((first, f))
+                else:
+                    found_articles.add(a)
+            for x in dups:
+                f.articles.remove(x)
+                
+        self.duplicates = duplicates
+    
+    def find_article(self, article):
+        '''Return (feed index, article index) of `article`, compared by
+        identity. Falls through (returns None) if no feed contains it.'''
+        for j, f in enumerate(self):
+            for i, a in enumerate(f):
+                if a is article:
+                    return (j, i)
+    
+    def restore_duplicates(self):
+        '''Append to each recorded feed a deep copy of the kept article, with its url pointing at the kept copy.'''
+        temp = []
+        for article, feed in self.duplicates:
+            art = copy.deepcopy(article)
+            j, i = self.find_article(article)
+            art.url = '../feed_%d/article_%d/index.html'%(j, i)
+            temp.append((feed, art))
+        for feed, art in temp:
+            feed.articles.append(art)
+        
 
 def feed_from_xml(raw_xml, title=None, oldest_article=7, 
                   max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)):
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 6a9e9acd52..212ca84aac 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -289,15 +289,16 @@ class BasicNewsRecipe(object, LoggingInterface):
         '''
         return soup
     
-    def postprocess_html(self, soup):
+    def postprocess_html(self, soup, first_fetch):
         '''
         This method is called with the source of each downloaded :term:`HTML` file, after
         it is parsed for links and images. 
         It can be used to do arbitrarily powerful post-processing on the :term:`HTML`.
         It should return `soup` after processing it. 
         
-        `soup`: A `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_ 
+        :param soup: A `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_ 
         instance containing the downloaded :term:`HTML`.
+        :param first_fetch: True if this is the first page of an article.
         '''
         return soup
     
@@ -482,7 +483,7 @@ class BasicNewsRecipe(object, LoggingInterface):
                 elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                 body.insert(0, elem)
             
-        return self.postprocess_html(soup)
+        return self.postprocess_html(soup, first_fetch)
         
     
     def download(self):
diff --git a/src/calibre/web/feeds/recipes/espn.py b/src/calibre/web/feeds/recipes/espn.py
index d8c33847cf..34a1bc609a 100644
--- a/src/calibre/web/feeds/recipes/espn.py
+++ b/src/calibre/web/feeds/recipes/espn.py
@@ -67,7 +67,7 @@ class ESPN(BasicNewsRecipe):
         
         return soup
     
-    def postprocess_html(self, soup):
+    def postprocess_html(self, soup, first_fetch):
         for div in soup.findAll('div', style=True):
             div['style'] = div['style'].replace('center', 'left')
         return soup
diff --git a/src/calibre/web/feeds/recipes/newsweek.py b/src/calibre/web/feeds/recipes/newsweek.py
index 0da8b8965d..9ad551c469 100644
--- a/src/calibre/web/feeds/recipes/newsweek.py
+++ b/src/calibre/web/feeds/recipes/newsweek.py
@@ -92,7 +92,7 @@ class Newsweek(BasicNewsRecipe):
         return sections
         
     
-    def postprocess_html(self,  soup):
+    def postprocess_html(self, soup, first_fetch):
         divs = list(soup.findAll('div', 'pagination'))
         if not divs:
             return
diff --git a/src/calibre/web/feeds/recipes/outlook_india.py b/src/calibre/web/feeds/recipes/outlook_india.py
index c5782d1536..db8ad900ab 100644
--- a/src/calibre/web/feeds/recipes/outlook_india.py
+++ b/src/calibre/web/feeds/recipes/outlook_india.py
@@ -73,7 +73,7 @@ class OutlookIndia(BasicNewsRecipe):
                     
         return feeds
 
-    def postprocess_html(self, soup):
+    def postprocess_html(self, soup, first_fetch):
         bad = []
         for table in soup.findAll('table'):
             if table.find(text=re.compile(r'\(\d+ of \d+\)')):
diff --git a/src/calibre/web/feeds/recipes/scientific_american.py b/src/calibre/web/feeds/recipes/scientific_american.py
index b9ca0f131f..7d22013aaf 100644
--- a/src/calibre/web/feeds/recipes/scientific_american.py
+++ b/src/calibre/web/feeds/recipes/scientific_american.py
@@ -7,14 +7,16 @@ __docformat__ = 'restructuredtext en'
 sciam.com
 '''
 import re
+from lxml import html
 from calibre.web.feeds.news import BasicNewsRecipe
 
 class ScientificAmerican(BasicNewsRecipe):
     title = u'Scientific American'
-    description = u'Popular science' 
+    description = u'Popular science. Monthly magazine.' 
     __author__ = 'Kovid Goyal'
     oldest_article = 30 
     max_articles_per_feed = 100
+    no_stylesheets = True
     use_embedded_content   = False
     remove_tags_before = dict(name='div', attrs={'class':'headline'})
     remove_tags_after  = dict(id='article')
@@ -26,25 +28,102 @@ class ScientificAmerican(BasicNewsRecipe):
     html2lrf_options = ['--base-font-size', '8']
     recursions = 1
     match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14)']
-    feeds = [
-             (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'), 
-             (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'), 
-             (u'Health', u'http://rss.sciam.com/sciam/health'), 
-             (u'Space', u'http://rss.sciam.com/sciam/space'), 
-             (u'Technology', u'http://rss.sciam.com/sciam/technology'), 
-             (u'Biology', u'http://rss.sciam.com/sciam/biology'), 
-             (u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'), 
-             (u"What's Next", u'http://rss.sciam.com/sciam/whats-next'), 
-             (u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'), 
-             (u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'), 
-             (u'Math', u'http://rss.sciam.com/sciam/math'), 
-             (u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'), 
-             (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'), 
-             (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog')
-            ]
+#    feeds = [
+#             (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'), 
+#             (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'), 
+#             (u'Health', u'http://rss.sciam.com/sciam/health'), 
+#             (u'Space', u'http://rss.sciam.com/sciam/space'), 
+#             (u'Technology', u'http://rss.sciam.com/sciam/technology'), 
+#             (u'Biology', u'http://rss.sciam.com/sciam/biology'), 
+#             (u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'), 
+#             (u"What's Next", u'http://rss.sciam.com/sciam/whats-next'), 
+#             (u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'), 
+#             (u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'), 
+#             (u'Math', u'http://rss.sciam.com/sciam/math'), 
+#             (u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'), 
+#             (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'), 
+#             (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog')
+#            ]
+#    
+    def parse_index(self):
+        src = self.browser.open('http://www.sciam.com/sciammag/').read()
+        root = html.fromstring(src)
+        self.cover_url = root.xpath('//img[re:match(@src, "cover_")]', 
+                                    namespaces={'re':'http://exslt.org/regular-expressions'}
+                                    )[0].get('src')
+        self.timefmt = root.xpath('//div[@id = "magazine-month"]')[0].text
+        feeds = []
+        features = []
+        for a in root.xpath('//a[@href and @title = "Feature"]'):
+            if not (a.text or u'').strip():  # a.text is None when the <a> has no leading text
+                continue
+            article = {
+                       'url'    : a.get('href'),
+                       'title'  : u''.join(a.xpath('./text()')),
+                       'date'   : '',
+                       'description' : '',   
+                       }
+            for s in a.itersiblings('span'):
+                if s.get('class', '') == 'sub':
+                    article['description'] += u''.join(s.xpath('./text()')) + ' '
+            features.append(article)
+        if features:
+            feeds.append(('Features', features))
+            
+        departments = []
+        for a in root.xpath('//a[@href and @class="title"]'):
+            txt = u''.join(a.xpath('./text()')).strip()
+            if not txt:
+                continue
+            article = {
+                       'url'    : a.get('href'),
+                       'title'  : txt,
+                       'date'   : '',
+                       'description' : '',   
+                       }
+            p = a.getparent()
+            p.remove(a)
+            article['description'] = u''.join(p.xpath('./text()'))
+            departments.append(article)
+            
+        feeds.append(('Departments', departments))
+        opinion = []
+        for a in root.xpath('//div[@id = "opinion"]//a[@href]'):
+            txt = u''.join(a.xpath('./text()')).strip()
+            if not txt:
+                continue
+            article = {
+                       'url'    : a.get('href'),
+                       'title'  : txt,
+                       'date'   : '',
+                       'description' : '',   
+                       }
+            opinion.append(article)
+        feeds.append(('Opinion', opinion))
+        
+        ontheweb = []
+        for a in root.xpath('//div[@id = "ontheweb"]//a[@href]'):
+            txt = u''.join(a.xpath('./text()')).strip()
+            if not txt:
+                continue
+            article = {
+                       'url'    : a.get('href'),
+                       'title'  : txt,
+                       'date'   : '',
+                       'description' : '',   
+                       }
+            ontheweb.append(article)
+        feeds.append(('On the web', ontheweb))
+        
+        return feeds
+        
     
-    def postprocess_html(self,  soup):
+    def postprocess_html(self, soup, first_fetch):
         if soup is not None:
             for span in soup.findAll('span', attrs={'class':'pagination'}):
                 span.extract()
+            if not first_fetch:
+                div = soup.find('div', attrs={'class':'headline'})
+                if div: 
+                    div.extract()
         return soup
diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py
index bd867a2045..c220e8390f 100644
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -198,7 +198,7 @@ class RecursiveFetcher(object, LoggingInterface):
                 try:
                     f = self.fetch_url(iurl)
                 except Exception, err:
-                    self.log_warning('Could not fetch stylesheet %s', iurl)
+                    self.log_debug('Could not fetch stylesheet %s', iurl)
                     self.log_debug('Error: %s', str(err), exc_info=True)
                     continue
                 stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')