Implemented font size control in EPUB conversion: the base font size can now be specified in absolute units. Spacing between paragraphs is now removed by default for EPUB output (this can be turned off). Added options for more sophisticated generation of an automatic Table of Contents in EPUB files. Restricted the Scientific American recipe to downloading only articles from the current issue.

This commit is contained in:
Kovid Goyal 2008-10-04 13:02:38 -07:00
parent 6fab7e97c3
commit 7fefb01f35
17 changed files with 788 additions and 180 deletions

View File

@ -284,7 +284,7 @@ def english_sort(x, y):
class LoggingInterface:
def __init__(self, logger):
self.__logger = logger
self.__logger = self.logger = logger
def setup_cli_handler(self, verbosity):
for handler in self.__logger.handlers:

View File

@ -7,7 +7,6 @@ __docformat__ = 'restructuredtext en'
Conversion to EPUB.
'''
import sys, textwrap
from lxml import html
from calibre.utils.config import Config, StringConfig
from calibre.utils.zipfile import ZipFile, ZIP_STORED
from calibre.ebooks.html import config as common_config, tostring
@ -16,13 +15,11 @@ class DefaultProfile(object):
flow_size = sys.maxint
screen_size = None
dpi = 100
class PRS505(DefaultProfile):
flow_size = 300000
screen_size = (600, 775)
dpi = 166
PROFILES = {
@ -30,6 +27,13 @@ PROFILES = {
'None' : DefaultProfile,
}
def rules(stylesheets):
    '''
    Yield every style rule from the given stylesheets.

    Entries that are not parsed stylesheet objects (anything lacking a
    ``cssText`` attribute, e.g. raw CSS strings) are skipped entirely, as
    are non-STYLE_RULE rules such as @media or @import.
    '''
    for sheet in stylesheets:
        if not hasattr(sheet, 'cssText'):
            continue
        for rule in sheet:
            if rule.type == rule.STYLE_RULE:
                yield rule
def initialize_container(path_to_container, opf_name='metadata.opf'):
'''
Create an empty EPUB document, with a default skeleton.
@ -95,6 +99,12 @@ to auto-generate a Table of Contents.
help=_("Don't add auto-detected chapters to the Table of Contents."))
toc('toc_threshold', ['--toc-threshold'], default=6,
help=_('If fewer than this number of chapters is detected, then links are added to the Table of Contents.'))
toc('level1_toc', ['--level1-toc'], default=None,
help=_('XPath expression that specifies all tags that should be added to the Table of Contents at level one. If this is specified, it takes precedence over other forms of auto-detection.'))
toc('level2_toc', ['--level2-toc'], default=None,
help=_('XPath expression that specifies all tags that should be added to the Table of Contents at level two. Each entry is added under the previous level one entry.'))
toc('from_ncx', ['--from-ncx'], default=None,
help=_('Path to a .ncx file that contains the table of contents to use for this ebook. The NCX file should contain links relative to the directory it is placed in. See http://www.niso.org/workrooms/daisy/Z39-86-2005.html#NCX for an overview of the NCX format.'))
toc('use_auto_toc', ['--use-auto-toc'], default=False,
help=_('Normally, if the source file already has a Table of Contents, it is used in preference to the autodetected one. With this option, the autodetected one is always used.'))
@ -107,8 +117,10 @@ to auto-generate a Table of Contents.
help=_('Set the left margin in pts. Default is %default'))
layout('margin_right', ['--margin-right'], default=5.0,
help=_('Set the right margin in pts. Default is %default'))
layout('base_font_size', ['--base-font-size'], default=100.0,
help=_('The base font size as a percentage. Default is %default. Changing this should allow you to control overall base font sizes, except for input HTML files that use absolute font sizes for their text tags.'))
layout('base_font_size2', ['--base-font-size'], default=12.0,
help=_('The base font size in pts. Default is %defaultpt. Set to 0 to disable rescaling of fonts.'))
layout('remove_paragraph_spacing', ['--remove-paragraph-spacing'], default=True,
help=_('Remove spacing between paragraphs. Will not work if the source file forces inter-paragraph spacing.'))
c.add_opt('show_opf', ['--show-opf'], default=False, group='debug',
help=_('Print generated OPF file to stdout'))

View File

@ -0,0 +1,300 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Font size rationalization. See :class:`Rationalizer`.
'''
import logging, re, operator, functools, collections, unittest, copy, sys
from xml.dom import SyntaxErr
from lxml.cssselect import CSSSelector
from lxml import etree
from lxml.html import HtmlElement
from calibre.ebooks.html import fromstring
from calibre.ebooks.epub import rules
from cssutils import CSSParser
# Regular expressions matching the CSS 2.1 grammar for font-size values.
num = r'[-]?\d+|[-]?\d*\.\d+'  # CSS number: optionally signed integer or decimal
# A length is either a bare zero or a number followed by a CSS unit.
length = r'(?P<zero>0)|(?P<num>{num})(?P<unit>%|em|ex|px|in|cm|mm|pt|pc)'.replace('{num}', num)
absolute_size = r'(?P<abs>(x?x-)?(small|large)|medium)'  # keywords: xx-small ... xx-large, medium
relative_size = r'(?P<rel>smaller|larger)'  # keywords relative to the parent size

font_size_pat = re.compile('|'.join((relative_size, absolute_size, length)), re.I)
# line-height values in absolute units only (no %, em or ex)
line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num))

# Points-per-unit: multipliers converting each absolute CSS length unit to pts.
PTU = {
    'in' : 72.,
    'cm' : 72/2.54,
    'mm' : 72/25.4,
    'pt' : 1.0,
    'pc' : 1/12.,
}

# Font size in pts assumed for the document root / unstyled text.
DEFAULT_FONT_SIZE = 12
class Rationalizer(object):
    '''
    Rescale all font sizes in an HTML document so that the most common
    ("base") size becomes a caller-supplied value, emitting the rescaled
    sizes as relative (%) CSS rules keyed on element ids. Operates on lxml
    trees whose elements expose ``specified_font_size`` /
    ``computed_font_size`` properties and on cssutils stylesheet objects.
    '''

    @classmethod
    def specificity(cls, s):
        '''Map CSS specificity tuple to a single integer'''
        # NOTE(review): '+ x' adds each component instead of scaling it; a
        # conventional specificity fold would be 10**(4-i) * x. Confirm
        # intent — this method appears unused within this module.
        return sum([10**(4-i) + x for i,x in enumerate(s)])

    @classmethod
    def compute_font_size(cls, elem):
        '''
        Calculate the effective font size of an element traversing its
        ancestors as far as necessary.

        Stores the result (in pts) in ``elem.computed_font_size``.
        '''
        cfs = elem.computed_font_size
        if cfs is not None:
            return  # already computed on an earlier traversal
        sfs = elem.specified_font_size
        if callable(sfs):
            # Relative size (em/ex/%/smaller/larger, or unspecified):
            # resolve against the parent's computed size, recursing first.
            parent = elem.getparent()
            cls.compute_font_size(parent)
            elem.computed_font_size = sfs(parent.computed_font_size)
        else:
            elem.computed_font_size = sfs

    @classmethod
    def calculate_font_size(cls, style):
        'Return font size in pts from style object. For relative units returns a callable'
        # Prefer an explicit font-size property over a size embedded in the
        # font shorthand.
        match = font_size_pat.search(style.font)
        fs = ''
        if match:
            fs = match.group()
        if style.fontSize:
            fs = style.fontSize

        match = font_size_pat.search(fs)
        if match is None:
            return None
        match = match.groupdict()
        unit = match.get('unit', '')
        if unit: unit = unit.lower()
        if unit in PTU.keys():
            # Absolute unit: convert directly to pts.
            return PTU[unit] * float(match['num'])
        if unit in ('em', 'ex'):
            # Relative to parent size: return a multiplier callable.
            return functools.partial(operator.mul, float(match['num']))
        if unit == '%':
            return functools.partial(operator.mul, float(match['num'])/100.)
        # NOTE(review): 'abs' shadows the builtin within this method.
        abs = match.get('abs', '')
        if abs: abs = abs.lower()
        if abs:
            # Keyword sizes: each x- step scales by 1.2 from medium (12pt),
            # e.g. xx-small = 12 * 1.2**-2, x-large = 12 * 1.2**1.
            x = (1.2)**(abs.count('x') * (-1 if 'small' in abs else 1))
            return 12 * x
        if match.get('zero', False):
            return 0.
        # Remaining cases are the relative keywords smaller/larger.
        return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8)

    @classmethod
    def resolve_rules(cls, stylesheets):
        '''
        Annotate each stylesheet with ``fs_rules`` and ``lh_rules``: lists of
        [CSSSelector, value] pairs for every style rule that sets a font
        size or an absolute line-height. Idempotent per sheet.
        '''
        for sheet in stylesheets:
            if hasattr(sheet, 'fs_rules'):
                continue  # this sheet was already resolved
            sheet.fs_rules = []
            sheet.lh_rules = []
            for r in sheet:
                if r.type == r.STYLE_RULE:
                    font_size = cls.calculate_font_size(r.style)
                    if font_size is not None:
                        # One entry per selector so each can be applied
                        # to the document tree independently.
                        for s in r.selectorList:
                            sheet.fs_rules.append([CSSSelector(s.selectorText), font_size])
                    orig = line_height_pat.search(r.style.lineHeight)
                    if orig is not None:
                        for s in r.selectorList:
                            # Convert the line-height to pts immediately.
                            sheet.lh_rules.append([CSSSelector(s.selectorText), float(orig.group(1)) * PTU[orig.group(2).lower()]])

    @classmethod
    def apply_font_size_rules(cls, stylesheets, root):
        'Add a ``specified_font_size`` attribute to every element that has a specified font size'
        # NOTE: rules are applied in sheet/document order, not by CSS
        # specificity — later matches overwrite earlier ones.
        cls.resolve_rules(stylesheets)
        for sheet in stylesheets:
            for selector, font_size in sheet.fs_rules:
                elems = selector(root)
                for elem in elems:
                    elem.specified_font_size = font_size

    @classmethod
    def remove_font_size_information(cls, stylesheets):
        '''
        Strip font-size and absolute line-height declarations from every
        style rule in the given stylesheets.
        '''
        for r in rules(stylesheets):
            r.style.removeProperty('font-size')
            try:
                # Remove the size component from the font shorthand,
                # keeping the rest of the shorthand if anything remains.
                new = font_size_pat.sub('', r.style.font).strip()
                if new:
                    r.style.font = new
                else:
                    r.style.removeProperty('font')
            except SyntaxErr:
                # The stripped shorthand is no longer valid CSS: drop it.
                r.style.removeProperty('font')
            if line_height_pat.search(r.style.lineHeight) is not None:
                r.style.removeProperty('line-height')

    @classmethod
    def compute_font_sizes(cls, root, stylesheets, base=12):
        '''
        Rescale the document so its most common font size becomes ``base``
        pts (no rescaling when ``base`` <= 0). Returns a new stylesheet of
        id-keyed rules expressing the rescaled sizes as percentages and
        absolute line-heights in pts.
        '''
        stylesheets = [s for s in stylesheets if hasattr(s, 'cssText')]
        cls.apply_font_size_rules(stylesheets, root)

        # Compute the effective font size of all tags
        root.computed_font_size = DEFAULT_FONT_SIZE
        for elem in root.iter(etree.Element):
            cls.compute_font_size(elem)

        extra_css = {}  # maps element id -> list of CSS declarations
        if base > 0:
            # Calculate the "base" (i.e. most common) font size, weighted by
            # the number of text characters rendered at each size. Headings
            # are ignored so they do not skew the result.
            font_sizes = collections.defaultdict(lambda : 0)
            body = root.xpath('//body')[0]
            IGNORE = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
            for elem in body.iter(etree.Element):
                if elem.tag not in IGNORE:
                    t = getattr(elem, 'text', '')
                    if t: t = t.strip()
                    if t:
                        font_sizes[elem.computed_font_size] += len(t)

                    # Tail text is rendered at the parent's size.
                    t = getattr(elem, 'tail', '')
                    if t: t = t.strip()
                    if t:
                        parent = elem.getparent()
                        if parent.tag not in IGNORE:
                            font_sizes[parent.computed_font_size] += len(t)

            try:
                most_common = max(font_sizes.items(), key=operator.itemgetter(1))[0]
                # NOTE(review): under Python 2 this is integer division when
                # both operands are ints (e.g. base=10, most_common=12 -> 0);
                # consider float(base)/most_common. Confirm.
                scale = base/most_common if most_common > 0 else 1.
            except ValueError:
                # max() on an empty mapping: document has no text.
                scale = 1.

            # rescale absolute line-heights
            counter = 0  # used to generate unique ids for unlabelled elements
            for sheet in stylesheets:
                for selector, lh in sheet.lh_rules:
                    for elem in selector(root):
                        elem.set('id', elem.get('id', 'cfs_%d'%counter))
                        counter += 1
                        if not extra_css.has_key(elem.get('id')):
                            extra_css[elem.get('id')] = []
                        extra_css[elem.get('id')].append('line-height:%fpt'%(lh*scale))

            # Rescale all computed font sizes
            for elem in body.iter(etree.Element):
                if isinstance(elem, HtmlElement):
                    elem.computed_font_size *= scale

            # Remove all font size specifications from the last stylesheet
            cls.remove_font_size_information(stylesheets[-1:])

            # Create the CSS to implement the rescaled font sizes: emit a
            # %-rule only where an element's size differs from its parent's
            # by more than 1/12 pt (CSS inheritance covers the rest).
            for elem in body.iter(etree.Element):
                cfs, pcfs = map(operator.attrgetter('computed_font_size'), (elem, elem.getparent()))
                if abs(cfs-pcfs) > 1/12. and abs(pcfs) > 1/12.:
                    elem.set('id', elem.get('id', 'cfs_%d'%counter))
                    counter += 1
                    if not extra_css.has_key(elem.get('id')):
                        extra_css[elem.get('id')] = []
                    extra_css[elem.get('id')].append('font-size: %f%%'%(100*(cfs/pcfs)))

        # Serialize the collected per-id declarations into a stylesheet.
        css = CSSParser(loglevel=logging.ERROR).parseString('')
        for id, r in extra_css.items():
            css.add('#%s {%s}'%(id, ';'.join(r)))
        return css

    @classmethod
    def rationalize(cls, stylesheets, root, opts):
        '''
        Entry point: rescale fonts to ``opts.base_font_size2`` pts.

        Returns the extra stylesheet produced by :meth:`compute_font_sizes`,
        or None when rescaling is disabled or fails. Always strips the
        bookkeeping font-size attributes from the tree afterwards.
        '''
        logger = logging.getLogger('html2epub')
        logger.info('\t\tRationalizing fonts...')
        extra_css = None
        if opts.base_font_size2 > 0:
            try:
                extra_css = cls.compute_font_sizes(root, stylesheets, base=opts.base_font_size2)
            except:
                # Best effort: a failed rationalization must not abort the
                # whole conversion.
                logger.warning('Failed to rationalize font sizes.')
                if opts.verbose > 1:
                    logger.exception('')
            finally:
                root.remove_font_size_information()
        logger.debug('\t\tDone rationalizing')
        return extra_css
################################################################################
############## Testing
################################################################################
class FontTest(unittest.TestCase):
    '''
    Tests for :class:`Rationalizer`. The core round-trip check
    (:meth:`do_test`) verifies that applying the generated extra CSS to a
    stripped copy of the document reproduces the rescaled computed sizes.
    '''

    def setUp(self):
        # Imported here to avoid a circular import at module load time —
        # TODO confirm.
        from calibre.ebooks.epub import config
        self.opts = config(defaults='').parse()
        self.html = '''
<html>
<head>
<title>Test document</title>
</head>
<body>
<div id="div1">
<!-- A comment -->
<p id="p1">Some <b>text</b></p>
</div>
<p id="p2">Some other <span class="it">text</span>.</p>
<p id="longest">The longest piece of single font size text in this entire file. Used to test resizing.</p>
</body>
</html>
'''
        self.root = fromstring(self.html)

    def do_test(self, css, base=DEFAULT_FONT_SIZE, scale=1):
        '''
        Rationalize a copy of the test document with ``css``, then verify
        that a second, size-stripped copy styled only by the generated
        stylesheet computes identical font sizes. Returns the generated CSS
        text for further assertions.
        '''
        root1 = copy.deepcopy(self.root)
        root1.computed_font_size = DEFAULT_FONT_SIZE
        stylesheet = CSSParser(loglevel=logging.ERROR).parseString(css)
        stylesheet2 = Rationalizer.compute_font_sizes(root1, [stylesheet], base)
        root2 = copy.deepcopy(root1)
        root2.remove_font_size_information()
        root2.computed_font_size = DEFAULT_FONT_SIZE
        Rationalizer.apply_font_size_rules([stylesheet2], root2)
        for elem in root2.iter(etree.Element):
            Rationalizer.compute_font_size(elem)
        # Compare element-by-element: document order is identical in both
        # copies, so zip pairs corresponding elements.
        for e1, e2 in zip(root1.xpath('//body')[0].iter(etree.Element), root2.xpath('//body')[0].iter(etree.Element)):
            self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size,
                msg='Computed font sizes for %s not equal. Original: %f Processed: %f'%\
                (root1.getroottree().getpath(e1), e1.computed_font_size, e2.computed_font_size))
        return stylesheet2.cssText

    def testStripping(self):
        'Test that any original entries are removed from the CSS'
        css = 'p { font: bold 10px italic smaller; font-size: x-large} \na { font-size: 0 }'
        css = CSSParser(loglevel=logging.ERROR).parseString(css)
        Rationalizer.compute_font_sizes(copy.deepcopy(self.root), [css])
        self.assertEqual(css.cssText.replace(' ', '').replace('\n', ''),
            'p{font:bolditalic}')

    def testIdentity(self):
        'Test that no unnecessary font size changes are made'
        extra_css = self.do_test('div {font-size:12pt} \nspan {font-size:100%}')
        self.assertEqual(extra_css.strip(), '')

    def testRelativization(self):
        'Test conversion of absolute to relative sizes'
        self.do_test('#p1 {font: 24pt} b {font: 12pt} .it {font: 48pt} #p2 {font: 100%}')

    def testResizing(self):
        'Test resizing of fonts'
        self.do_test('#longest {font: 24pt} .it {font:20pt; line-height:22pt}')
def suite():
    'Return a TestSuite containing every test defined on :class:`FontTest`.'
    loader = unittest.TestLoader()
    return loader.loadTestsFromTestCase(FontTest)
def test():
    '''
    Run the font rationalization test suite and return a process exit code.

    Returns 0 when all tests pass and 1 otherwise, so that
    ``sys.exit(test())`` in the __main__ guard reports failures to the
    shell. Previously the runner's result was discarded and the function
    returned None, making the exit status always 0 even on failure.
    '''
    result = unittest.TextTestRunner(verbosity=2).run(suite())
    return 0 if result.wasSuccessful() else 1
if __name__ == '__main__':
    # Allow running this module directly to execute the unit tests.
    sys.exit(test())

View File

@ -32,8 +32,7 @@ Conversion of HTML/OPF files follows several stages:
* The EPUB container is created.
'''
import os, sys, re, cStringIO, logging
from contextlib import nested
import os, sys, cStringIO, logging
from lxml.etree import XPath
try:
@ -41,7 +40,7 @@ try:
except ImportError:
import Image as PILImage
from calibre.ebooks.html import Processor, get_text, merge_metadata, get_filelist,\
from calibre.ebooks.html import Processor, merge_metadata, get_filelist,\
opf_traverse, create_metadata, rebase_toc
from calibre.ebooks.epub import config as common_config
from calibre.ptempfile import TemporaryDirectory
@ -50,21 +49,23 @@ from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.epub import initialize_container, PROFILES
from calibre.ebooks.epub.split import split
from calibre.ebooks.epub.fonts import Rationalizer
from calibre.constants import preferred_encoding
class HTMLProcessor(Processor):
class HTMLProcessor(Processor, Rationalizer):
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles):
def __init__(self, htmlfile, opts, tdir, resource_map, htmlfiles, stylesheets):
Processor.__init__(self, htmlfile, opts, tdir, resource_map, htmlfiles,
name='html2epub')
name='html2epub')
if opts.verbose > 2:
self.debug_tree('parsed')
self.detect_chapters()
self.extract_css()
self.relativize_font_sizes()
self.extract_css(stylesheets)
if self.opts.base_font_size2 > 0:
self.font_css = self.rationalize(self.external_stylesheets+[self.stylesheet],
self.root, self.opts)
if opts.verbose > 2:
self.debug_tree('nocss')
@ -73,19 +74,6 @@ class HTMLProcessor(Processor):
meta.getparent().remove(meta)
Processor.save(self)
#self.collect_font_statistics()
def collect_font_statistics(self):
'''
Collect font statistics to figure out the base font size used in this
HTML document.
'''
self.font_statistics = {} #: A mapping of font size (in pts) to number of characters rendered at that font size
for text in get_text(self.body if self.body is not None else self.root):
length, parent = len(re.sub(r'\s+', '', text)), text.getparent()
#TODO: Use cssutils on self.raw_css to figure out the font size
# of this piece of text and update statistics accordingly
@ -104,21 +92,30 @@ the <spine> element of the OPF file.
def parse_content(filelist, opts, tdir):
os.makedirs(os.path.join(tdir, 'content', 'resources'))
resource_map = {}
resource_map, stylesheets = {}, {}
toc = TOC(base_path=tdir, type='root')
stylesheet_map = {}
for htmlfile in filelist:
logging.getLogger('html2epub').debug('Processing %s...'%htmlfile)
hp = HTMLProcessor(htmlfile, opts, os.path.join(tdir, 'content'),
resource_map, filelist)
resource_map, filelist, stylesheets)
hp.populate_toc(toc)
hp.save()
stylesheet_map[os.path.basename(hp.save_path())] = \
[s for s in hp.external_stylesheets + [hp.stylesheet, hp.font_css, hp.override_css] if s is not None]
logging.getLogger('html2epub').debug('Saving stylesheets...')
if opts.base_font_size2 > 0:
Rationalizer.remove_font_size_information(stylesheets.values())
for path, css in stylesheets.items():
open(path, 'wb').write(getattr(css, 'cssText', css).encode('utf-8'))
if toc.count('chapter') > opts.toc_threshold:
toc.purge(['file', 'link', 'unknown'])
if toc.count('chapter') + toc.count('file') > opts.toc_threshold:
toc.purge(['link', 'unknown'])
toc.purge(['link'], max=opts.max_toc_links)
return resource_map, hp.htmlfile_map, toc
return resource_map, hp.htmlfile_map, toc, stylesheet_map
def resize_cover(im, opts):
width, height = im.size
@ -176,7 +173,7 @@ def process_title_page(mi, filelist, htmlfilemap, opts, tdir):
<title>Cover</title>
<style type="text/css">@page {padding: 0pt; margin:0pt}</style>
</head>
<body style="padding: 0pt; margin: 0pt;}">
<body style="padding: 0pt; margin: 0pt">
<div style="text-align:center">
<img style="text-align: center" src="%s" alt="cover" />
</div>
@ -212,11 +209,22 @@ def convert(htmlfile, opts, notification=None):
mi = merge_metadata(htmlfile, opf, opts)
opts.chapter = XPath(opts.chapter,
namespaces={'re':'http://exslt.org/regular-expressions'})
if opts.level1_toc:
opts.level1_toc = XPath(opts.level1_toc,
namespaces={'re':'http://exslt.org/regular-expressions'})
else:
opts.level1_toc = None
if opts.level2_toc:
opts.level2_toc = XPath(opts.level2_toc,
namespaces={'re':'http://exslt.org/regular-expressions'})
else:
opts.level2_toc = None
with TemporaryDirectory(suffix='_html2epub', keep=opts.keep_intermediate) as tdir:
if opts.keep_intermediate:
print 'Intermediate files in', tdir
resource_map, htmlfile_map, generated_toc = parse_content(filelist, opts, tdir)
resource_map, htmlfile_map, generated_toc, stylesheet_map = \
parse_content(filelist, opts, tdir)
logger = logging.getLogger('html2epub')
resources = [os.path.join(tdir, 'content', f) for f in resource_map.values()]
@ -235,6 +243,10 @@ def convert(htmlfile, opts, notification=None):
rebase_toc(mi.toc, htmlfile_map, tdir)
if opts.use_auto_toc or mi.toc is None or len(list(mi.toc.flat())) < 2:
mi.toc = generated_toc
if opts.from_ncx:
toc = TOC()
toc.read_ncx_toc(opts.from_ncx)
mi.toc = toc
for item in mi.manifest:
if getattr(item, 'mime_type', None) == 'text/html':
item.mime_type = 'application/xhtml+xml'
@ -247,7 +259,7 @@ def convert(htmlfile, opts, notification=None):
f.write(toc)
if opts.show_ncx:
print toc
split(opf_path, opts)
split(opf_path, opts, stylesheet_map)
opf = OPF(opf_path, tdir)
opf.remove_guide()
if has_title_page:

View File

@ -12,10 +12,9 @@ import os, math, logging, functools, collections, re, copy
from lxml.etree import XPath as _XPath
from lxml import etree, html
from lxml.cssselect import CSSSelector
from cssutils import CSSParser
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.epub import tostring
from calibre.ebooks.epub import tostring, rules
from calibre import CurrentDir, LoggingInterface
XPath = functools.partial(_XPath, namespaces={'re':'http://exslt.org/regular-expressions'})
@ -35,7 +34,7 @@ class SplitError(ValueError):
class Splitter(LoggingInterface):
def __init__(self, path, opts, always_remove=False):
def __init__(self, path, opts, stylesheet_map, always_remove=False):
LoggingInterface.__init__(self, logging.getLogger('htmlsplit'))
self.setup_cli_handler(opts.verbose)
self.path = path
@ -46,22 +45,8 @@ class Splitter(LoggingInterface):
self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.)
root = html.fromstring(open(content(path)).read())
css = XPath('//link[@type = "text/css" and @rel = "stylesheet"]')(root)
if css:
cssp = os.path.join('content', *(css[0].get('href').split('/')))
self.log_debug('\t\tParsing stylesheet...')
try:
stylesheet = CSSParser().parseString(open(cssp, 'rb').read())
except:
self.log_warn('Failed to parse CSS. Splitting on page-breaks is disabled')
if self.opts.verbose > 1:
self.log_exception('')
stylesheet = None
else:
stylesheet = None
self.page_breaks = []
if stylesheet is not None:
self.find_page_breaks(stylesheet, root)
self.find_page_breaks(stylesheet_map[self.path], root)
self.trees = []
self.split_size = 0
@ -189,14 +174,12 @@ class Splitter(LoggingInterface):
self.split(t)
def find_page_breaks(self, stylesheet, root):
def find_page_breaks(self, stylesheets, root):
'''
Find all elements that have either page-break-before or page-break-after set.
'''
page_break_selectors = set([])
for rule in stylesheet:
if rule.type != rule.STYLE_RULE:
continue
for rule in rules(stylesheets):
before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower()
after = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower()
try:
@ -385,7 +368,7 @@ def fix_ncx(path, changes):
if changed:
open(path, 'wb').write(etree.tostring(tree.getroot(), encoding='UTF-8', xml_declaration=True))
def split(pathtoopf, opts):
def split(pathtoopf, opts, stylesheet_map):
pathtoopf = os.path.abspath(pathtoopf)
with CurrentDir(os.path.dirname(pathtoopf)):
opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
@ -403,7 +386,7 @@ def split(pathtoopf, opts):
for f in html_files:
if os.stat(content(f)).st_size > opts.profile.flow_size:
try:
changes.append(Splitter(f, opts,
changes.append(Splitter(f, opts, stylesheet_map,
always_remove=(always_remove or \
os.stat(content(f)).st_size > 5*opts.profile.flow_size)))
except (SplitError, RuntimeError):

View File

@ -8,12 +8,14 @@ Code to recursively parse HTML files and create an open ebook in a specified
directory or zip file. All the action starts in :function:`create_dir`.
'''
import sys, re, os, shutil, logging, tempfile, cStringIO
import sys, re, os, shutil, logging, tempfile, cStringIO, operator, functools
from urlparse import urlparse
from urllib import unquote
from lxml import html, etree
from lxml.html import soupparser
from lxml import etree
from lxml.html import HtmlElementClassLookup, HTMLParser as _HTMLParser, \
fromstring as _fromstring, tostring as _tostring, \
soupparser, HtmlElement
from lxml.etree import XPath
get_text = XPath("//text()")
@ -25,9 +27,67 @@ from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
from calibre.utils.zipfile import ZipFile
from cssutils import CSSParser
class HTMLElement(HtmlElement):
    '''
    lxml HTML element extended with font size bookkeeping. Sizes are stored
    in element attributes (so they survive deepcopy and serialization);
    relative sizes are stored as callables serialized with an 'f' prefix
    followed by the multiplication factor.
    '''

    # Python 2 @apply idiom: the function is invoked immediately and the
    # name is bound to the property object it returns.
    @apply
    def specified_font_size():
        def fget(self):
            ans = self.get('specified_font_size', '')
            if not ans:
                # No specified size: identity function, i.e. inherit the
                # value passed in (the parent's computed size) unchanged.
                return lambda x: x
            if ans.startswith('f'):
                # 'f<factor>': relative size, multiplies the parent size.
                return functools.partial(operator.mul, float(ans[1:]))
            return float(ans)
        def fset(self, val):
            # Callables are serialized as 'f' + factor; val(1) recovers the
            # factor from a multiplier callable.
            self.set('specified_font_size', ('f'+repr(val(1))) if callable(val) else repr(val))
        return property(fget=fget, fset=fset)

    @apply
    def computed_font_size():
        def fget(self):
            ans = self.get('computed_font_size', '')
            if ans == '':
                return None  # not yet computed
            return float(ans)
        def fset(self, val):
            self.set('computed_font_size', repr(val))
        return property(fget=fget, fset=fset)

    def remove_font_size_information(self):
        '''Strip all font size bookkeeping attributes from this subtree.'''
        for elem in self.iter():
            for p in ('computed', 'specified'):
                elem.attrib.pop(p+'_font_size', None)

    def getpath(self):
        '''Return the absolute XPath of this element within its tree.'''
        return self.getroottree().getpath(self)
class Lookup(HtmlElementClassLookup):
    '''
    Element class lookup that maps every HTML element node to
    :class:`HTMLElement`; all other node types use the default lxml lookup.
    '''

    def lookup(self, node_type, document, namespace, name):
        # Only element nodes get our subclass; defer everything else
        # (comments, PIs, entities) to the base implementation.
        if node_type != 'element':
            return HtmlElementClassLookup.lookup(self, node_type, document, namespace, name)
        return HTMLElement
class HTMLParser(_HTMLParser):
    '''
    lxml HTML parser wired to :class:`Lookup`, so every parsed tree yields
    :class:`HTMLElement` nodes (with the font size property extensions)
    instead of plain HtmlElement instances.
    '''

    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        self.set_element_class_lookup(Lookup())
# Shared module-level parser instance used by fromstring() below, so all
# parses produce HTMLElement trees.
parser = HTMLParser()

def fromstring(raw, **kw):
    '''Parse HTML from ``raw`` using the HTMLElement-aware parser above.'''
    return _fromstring(raw, parser=parser, **kw)
def tostring(root, pretty_print=False):
return html.tostring(root, encoding='utf-8', method='xml',
return _tostring(root, encoding='utf-8', method='xml',
include_meta_content_type=True,
pretty_print=pretty_print)
@ -372,11 +432,11 @@ class Parser(PreProcessor, LoggingInterface):
for pat in ENCODING_PATS:
src = pat.sub('', src)
try:
self.root = html.fromstring(src)
self.root = fromstring(src)
except:
if self.opts.verbose:
self.log_exception('lxml based parsing failed')
self.root = soupparser.fromstring(src)
self.root = soupparser.fromstring(src, makeelement=parser.makeelement)
head = self.root.xpath('./head')
if head:
head = head[0]
@ -402,7 +462,7 @@ class Parser(PreProcessor, LoggingInterface):
os.makedirs(tdir)
with open(os.path.join(tdir, '%s-%s.html'%\
(os.path.basename(self.htmlfile.path), name)), 'wb') as f:
f.write(html.tostring(self.root, encoding='utf-8'))
f.write(tostring(self.root, encoding='utf-8'))
self.log_debug(_('Written processed HTML to ')+f.name)
@ -443,19 +503,21 @@ class Processor(Parser):
'''
LINKS_PATH = XPath('//a[@href]')
PIXEL_PAT = re.compile(r'([-]?\d+|[-]?\d*\.\d+)px')
def __init__(self, *args, **kwargs):
Parser.__init__(self, *args, **kwargs)
temp = LoggingInterface(logging.getLogger('cssutils'))
temp.setup_cli_handler(self.opts.verbose)
self.css_parser = CSSParser(log=temp.logger, loglevel=logging.ERROR)
self.stylesheet = self.font_css = self.override_css = None
def detect_chapters(self):
self.detected_chapters = self.opts.chapter(self.root)
for elem in self.detected_chapters:
text = u' '.join([t.strip() for t in elem.xpath('descendant::text()')])
self.log_info('\tDetected chapter: %s', text[:50])
if self.opts.chapter_mark in ('both', 'pagebreak'):
style = elem.get('style', '').strip()
if style and not style.endswith(';'):
style += '; '
style += 'page-break-before: always'
elem.set('style', style)
if self.opts.chapter_mark in ('both', 'rule'):
if self.opts.chapter_mark != 'none':
hr = etree.Element('hr')
if elem.getprevious() is None:
elem.getparent()[:0] = [hr]
@ -466,16 +528,28 @@ class Processor(Parser):
insert = i
break
elem.getparent()[insert:insert] = [hr]
if self.opts.chapter_mark != 'rule':
hr.set('style', 'width:0pt;page-break-before:always')
if self.opts.chapter_mark == 'both':
hr2 = etree.Element('hr')
hr2.tail = u'\u00a0'
p = hr.getparent()
i = p.index(hr)
p[i:i] = [hr2]
def save(self):
style_path = os.path.basename(self.save_path())+'.css'
style = etree.SubElement(self.head, 'link', attrib={'type':'text/css', 'rel':'stylesheet',
'href':'resources/'+style_path,
'charset':'UTF-8'})
style.tail = '\n'
style_path = os.path.join(os.path.dirname(self.save_path()), 'resources', style_path)
open(style_path, 'wb').write(self.css.encode('utf-8'))
style_path = os.path.splitext(os.path.basename(self.save_path()))[0]
for i, sheet in enumerate([self.stylesheet, self.font_css, self.override_css]):
if sheet is not None:
style = etree.SubElement(self.head, 'link', attrib={'type':'text/css', 'rel':'stylesheet',
'href':'resources/%s_%d.css'%(style_path, i),
'charset':'UTF-8'})
style.tail = '\n'
path = os.path.join(os.path.dirname(self.save_path()), *(style.get('href').split('/')))
self.resource_map[path] = style.get('href')
open(path, 'wb').write(getattr(sheet, 'cssText', sheet).encode('utf-8'))
return Parser.save(self)
def populate_toc(self, toc):
@ -491,14 +565,45 @@ class Processor(Parser):
text = text[:50] + u'\u2026'
return target.add_item(href, fragment, text, type=type)
# Add chapters to TOC
name = self.htmlfile_map[self.htmlfile.path]
href = 'content/'+name
# Add level 1 and level 2 TOC items
counter = 0
if self.opts.level1_toc is not None:
level1 = self.opts.level1_toc(self.root)
if level1:
added = {}
for elem in level1:
text = (u''.join(elem.xpath('string()'))).strip()
if text:
id = elem.get('id', 'calibre_chapter_%d'%counter)
counter += 1
elem.set('id', id)
added[elem] = add_item(href, id, text, toc, type='chapter')
add_item(href, id, 'Top', added[elem], type='chapter')
if self.opts.level2_toc is not None:
level2 = list(self.opts.level2_toc(self.root))
for elem in level2:
level1 = None
for item in self.root.iterdescendants():
if item in added.keys():
level1 = added[item]
elif item == elem and level1 is not None:
text = (u''.join(elem.xpath('string()'))).strip()
if text:
id = elem.get('id', 'calibre_chapter_%d'%counter)
counter += 1
elem.set('id', id)
add_item(href, id, text, level1, type='chapter')
# Add chapters to TOC
if not self.opts.no_chapters_in_toc:
for elem in getattr(self, 'detected_chapters', []):
text = (u''.join(elem.xpath('string()'))).strip()
if text:
name = self.htmlfile_map[self.htmlfile.path]
href = 'content/'+name
counter += 1
id = elem.get('id', 'calibre_chapter_%d'%counter)
elem.set('id', id)
@ -518,8 +623,7 @@ class Processor(Parser):
pass
name = self.htmlfile_map[self.htmlfile.path]
href = 'content/'+name
if referrer.href != href: # Happens for root file
@ -541,13 +645,24 @@ class Processor(Parser):
name = self.htmlfile_map[self.htmlfile.referrer.path]
add_item(href, fragment, text, target)
@classmethod
def preprocess_css(cls, css, dpi=96):
def rescale(match):
val = match.group(1)
try:
val = float(val)
except ValueError:
return ''
return '%fpt'%(72 * val/dpi)
def extract_css(self):
return cls.PIXEL_PAT.sub(rescale, css)
def extract_css(self, parsed_sheets):
'''
Remove all CSS information from the document and store in self.raw_css.
This includes <font> tags.
Remove all CSS information from the document and store it as
:class:`StyleSheet` objects.
'''
def get_id(chapter, counter, prefix='calibre_css_'):
new_id = '%s_%d'%(prefix, counter)
if chapter.tag.lower() == 'a' and 'name' in chapter.keys():
@ -562,17 +677,40 @@ class Processor(Parser):
chapter.set('id', id)
return id
css = []
self.external_stylesheets, self.stylesheet = [], self.css_parser.parseString('')
for link in self.root.xpath('//link'):
if 'css' in link.get('type', 'text/css').lower():
file = os.path.join(self.tdir, link.get('href', ''))
if file and os.path.exists(file) and os.path.isfile(file):
css.append(open(file, 'rb').read().decode('utf-8'))
link.getparent().remove(link)
file = os.path.join(self.tdir, *(link.get('href', '').split('/')))
if file and not 'http:' in file:
if not parsed_sheets.has_key(file):
try:
self.log_info('Processing stylesheet %s...'%file)
css = self.preprocess_css(open(file).read())
except (IOError, OSError):
self.log_error('Failed to open stylesheet: %s'%file)
else:
try:
parsed_sheets[file] = self.css_parser.parseString(css)
except:
parsed_sheets[file] = css.decode('utf8', 'replace')
self.log_warning('Failed to parse stylesheet: %s'%file)
if self.opts.verbose > 1:
self.log_exception('')
if parsed_sheets.has_key(file):
self.external_stylesheets.append(parsed_sheets[file])
for style in self.root.xpath('//style'):
if 'css' in style.get('type', 'text/css').lower():
css.append('\n'.join(style.xpath('./text()')))
raw = '\n'.join(style.xpath('./text()'))
css = self.preprocess_css(raw)
try:
sheet = self.css_parser.parseString(css)
except:
self.log_debug('Failed to parse style element')
else:
for rule in sheet:
self.stylesheet.add(rule)
style.getparent().remove(style)
cache = {}
@ -613,57 +751,19 @@ class Processor(Parser):
elem.set('class', cn)
elem.attrib.pop('style')
for setting, cn in cache.items():
css.append('.%s {%s}'%(cn, setting))
self.raw_css = '\n\n'.join(css)
self.css = unicode(self.raw_css)
css = '\n'.join(['.%s {%s;}'%(cn, setting) for \
setting, cn in cache.items()])
self.stylesheet = self.css_parser.parseString(self.preprocess_css(css))
css = ''
if self.opts.override_css:
self.css += '\n\n'+self.opts.override_css
self.do_layout()
# TODO: Figure out what to do about CSS imports from linked stylesheets
def relativize_font_sizes(self, dpi=100, base=16):
'''
Convert all absolute font sizes to percentages of ``base`` using ``dpi``
to convert from screen to paper units.
:param base: Base size in pixels. Adobe DE seems to need base size to be 16
irrespective of the unit of the length being converted
:param dpi: Dots per inch used to convert pixels to absolute lengths. Since
most HTML files are created on computers with monitors of DPI ~ 100, we use
100 by default.
'''
size_value_pat = re.compile(r'(?<!/)(?P<num>[0-9.]+)(?P<unit>cm|mm|in|pt|pc|px)', re.I)
css += '\n\n' + self.opts.override_css
css += '\n\n' + 'body {margin-top: 0pt; margin-bottom: 0pt; margin-left: 0pt; margin-right: 0pt;}'
css += '\n\n@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}'%(self.opts.margin_top, self.opts.margin_bottom, self.opts.margin_left, self.opts.margin_right)
if self.opts.remove_paragraph_spacing:
css += '\n\np {text-indent: 2.1em; margin-top:1pt; margin-bottom:1pt; padding:0pt; border:0pt;}'
self.override_css = self.css_parser.parseString(self.preprocess_css(css))
# points per unit
ptu = { # Convert to pt
'px' : 72./dpi,
'pt' : 1.0,
'pc' : 1/12.,
'in' : 72.,
'cm' : 72/2.54,
'mm' : 72/25.4,
}
def relativize(match):
val = float(match.group('num'))
unit = match.group('unit').lower()
val *= ptu[unit]
return '%.1f%%'%((val/base) * 100)
def sub(match):
rule = match.group(1)
value = size_value_pat.sub(relativize, match.group(2))
return '%s : %s'%(rule, value)
self.css = re.compile(r'(font|font-size)\s*:\s*([^;]+)', re.I).sub(sub, self.css)
def do_layout(self):
    '''
    Append page-layout CSS derived from the conversion options: zero body
    margins with the requested base font size (as a percentage), and an
    @page rule carrying the user-specified margins in points.
    '''
    opts = self.opts
    body_rule = '\nbody {margin-top: 0pt; margin-bottom: 0pt; margin-left: 0pt; margin-right: 0pt; font-size: %f%%}\n' % opts.base_font_size
    page_rule = '@page {margin-top: %fpt; margin-bottom: %fpt; margin-left: %fpt; margin-right: %fpt}\n' % (
            opts.margin_top, opts.margin_bottom, opts.margin_left, opts.margin_right)
    self.css += body_rule + page_rule
def config(defaults=None, config_name='html',
desc=_('Options to control the traversal of HTML')):
if defaults is None:

View File

@ -17,6 +17,7 @@ from calibre.ebooks.epub.from_any import SOURCE_FORMATS, config
from calibre.ebooks.metadata import MetaInformation
from calibre.ptempfile import PersistentTemporaryFile
from calibre.ebooks.metadata.opf import OPFCreator
from lxml.etree import XPath
class Config(QDialog, Ui_Dialog):
@ -234,6 +235,16 @@ class Config(QDialog, Ui_Dialog):
self.source_format = d.format()
def accept(self):
for opt in ('chapter', 'level1_toc', 'level2_toc'):
text = unicode(getattr(self, 'opt_'+opt).text())
if text:
try:
XPath(text,namespaces={'re':'http://exslt.org/regular-expressions'})
except Exception, err:
error_dialog(self, _('Invalid XPath expression'),
_('The expression %s is invalid. Error: %s')%(text, err)
).exec_()
return
mi = self.get_metadata()
self.read_settings()
self.cover_file = None

View File

@ -77,7 +77,7 @@
<item>
<widget class="QStackedWidget" name="stack" >
<property name="currentIndex" >
<number>1</number>
<number>3</number>
</property>
<widget class="QWidget" name="metadata_page" >
<layout class="QGridLayout" name="gridLayout_4" >
@ -416,29 +416,36 @@
<string>Base &amp;font size:</string>
</property>
<property name="buddy" >
<cstring>opt_base_font_size</cstring>
<cstring>opt_base_font_size2</cstring>
</property>
</widget>
</item>
<item row="1" column="2" >
<widget class="QDoubleSpinBox" name="opt_base_font_size" >
<widget class="QDoubleSpinBox" name="opt_base_font_size2" >
<property name="suffix" >
<string> %</string>
<string> pt</string>
</property>
<property name="decimals" >
<number>0</number>
</property>
<property name="minimum" >
<double>10.000000000000000</double>
<double>0.000000000000000</double>
</property>
<property name="maximum" >
<double>500.000000000000000</double>
<double>30.000000000000000</double>
</property>
<property name="singleStep" >
<double>5.000000000000000</double>
<double>1.000000000000000</double>
</property>
<property name="value" >
<double>100.000000000000000</double>
<double>30.000000000000000</double>
</property>
</widget>
</item>
<item row="2" column="0" >
<widget class="QCheckBox" name="opt_remove_paragraph_spacing" >
<property name="text" >
<string>Remove &amp;spacing between paragraphs</string>
</property>
</widget>
</item>
@ -674,6 +681,32 @@ p, li { white-space: pre-wrap; }
</property>
</widget>
</item>
<item row="4" column="1" >
<widget class="QLineEdit" name="opt_level1_toc" />
</item>
<item row="4" column="0" >
<widget class="QLabel" name="label_19" >
<property name="text" >
<string>Level &amp;1 TOC</string>
</property>
<property name="buddy" >
<cstring>opt_level1_toc</cstring>
</property>
</widget>
</item>
<item row="5" column="0" >
<widget class="QLabel" name="label_20" >
<property name="text" >
<string>Level &amp;2 TOC</string>
</property>
<property name="buddy" >
<cstring>opt_level2_toc</cstring>
</property>
</widget>
</item>
<item row="5" column="1" >
<widget class="QLineEdit" name="opt_level2_toc" />
</item>
</layout>
</widget>
</item>

View File

@ -295,6 +295,11 @@ complete -o nospace -F _prs500 prs500
''')
f.close()
print 'done'
except TypeError, err:
if 'resolve_entities' in str(err):
print 'You need python-lxml >= 2.0.5 for calibre'
sys.exit(1)
raise
except:
if fatal_errors:
raise

View File

@ -45,7 +45,7 @@ class Distribution(object):
INSTALLERS = ('emerge -avn', 'apt-get install', 'yum install')
AS_ROOT = (True, False, True)
TITLEMAP = {'gentoo':'Gentoo', 'ubuntu':'Ubuntu Interpid Ibex',
TITLEMAP = {'gentoo':'Gentoo', 'ubuntu':'Ubuntu Intrepid Ibex',
'fedora':'Fedora 10', 'debian':'Debian sid', 'generic': 'Install from source'}
MANUAL_MAP = {

View File

@ -5,7 +5,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Contains the logic for parsing feeds.
'''
import time, logging, traceback
import time, logging, traceback, copy
from datetime import datetime
from calibre.web.feeds.feedparser import parse
@ -17,7 +17,7 @@ class Article(object):
def __init__(self, id, title, url, summary, published, content):
self.downloaded = False
self.id = id
self.title = title
self.title = title.strip() if title else title
self.url = url
self.summary = summary
self.content = content
@ -38,7 +38,14 @@ Has content : %s
def __str__(self):
return repr(self)
def is_same_as(self, other_article):
    '''
    Two articles are considered the same when their URLs match; if this
    article has no URL, fall back to comparing content. Titles are not
    compared, since the same article may be syndicated under different
    titles.
    '''
    if self.url:
        other_url = getattr(other_article, 'url', False)
        return self.url == other_url
    other_content = getattr(other_article, 'content', False)
    return self.content == other_content
class Feed(object):
@ -169,7 +176,72 @@ class Feed(object):
len(a.summary if a.summary else ''))
return length > 2000 * len(self)
def has_article(self, article):
    '''Return True if an article equivalent to *article* is already in this feed.'''
    return any(a.is_same_as(article) for a in self)
def find(self, article):
    '''Return the index of the first article equivalent to *article*, or -1 if absent.'''
    matches = (pos for pos, candidate in enumerate(self)
               if candidate.is_same_as(article))
    return next(matches, -1)
def remove(self, article):
    '''
    Remove the first article equivalent to *article* from this feed,
    doing nothing if no such article is present.
    '''
    # Bug fix: the original called self.index(article), but Feed is not a
    # list subclass and the -1 sentinel guard below matches the contract of
    # find() (defined above), which returns -1 when the article is absent.
    i = self.find(article)
    if i > -1:
        self.articles[i:i+1] = []
class FeedCollection(list):
    # A list of Feed objects with cross-feed duplicate articles removed.
    # Empty feeds are dropped. Each removed duplicate is remembered so that
    # restore_duplicates() can later re-insert it as a link to the first
    # occurrence that was actually downloaded.

    def __init__(self, feeds):
        list.__init__(self, [f for f in feeds if len(f.articles) > 0])
        found_articles = set([])
        duplicates = set([])

        # Linear membership scan using Article.is_same_as, because articles
        # do not hash on their identity criteria (URL/content).
        def in_set(s, a):
            for x in s:
                if a.is_same_as(x):
                    return x
            return None

        print '#feeds', len(self)
        print map(len, self)

        for f in self:
            dups = []
            for a in f:
                first = in_set(found_articles, a)
                if first is not None:
                    # Duplicate of an article seen earlier: record the
                    # (original, containing feed) pair for later restoration.
                    dups.append(a)
                    duplicates.add((first, f))
                else:
                    found_articles.add(a)
            # Remove duplicates only after iterating, to avoid mutating the
            # feed while it is being traversed.
            for x in dups:
                f.articles.remove(x)

        self.duplicates = duplicates
        print len(duplicates)
        print map(len, self)
        #raise

    def find_article(self, article):
        # Locate *article* by identity and return (feed_index, article_index),
        # or None (implicitly) if it is not present in any feed.
        for j, f in enumerate(self):
            for i, a in enumerate(f):
                if a is article:
                    return (j, i)

    def restore_duplicates(self):
        # Re-append each removed duplicate to its feed as a deep copy whose
        # URL points at the already-downloaded first occurrence.
        temp = []
        for article, feed in self.duplicates:
            art = copy.deepcopy(article)
            j, i = self.find_article(article)
            art.url = '../feed_%d/article_%d/index.html'%(j, i)
            temp.append((feed, art))
        # Appending is deferred so the lookups above see unmodified feeds.
        for feed, art in temp:
            feed.articles.append(art)
def feed_from_xml(raw_xml, title=None, oldest_article=7,
max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)):

View File

@ -289,15 +289,16 @@ class BasicNewsRecipe(object, LoggingInterface):
'''
return soup
def postprocess_html(self, soup):
def postprocess_html(self, soup, first_fetch):
'''
This method is called with the source of each downloaded :term:`HTML` file, after
it is parsed for links and images.
It can be used to do arbitrarily powerful post-processing on the :term:`HTML`.
It should return `soup` after processing it.
`soup`: A `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
:param soup: A `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
instance containing the downloaded :term:`HTML`.
:param first_fetch: True if this is the first page of an article.
'''
return soup
@ -482,7 +483,7 @@ class BasicNewsRecipe(object, LoggingInterface):
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(0, elem)
return self.postprocess_html(soup)
return self.postprocess_html(soup, first_fetch)
def download(self):

View File

@ -67,7 +67,7 @@ class ESPN(BasicNewsRecipe):
return soup
def postprocess_html(self, soup):
def postprocess_html(self, soup, first_fetch):
    '''Left-align every <div> that carries an inline style, since ESPN centres content.'''
    for tag in soup.findAll('div', style=True):
        css = tag['style']
        tag['style'] = css.replace('center', 'left')
    return soup

View File

@ -92,7 +92,7 @@ class Newsweek(BasicNewsRecipe):
return sections
def postprocess_html(self, soup):
def postprocess_html(self, soup, first_fetch):
divs = list(soup.findAll('div', 'pagination'))
if not divs:
return

View File

@ -73,7 +73,7 @@ class OutlookIndia(BasicNewsRecipe):
return feeds
def postprocess_html(self, soup):
def postprocess_html(self, soup, first_fetch):
bad = []
for table in soup.findAll('table'):
if table.find(text=re.compile(r'\(\d+ of \d+\)')):

View File

@ -7,14 +7,16 @@ __docformat__ = 'restructuredtext en'
sciam.com
'''
import re
from lxml import html
from calibre.web.feeds.news import BasicNewsRecipe
class ScientificAmerican(BasicNewsRecipe):
title = u'Scientific American'
description = u'Popular science'
description = u'Popular science. Monthly magazine.'
__author__ = 'Kovid Goyal'
oldest_article = 30
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
remove_tags_before = dict(name='div', attrs={'class':'headline'})
remove_tags_after = dict(id='article')
@ -26,25 +28,102 @@ class ScientificAmerican(BasicNewsRecipe):
html2lrf_options = ['--base-font-size', '8']
recursions = 1
match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14)']
feeds = [
(u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'),
(u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'),
(u'Health', u'http://rss.sciam.com/sciam/health'),
(u'Space', u'http://rss.sciam.com/sciam/space'),
(u'Technology', u'http://rss.sciam.com/sciam/technology'),
(u'Biology', u'http://rss.sciam.com/sciam/biology'),
(u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'),
(u"What's Next", u'http://rss.sciam.com/sciam/whats-next'),
(u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'),
(u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'),
(u'Math', u'http://rss.sciam.com/sciam/math'),
(u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'),
(u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'),
(u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog')
]
# feeds = [
# (u'Latest News', u'http://rss.sciam.com/ScientificAmerican-News'),
# (u'Global', u'http://rss.sciam.com/ScientificAmerican-Global'),
# (u'Health', u'http://rss.sciam.com/sciam/health'),
# (u'Space', u'http://rss.sciam.com/sciam/space'),
# (u'Technology', u'http://rss.sciam.com/sciam/technology'),
# (u'Biology', u'http://rss.sciam.com/sciam/biology'),
# (u'Mind & Brain', u'http://rss.sciam.com/sciam/mind-and-brain'),
# (u"What's Next", u'http://rss.sciam.com/sciam/whats-next'),
# (u'Archeology and Paleontology', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=archaeology-and-paleontology'),
# (u'Physics', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=physics'),
# (u'Math', u'http://rss.sciam.com/sciam/math'),
# (u'History of Science', u'http://www.sciam.com/page.cfm?section=rsscategory&alias=history-of-science'),
# (u'Chemistry', u'http://rss.sciam.com/sciam/chemistry'),
# (u'Mind Matters', u'http://rss.sciam.com/ScientificAmerican-MindBlog')
# ]
#
def parse_index(self):
    # Build the download list from the current issue's magazine page rather
    # than the RSS feeds, so that only articles in the current issue are
    # fetched. Returns a list of (section_title, articles) tuples in the
    # format expected by BasicNewsRecipe.
    src = self.browser.open('http://www.sciam.com/sciammag/').read()
    root = html.fromstring(src)
    # The cover is the only <img> whose src contains "cover_" (matched via
    # the EXSLT regular-expressions extension).
    self.cover_url = root.xpath('//img[re:match(@src, "cover_")]',
            namespaces={'re':'http://exslt.org/regular-expressions'}
            )[0].get('src')
    # Show the issue month as the timestamp string of the generated ebook.
    self.timefmt = root.xpath('//div[@id = "magazine-month"]')[0].text
    feeds = []

    # Feature articles: links whose title attribute is "Feature".
    features = []
    for a in root.xpath('//a[@href and @title = "Feature"]'):
        if not a.text.strip():
            continue
        article = {
            'url'         : a.get('href'),
            'title'       : u''.join(a.xpath('./text()')),
            'date'        : '',
            'description' : '',
        }
        # Sibling <span class="sub"> elements carry the article blurb.
        for s in a.itersiblings('span'):
            if s.get('class', '') == 'sub':
                article['description'] += u''.join(s.xpath('./text()')) + ' '
        features.append(article)
    if features:
        feeds.append(('Features', features))

    # Department articles: links with class="title"; their description is the
    # remaining text of the parent element after the link itself is removed.
    departments = []
    for a in root.xpath('//a[@href and @class="title"]'):
        txt = u''.join(a.xpath('./text()')).strip()
        if not txt:
            continue
        article = {
            'url'         : a.get('href'),
            'title'       : txt,
            'date'        : '',
            'description' : '',
        }
        p = a.getparent()
        p.remove(a)
        article['description'] = u''.join(p.xpath('./text()'))
        departments.append(article)
    feeds.append(('Departments', departments))

    # Opinion pieces live under <div id="opinion">.
    opinion = []
    for a in root.xpath('//div[@id = "opinion"]//a[@href]'):
        txt = u''.join(a.xpath('./text()')).strip()
        if not txt:
            continue
        article = {
            'url'         : a.get('href'),
            'title'       : txt,
            'date'        : '',
            'description' : '',
        }
        opinion.append(article)
    feeds.append(('Opinion', opinion))

    # Web-only extras live under <div id="ontheweb">.
    ontheweb = []
    for a in root.xpath('//div[@id = "ontheweb"]//a[@href]'):
        txt = u''.join(a.xpath('./text()')).strip()
        if not txt:
            continue
        article = {
            'url'         : a.get('href'),
            'title'       : txt,
            'date'        : '',
            'description' : '',
        }
        ontheweb.append(article)
    feeds.append(('On the web', ontheweb))

    return feeds
def postprocess_html(self, soup):
def postprocess_html(self, soup, first_fetch):
    '''
    Strip pagination controls from every page and, on continuation pages
    of a multi-page article, the repeated headline block.
    '''
    if soup is None:
        return soup
    for pagination in soup.findAll('span', attrs={'class':'pagination'}):
        pagination.extract()
    if not first_fetch:
        headline = soup.find('div', attrs={'class':'headline'})
        if headline:
            headline.extract()
    return soup

View File

@ -198,7 +198,7 @@ class RecursiveFetcher(object, LoggingInterface):
try:
f = self.fetch_url(iurl)
except Exception, err:
self.log_warning('Could not fetch stylesheet %s', iurl)
self.log_debug('Could not fetch stylesheet %s', iurl)
self.log_debug('Error: %s', str(err), exc_info=True)
continue
stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')