mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Untested implementation of HTML input. Uses a new transform that 'packages' an OEB book into a folder structure (the same folder structure that was used in the old codebase for EPUB output). This may have broken other thin gs, so use with care.
This commit is contained in:
parent
b2bfab32cf
commit
093b98a9f1
@ -122,8 +122,9 @@ class InputFormatPlugin(Plugin):
|
||||
def convert(self, stream, options, file_ext, log, accelerators):
|
||||
'''
|
||||
This method must be implemented in sub-classes. It must return
|
||||
the path to the created OPF file. All output should be contained in
|
||||
the current directory. If this plugin creates files outside the current
|
||||
the path to the created OPF file or an :class:`OEBBook` instance.
|
||||
All output should be contained in the current directory.
|
||||
If this plugin creates files outside the current
|
||||
directory they must be deleted/marked for deletion before this method
|
||||
returns.
|
||||
|
||||
|
@ -299,21 +299,15 @@ OptionRecommendation(name='language',
|
||||
|
||||
# Create an OEBBook from the input file. The input plugin does all the
|
||||
# heavy lifting.
|
||||
from calibre.ebooks.oeb.reader import OEBReader
|
||||
from calibre.ebooks.oeb.base import OEBBook
|
||||
accelerators = {}
|
||||
|
||||
tdir = PersistentTemporaryDirectory('_plumber')
|
||||
|
||||
opfpath = self.input_plugin(open(self.input, 'rb'), self.opts,
|
||||
self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts,
|
||||
self.input_fmt, self.log,
|
||||
accelerators, tdir)
|
||||
html_preprocessor = HTMLPreProcessor()
|
||||
self.reader = OEBReader()
|
||||
self.oeb = OEBBook(self.log, html_preprocessor=html_preprocessor)
|
||||
# Read OEB Book into OEBBook
|
||||
self.log.info('Parsing all content...')
|
||||
self.reader(self.oeb, opfpath)
|
||||
if not hasattr(self.oeb, 'manifest'):
|
||||
self.oeb = create_oebbook(self.log, self.oeb)
|
||||
|
||||
self.opts.source = self.opts.input_profile
|
||||
self.opts.dest = self.opts.output_profile
|
||||
@ -340,7 +334,20 @@ OptionRecommendation(name='language',
|
||||
trimmer(self.oeb, self.opts)
|
||||
|
||||
self.log.info('Creating %s...'%self.output_plugin.name)
|
||||
self.output_plugin.convert(self.oeb, self.output, self.input_plugin, self.opts,
|
||||
self.log)
|
||||
self.output_plugin.convert(self.oeb, self.output, self.input_plugin,
|
||||
self.opts, self.log)
|
||||
|
||||
def create_oebbook(log, opfpath):
|
||||
'''
|
||||
Create an OEBBook from an OPF file.
|
||||
'''
|
||||
from calibre.ebooks.oeb.reader import OEBReader
|
||||
from calibre.ebooks.oeb.base import OEBBook
|
||||
html_preprocessor = HTMLPreProcessor()
|
||||
reader = OEBReader()
|
||||
oeb = OEBBook(log, html_preprocessor=html_preprocessor)
|
||||
# Read OEB Book into OEBBook
|
||||
log.info('Parsing all content...')
|
||||
reader(oeb, opfpath)
|
||||
return oeb
|
||||
|
||||
|
@ -10,23 +10,23 @@ import sys, textwrap, re, os, uuid
|
||||
from itertools import cycle
|
||||
from calibre.utils.config import Config, StringConfig
|
||||
from calibre.utils.zipfile import ZipFile, ZIP_STORED
|
||||
from calibre.ebooks.html import config as common_config, tostring
|
||||
from calibre.ebooks.html import tostring
|
||||
from lxml import etree
|
||||
|
||||
class DefaultProfile(object):
|
||||
|
||||
|
||||
flow_size = sys.maxint
|
||||
screen_size = None
|
||||
remove_special_chars = False
|
||||
remove_object_tags = False
|
||||
|
||||
|
||||
class PRS505(DefaultProfile):
|
||||
|
||||
|
||||
flow_size = 270000
|
||||
screen_size = (590, 765)
|
||||
remove_special_chars = re.compile(u'[\u200b\u00ad]')
|
||||
remove_object_tags = True
|
||||
|
||||
|
||||
|
||||
PROFILES = {
|
||||
'PRS505' : PRS505,
|
||||
@ -64,11 +64,11 @@ def config(defaults=None, name='epub'):
|
||||
c = Config(name, desc)
|
||||
else:
|
||||
c = StringConfig(defaults, desc)
|
||||
|
||||
|
||||
c.update(common_config())
|
||||
c.remove_opt('output')
|
||||
c.remove_opt('zip')
|
||||
|
||||
|
||||
c.add_opt('output', ['-o', '--output'], default=None,
|
||||
help=_('The output EPUB file. If not specified, it is '
|
||||
'derived from the input file name.'))
|
||||
@ -81,22 +81,22 @@ def config(defaults=None, name='epub'):
|
||||
help=_('Either the path to a CSS stylesheet or raw CSS. '
|
||||
'This CSS will override any existing CSS '
|
||||
'declarations in the source files.'))
|
||||
structure = c.add_group('structure detection',
|
||||
structure = c.add_group('structure detection',
|
||||
_('Control auto-detection of document structure.'))
|
||||
structure('chapter', ['--chapter'],
|
||||
structure('chapter', ['--chapter'],
|
||||
default="//*[re:match(name(), 'h[1-2]') and "
|
||||
"re:test(., 'chapter|book|section|part', 'i')] | "
|
||||
"//*[@class = 'chapter']",
|
||||
help=_('''\
|
||||
An XPath expression to detect chapter titles. The default is to consider <h1> or
|
||||
<h2> tags that contain the words "chapter","book","section" or "part" as chapter titles as
|
||||
well as any tags that have class="chapter".
|
||||
<h2> tags that contain the words "chapter","book","section" or "part" as chapter titles as
|
||||
well as any tags that have class="chapter".
|
||||
The expression used must evaluate to a list of elements. To disable chapter detection,
|
||||
use the expression "/". See the XPath Tutorial in the calibre User Manual for further
|
||||
help on using this feature.
|
||||
''').replace('\n', ' '))
|
||||
structure('chapter_mark', ['--chapter-mark'], choices=['pagebreak', 'rule', 'both', 'none'],
|
||||
default='pagebreak',
|
||||
default='pagebreak',
|
||||
help=_('Specify how to mark detected chapters. A value of '
|
||||
'"pagebreak" will insert page breaks before chapters. '
|
||||
'A value of "rule" will insert a line before chapters. '
|
||||
@ -129,13 +129,13 @@ help on using this feature.
|
||||
help=_('XPath expression to find the name of each page in the '
|
||||
'pagination map relative to its boundary element. '
|
||||
'Default is to number all pages staring with 1.'))
|
||||
toc = c.add_group('toc',
|
||||
toc = c.add_group('toc',
|
||||
_('''\
|
||||
Control the automatic generation of a Table of Contents. If an OPF file is detected
|
||||
and it specifies a Table of Contents, then that will be used rather than trying
|
||||
to auto-generate a Table of Contents.
|
||||
''').replace('\n', ' '))
|
||||
toc('max_toc_links', ['--max-toc-links'], default=50,
|
||||
toc('max_toc_links', ['--max-toc-links'], default=50,
|
||||
help=_('Maximum number of links to insert into the TOC. Set to 0 '
|
||||
'to disable. Default is: %default. Links are only added to the '
|
||||
'TOC if less than the --toc-threshold number of chapters were detected.'))
|
||||
@ -166,15 +166,15 @@ to auto-generate a Table of Contents.
|
||||
help=_('Normally, if the source file already has a Table of Contents, '
|
||||
'it is used in preference to the auto-generated one. '
|
||||
'With this option, the auto-generated one is always used.'))
|
||||
|
||||
|
||||
layout = c.add_group('page layout', _('Control page layout'))
|
||||
layout('margin_top', ['--margin-top'], default=5.0,
|
||||
layout('margin_top', ['--margin-top'], default=5.0,
|
||||
help=_('Set the top margin in pts. Default is %default'))
|
||||
layout('margin_bottom', ['--margin-bottom'], default=5.0,
|
||||
layout('margin_bottom', ['--margin-bottom'], default=5.0,
|
||||
help=_('Set the bottom margin in pts. Default is %default'))
|
||||
layout('margin_left', ['--margin-left'], default=5.0,
|
||||
layout('margin_left', ['--margin-left'], default=5.0,
|
||||
help=_('Set the left margin in pts. Default is %default'))
|
||||
layout('margin_right', ['--margin-right'], default=5.0,
|
||||
layout('margin_right', ['--margin-right'], default=5.0,
|
||||
help=_('Set the right margin in pts. Default is %default'))
|
||||
layout('base_font_size2', ['--base-font-size'], default=12.0,
|
||||
help=_('The base font size in pts. Default is %defaultpt. '
|
||||
@ -195,12 +195,12 @@ to auto-generate a Table of Contents.
|
||||
'This is only neccessary if the HTML files contain CSS that '
|
||||
'uses sibling selectors. Enabling this greatly slows down '
|
||||
'processing of large HTML files.'))
|
||||
|
||||
|
||||
c.add_opt('show_opf', ['--show-opf'], default=False, group='debug',
|
||||
help=_('Print generated OPF file to stdout'))
|
||||
c.add_opt('show_ncx', ['--show-ncx'], default=False, group='debug',
|
||||
help=_('Print generated NCX file to stdout'))
|
||||
c.add_opt('keep_intermediate', ['--keep-intermediate-files'], group='debug',
|
||||
c.add_opt('keep_intermediate', ['--keep-intermediate-files'], group='debug',
|
||||
default=False,
|
||||
help=_('Keep intermediate files during processing by html2epub'))
|
||||
c.add_opt('extract_to', ['--extract-to'], group='debug', default=None,
|
||||
|
@ -14,7 +14,7 @@ from lxml.cssselect import CSSSelector
|
||||
from lxml import etree
|
||||
from lxml.html import HtmlElement
|
||||
|
||||
from calibre.ebooks.html import fromstring
|
||||
from calibre.ebooks.html_old import fromstring
|
||||
from calibre.ebooks.epub import rules
|
||||
from cssutils import CSSParser
|
||||
|
||||
@ -24,7 +24,7 @@ absolute_size = r'(?P<abs>(x?x-)?(small|large)|medium)'
|
||||
relative_size = r'(?P<rel>smaller|larger)'
|
||||
|
||||
font_size_pat = re.compile('|'.join((relative_size, absolute_size, length)), re.I)
|
||||
line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num))
|
||||
line_height_pat = re.compile(r'({num})(px|in|cm|mm|pt|pc)'.replace('{num}', num))
|
||||
|
||||
PTU = {
|
||||
'in' : 72.,
|
||||
@ -37,12 +37,12 @@ PTU = {
|
||||
DEFAULT_FONT_SIZE = 12
|
||||
|
||||
class Rationalizer(object):
|
||||
|
||||
|
||||
@classmethod
|
||||
def specificity(cls, s):
|
||||
'''Map CSS specificity tuple to a single integer'''
|
||||
return sum([10**(4-i) + x for i,x in enumerate(s)])
|
||||
|
||||
return sum([10**(4-i) + x for i,x in enumerate(s)])
|
||||
|
||||
@classmethod
|
||||
def compute_font_size(cls, elem):
|
||||
'''
|
||||
@ -59,7 +59,7 @@ class Rationalizer(object):
|
||||
elem.computed_font_size = sfs(parent.computed_font_size)
|
||||
else:
|
||||
elem.computed_font_size = sfs
|
||||
|
||||
|
||||
@classmethod
|
||||
def calculate_font_size(cls, style):
|
||||
'Return font size in pts from style object. For relative units returns a callable'
|
||||
@ -69,7 +69,7 @@ class Rationalizer(object):
|
||||
fs = match.group()
|
||||
if style.fontSize:
|
||||
fs = style.fontSize
|
||||
|
||||
|
||||
match = font_size_pat.search(fs)
|
||||
if match is None:
|
||||
return None
|
||||
@ -89,8 +89,8 @@ class Rationalizer(object):
|
||||
return 12 * x
|
||||
if match.get('zero', False):
|
||||
return 0.
|
||||
return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8)
|
||||
|
||||
return functools.partial(operator.mul, 1.2) if 'larger' in fs.lower() else functools.partial(operator.mul, 0.8)
|
||||
|
||||
@classmethod
|
||||
def resolve_rules(cls, stylesheets):
|
||||
for sheet in stylesheets:
|
||||
@ -104,12 +104,12 @@ class Rationalizer(object):
|
||||
if font_size is not None:
|
||||
for s in r.selectorList:
|
||||
sheet.fs_rules.append([CSSSelector(s.selectorText), font_size])
|
||||
orig = line_height_pat.search(r.style.lineHeight)
|
||||
orig = line_height_pat.search(r.style.lineHeight)
|
||||
if orig is not None:
|
||||
for s in r.selectorList:
|
||||
sheet.lh_rules.append([CSSSelector(s.selectorText), float(orig.group(1)) * PTU[orig.group(2).lower()]])
|
||||
|
||||
|
||||
|
||||
|
||||
@classmethod
|
||||
def apply_font_size_rules(cls, stylesheets, root):
|
||||
'Add a ``specified_font_size`` attribute to every element that has a specified font size'
|
||||
@ -119,7 +119,7 @@ class Rationalizer(object):
|
||||
elems = selector(root)
|
||||
for elem in elems:
|
||||
elem.specified_font_size = font_size
|
||||
|
||||
|
||||
@classmethod
|
||||
def remove_font_size_information(cls, stylesheets):
|
||||
for r in rules(stylesheets):
|
||||
@ -134,17 +134,17 @@ class Rationalizer(object):
|
||||
r.style.removeProperty('font')
|
||||
if line_height_pat.search(r.style.lineHeight) is not None:
|
||||
r.style.removeProperty('line-height')
|
||||
|
||||
|
||||
@classmethod
|
||||
def compute_font_sizes(cls, root, stylesheets, base=12):
|
||||
stylesheets = [s for s in stylesheets if hasattr(s, 'cssText')]
|
||||
cls.apply_font_size_rules(stylesheets, root)
|
||||
|
||||
|
||||
# Compute the effective font size of all tags
|
||||
root.computed_font_size = DEFAULT_FONT_SIZE
|
||||
for elem in root.iter(etree.Element):
|
||||
cls.compute_font_size(elem)
|
||||
|
||||
|
||||
extra_css = {}
|
||||
if base > 0:
|
||||
# Calculate the "base" (i.e. most common) font size
|
||||
@ -157,20 +157,20 @@ class Rationalizer(object):
|
||||
if t: t = t.strip()
|
||||
if t:
|
||||
font_sizes[elem.computed_font_size] += len(t)
|
||||
|
||||
|
||||
t = getattr(elem, 'tail', '')
|
||||
if t: t = t.strip()
|
||||
if t:
|
||||
parent = elem.getparent()
|
||||
if parent.tag not in IGNORE:
|
||||
font_sizes[parent.computed_font_size] += len(t)
|
||||
|
||||
|
||||
try:
|
||||
most_common = max(font_sizes.items(), key=operator.itemgetter(1))[0]
|
||||
scale = base/most_common if most_common > 0 else 1.
|
||||
except ValueError:
|
||||
scale = 1.
|
||||
|
||||
|
||||
# rescale absolute line-heights
|
||||
counter = 0
|
||||
for sheet in stylesheets:
|
||||
@ -181,17 +181,17 @@ class Rationalizer(object):
|
||||
if not extra_css.has_key(elem.get('id')):
|
||||
extra_css[elem.get('id')] = []
|
||||
extra_css[elem.get('id')].append('line-height:%fpt'%(lh*scale))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Rescale all computed font sizes
|
||||
for elem in body.iter(etree.Element):
|
||||
if isinstance(elem, HtmlElement):
|
||||
elem.computed_font_size *= scale
|
||||
|
||||
# Remove all font size specifications from the last stylesheet
|
||||
|
||||
# Remove all font size specifications from the last stylesheet
|
||||
cls.remove_font_size_information(stylesheets[-1:])
|
||||
|
||||
|
||||
# Create the CSS to implement the rescaled font sizes
|
||||
for elem in body.iter(etree.Element):
|
||||
cfs, pcfs = map(operator.attrgetter('computed_font_size'), (elem, elem.getparent()))
|
||||
@ -201,12 +201,12 @@ class Rationalizer(object):
|
||||
if not extra_css.has_key(elem.get('id')):
|
||||
extra_css[elem.get('id')] = []
|
||||
extra_css[elem.get('id')].append('font-size: %f%%'%(100*(cfs/pcfs)))
|
||||
|
||||
|
||||
css = CSSParser(loglevel=logging.ERROR).parseString('')
|
||||
for id, r in extra_css.items():
|
||||
css.add('#%s {%s}'%(id, ';'.join(r)))
|
||||
return css
|
||||
|
||||
|
||||
@classmethod
|
||||
def rationalize(cls, stylesheets, root, opts):
|
||||
logger = logging.getLogger('html2epub')
|
||||
@ -229,7 +229,7 @@ class Rationalizer(object):
|
||||
################################################################################
|
||||
|
||||
class FontTest(unittest.TestCase):
|
||||
|
||||
|
||||
def setUp(self):
|
||||
from calibre.ebooks.epub import config
|
||||
self.opts = config(defaults='').parse()
|
||||
@ -246,10 +246,10 @@ class FontTest(unittest.TestCase):
|
||||
<p id="p2">Some other <span class="it">text</span>.</p>
|
||||
<p id="longest">The longest piece of single font size text in this entire file. Used to test resizing.</p>
|
||||
</body>
|
||||
</html>
|
||||
</html>
|
||||
'''
|
||||
self.root = fromstring(self.html)
|
||||
|
||||
|
||||
def do_test(self, css, base=DEFAULT_FONT_SIZE, scale=1):
|
||||
root1 = copy.deepcopy(self.root)
|
||||
root1.computed_font_size = DEFAULT_FONT_SIZE
|
||||
@ -262,39 +262,39 @@ class FontTest(unittest.TestCase):
|
||||
for elem in root2.iter(etree.Element):
|
||||
Rationalizer.compute_font_size(elem)
|
||||
for e1, e2 in zip(root1.xpath('//body')[0].iter(etree.Element), root2.xpath('//body')[0].iter(etree.Element)):
|
||||
self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size,
|
||||
self.assertAlmostEqual(e1.computed_font_size, e2.computed_font_size,
|
||||
msg='Computed font sizes for %s not equal. Original: %f Processed: %f'%\
|
||||
(root1.getroottree().getpath(e1), e1.computed_font_size, e2.computed_font_size))
|
||||
return stylesheet2.cssText
|
||||
|
||||
|
||||
def testStripping(self):
|
||||
'Test that any original entries are removed from the CSS'
|
||||
css = 'p { font: bold 10px italic smaller; font-size: x-large} \na { font-size: 0 }'
|
||||
css = CSSParser(loglevel=logging.ERROR).parseString(css)
|
||||
Rationalizer.compute_font_sizes(copy.deepcopy(self.root), [css])
|
||||
self.assertEqual(css.cssText.replace(' ', '').replace('\n', ''),
|
||||
self.assertEqual(css.cssText.replace(' ', '').replace('\n', ''),
|
||||
'p{font:bolditalic}')
|
||||
|
||||
|
||||
def testIdentity(self):
|
||||
'Test that no unnecessary font size changes are made'
|
||||
extra_css = self.do_test('div {font-size:12pt} \nspan {font-size:100%}')
|
||||
self.assertEqual(extra_css.strip(), '')
|
||||
|
||||
|
||||
def testRelativization(self):
|
||||
'Test conversion of absolute to relative sizes'
|
||||
self.do_test('#p1 {font: 24pt} b {font: 12pt} .it {font: 48pt} #p2 {font: 100%}')
|
||||
|
||||
|
||||
def testResizing(self):
|
||||
'Test resizing of fonts'
|
||||
self.do_test('#longest {font: 24pt} .it {font:20pt; line-height:22pt}')
|
||||
|
||||
|
||||
|
||||
def suite():
|
||||
return unittest.TestLoader().loadTestsFromTestCase(FontTest)
|
||||
|
||||
|
||||
def test():
|
||||
unittest.TextTestRunner(verbosity=2).run(suite())
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(test())
|
||||
|
||||
sys.exit(test())
|
||||
|
||||
|
@ -38,7 +38,7 @@ from lxml.etree import XPath
|
||||
from lxml import html, etree
|
||||
from PyQt4.Qt import QApplication, QPixmap
|
||||
|
||||
from calibre.ebooks.html import Processor, merge_metadata, get_filelist,\
|
||||
from calibre.ebooks.html_old import Processor, merge_metadata, get_filelist,\
|
||||
opf_traverse, create_metadata, rebase_toc, Link, parser
|
||||
from calibre.ebooks.epub import config as common_config, tostring
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
|
@ -16,7 +16,7 @@ from calibre.ebooks.epub import config
|
||||
from calibre.ebooks.metadata.opf2 import OPF
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ebooks.html import create_dir
|
||||
from calibre.ebooks.html_old import create_dir
|
||||
from calibre.utils.zipfile import safe_replace, ZipFile
|
||||
from calibre.utils.config import DynamicConfig
|
||||
|
||||
|
@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
|
||||
Split the flows in an epub file to conform to size limitations.
|
||||
'''
|
||||
|
||||
import os, math, logging, functools, collections, re, copy, sys
|
||||
import os, math, functools, collections, re, copy, sys
|
||||
|
||||
from lxml.etree import XPath as _XPath
|
||||
from lxml import etree, html
|
||||
@ -24,16 +24,16 @@ SPLIT_ATTR = 'cs'
|
||||
SPLIT_POINT_ATTR = 'csp'
|
||||
|
||||
class SplitError(ValueError):
|
||||
|
||||
|
||||
def __init__(self, path, root):
|
||||
size = len(tostring(root))/1024.
|
||||
ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')%
|
||||
ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')%
|
||||
(os.path.basename(path), size))
|
||||
|
||||
|
||||
|
||||
|
||||
class Splitter(object):
|
||||
|
||||
|
||||
def __init__(self, path, opts, stylesheet_map, opf):
|
||||
self.setup_cli_handler(opts.verbose)
|
||||
self.path = path
|
||||
@ -44,10 +44,10 @@ class Splitter(object):
|
||||
self.orig_size = os.stat(content(path)).st_size
|
||||
self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.)
|
||||
root = html.fromstring(open(content(path)).read())
|
||||
|
||||
|
||||
self.page_breaks, self.trees = [], []
|
||||
self.split_size = 0
|
||||
|
||||
|
||||
# Split on page breaks
|
||||
self.splitting_on_page_breaks = True
|
||||
if not opts.dont_split_on_page_breaks:
|
||||
@ -59,7 +59,7 @@ class Splitter(object):
|
||||
else:
|
||||
self.trees = [root.getroottree()]
|
||||
trees = list(self.trees)
|
||||
|
||||
|
||||
# Split any remaining over-sized trees
|
||||
self.splitting_on_page_breaks = False
|
||||
if self.opts.profile.flow_size < sys.maxint:
|
||||
@ -67,7 +67,7 @@ class Splitter(object):
|
||||
self.log_info('\tLooking for large trees...')
|
||||
for i, tree in enumerate(list(trees)):
|
||||
self.trees = []
|
||||
size = len(tostring(tree.getroot()))
|
||||
size = len(tostring(tree.getroot()))
|
||||
if size > self.opts.profile.flow_size:
|
||||
lt_found = True
|
||||
try:
|
||||
@ -81,7 +81,7 @@ class Splitter(object):
|
||||
trees[i:i+1] = list(self.trees)
|
||||
if not lt_found:
|
||||
self.log_info('\tNo large trees found')
|
||||
|
||||
|
||||
self.trees = trees
|
||||
self.was_split = len(self.trees) > 1
|
||||
if self.was_split:
|
||||
@ -91,17 +91,17 @@ class Splitter(object):
|
||||
for f in self.files:
|
||||
self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.)
|
||||
self.fix_opf(opf)
|
||||
|
||||
|
||||
self.trees = None
|
||||
|
||||
|
||||
|
||||
|
||||
def split_text(self, text, root, size):
|
||||
self.log_debug('\t\t\tSplitting text of length: %d'%len(text))
|
||||
rest = text.replace('\r', '')
|
||||
parts = re.split('\n\n', rest)
|
||||
self.log_debug('\t\t\t\tFound %d parts'%len(parts))
|
||||
if max(map(len, parts)) > size:
|
||||
raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root)
|
||||
raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root)
|
||||
ans = []
|
||||
buf = ''
|
||||
for part in parts:
|
||||
@ -111,8 +111,8 @@ class Splitter(object):
|
||||
ans.append(buf)
|
||||
buf = part
|
||||
return ans
|
||||
|
||||
|
||||
|
||||
|
||||
def split_to_size(self, tree):
|
||||
self.log_debug('\t\tSplitting...')
|
||||
root = tree.getroot()
|
||||
@ -134,7 +134,7 @@ class Splitter(object):
|
||||
p = pre.getparent()
|
||||
i = p.index(pre)
|
||||
p[i:i+1] = new_pres
|
||||
|
||||
|
||||
split_point, before = self.find_split_point(root)
|
||||
if split_point is None or self.split_size > 6*self.orig_size:
|
||||
if not self.always_remove:
|
||||
@ -142,7 +142,7 @@ class Splitter(object):
|
||||
'structure preservation. This may cause '
|
||||
'incorrect rendering.'))
|
||||
raise SplitError(self.path, root)
|
||||
|
||||
|
||||
for t in self.do_split(tree, split_point, before):
|
||||
r = t.getroot()
|
||||
if self.is_page_empty(r):
|
||||
@ -151,12 +151,12 @@ class Splitter(object):
|
||||
if size <= self.opts.profile.flow_size:
|
||||
self.trees.append(t)
|
||||
#print tostring(t.getroot(), pretty_print=True)
|
||||
self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)',
|
||||
self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)',
|
||||
len(self.trees), size/1024.)
|
||||
self.split_size += size
|
||||
else:
|
||||
self.split_to_size(t)
|
||||
|
||||
|
||||
def is_page_empty(self, root):
|
||||
body = root.find('body')
|
||||
if body is None:
|
||||
@ -170,14 +170,14 @@ class Splitter(object):
|
||||
if img.get('style', '') != 'display:none':
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def do_split(self, tree, split_point, before):
|
||||
'''
|
||||
Split ``tree`` into a *before* and *after* tree at ``split_point``,
|
||||
preserving tag structure, but not duplicating any text.
|
||||
Split ``tree`` into a *before* and *after* tree at ``split_point``,
|
||||
preserving tag structure, but not duplicating any text.
|
||||
All tags that have had their text and tail
|
||||
removed have the attribute ``calibre_split`` set to 1.
|
||||
|
||||
|
||||
:param before: If True tree is split before split_point, otherwise after split_point
|
||||
:return: before_tree, after_tree
|
||||
'''
|
||||
@ -188,7 +188,7 @@ class Splitter(object):
|
||||
body, body2 = root.body, root2.body
|
||||
split_point = root.xpath(path)[0]
|
||||
split_point2 = root2.xpath(path)[0]
|
||||
|
||||
|
||||
def nix_element(elem, top=True):
|
||||
if self.always_remove:
|
||||
parent = elem.getparent()
|
||||
@ -198,18 +198,18 @@ class Splitter(object):
|
||||
else:
|
||||
index = parent.index(elem)
|
||||
parent[index:index+1] = list(elem.iterchildren())
|
||||
|
||||
|
||||
else:
|
||||
elem.text = u''
|
||||
elem.tail = u''
|
||||
elem.set(SPLIT_ATTR, '1')
|
||||
if elem.tag.lower() in ['ul', 'ol', 'dl', 'table', 'hr', 'img']:
|
||||
elem.set('style', 'display:none')
|
||||
|
||||
|
||||
def fix_split_point(sp):
|
||||
if not self.splitting_on_page_breaks:
|
||||
sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid')
|
||||
|
||||
sp.set('style', sp.get('style', '')+'page-break-before:avoid;page-break-after:avoid')
|
||||
|
||||
# Tree 1
|
||||
hit_split_point = False
|
||||
for elem in list(body.iterdescendants(etree.Element)):
|
||||
@ -223,8 +223,8 @@ class Splitter(object):
|
||||
continue
|
||||
if hit_split_point:
|
||||
nix_element(elem)
|
||||
|
||||
|
||||
|
||||
|
||||
# Tree 2
|
||||
hit_split_point = False
|
||||
for elem in list(body2.iterdescendants(etree.Element)):
|
||||
@ -238,17 +238,17 @@ class Splitter(object):
|
||||
continue
|
||||
if not hit_split_point:
|
||||
nix_element(elem, top=False)
|
||||
|
||||
|
||||
return tree, tree2
|
||||
|
||||
|
||||
|
||||
|
||||
def split_on_page_breaks(self, orig_tree):
|
||||
ordered_ids = []
|
||||
for elem in orig_tree.xpath('//*[@id]'):
|
||||
id = elem.get('id')
|
||||
if id in self.page_break_ids:
|
||||
ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])
|
||||
|
||||
|
||||
self.trees = []
|
||||
tree = orig_tree
|
||||
for pattern, before in ordered_ids:
|
||||
@ -260,13 +260,13 @@ class Splitter(object):
|
||||
tree = after
|
||||
self.trees.append(tree)
|
||||
self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def find_page_breaks(self, stylesheets, root):
|
||||
'''
|
||||
Find all elements that have either page-break-before or page-break-after set.
|
||||
Populates `self.page_breaks` with id based XPath selectors (for elements that don't
|
||||
Populates `self.page_breaks` with id based XPath selectors (for elements that don't
|
||||
have ids, an id is created).
|
||||
'''
|
||||
page_break_selectors = set([])
|
||||
@ -283,16 +283,16 @@ class Splitter(object):
|
||||
page_break_selectors.add((CSSSelector(rule.selectorText), False))
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
page_breaks = set([])
|
||||
for selector, before in page_break_selectors:
|
||||
for elem in selector(root):
|
||||
elem.pb_before = before
|
||||
page_breaks.add(elem)
|
||||
|
||||
|
||||
for i, elem in enumerate(root.iter()):
|
||||
elem.pb_order = i
|
||||
|
||||
|
||||
page_breaks = list(page_breaks)
|
||||
page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
|
||||
self.page_break_ids = []
|
||||
@ -300,12 +300,12 @@ class Splitter(object):
|
||||
x.set('id', x.get('id', 'calibre_pb_%d'%i))
|
||||
id = x.get('id')
|
||||
self.page_breaks.append((XPath('//*[@id="%s"]'%id), x.pb_before))
|
||||
self.page_break_ids.append(id)
|
||||
|
||||
|
||||
self.page_break_ids.append(id)
|
||||
|
||||
|
||||
def find_split_point(self, root):
|
||||
'''
|
||||
Find the tag at which to split the tree rooted at `root`.
|
||||
Find the tag at which to split the tree rooted at `root`.
|
||||
Search order is:
|
||||
* Heading tags
|
||||
* <div> tags
|
||||
@ -314,7 +314,7 @@ class Splitter(object):
|
||||
* <p> tags
|
||||
* <br> tags
|
||||
* <li> tags
|
||||
|
||||
|
||||
We try to split in the "middle" of the file (as defined by tag counts.
|
||||
'''
|
||||
def pick_elem(elems):
|
||||
@ -325,18 +325,18 @@ class Splitter(object):
|
||||
i = int(math.floor(len(elems)/2.))
|
||||
elems[i].set(SPLIT_POINT_ATTR, '1')
|
||||
return elems[i]
|
||||
|
||||
|
||||
for path in (
|
||||
'//*[re:match(name(), "h[1-6]", "i")]',
|
||||
'//*[re:match(name(), "h[1-6]", "i")]',
|
||||
'/html/body/div',
|
||||
'//pre',
|
||||
'//hr',
|
||||
'//hr',
|
||||
'//p',
|
||||
'//div',
|
||||
'//br',
|
||||
'//li',
|
||||
):
|
||||
elems = root.xpath(path,
|
||||
elems = root.xpath(path,
|
||||
namespaces={'re':'http://exslt.org/regular-expressions'})
|
||||
elem = pick_elem(elems)
|
||||
if elem is not None:
|
||||
@ -345,9 +345,9 @@ class Splitter(object):
|
||||
except:
|
||||
continue
|
||||
return elem, True
|
||||
|
||||
|
||||
return None, True
|
||||
|
||||
|
||||
def commit(self):
|
||||
'''
|
||||
Commit all changes caused by the split. This removes the previously
|
||||
@ -357,7 +357,7 @@ class Splitter(object):
|
||||
'''
|
||||
self.anchor_map = collections.defaultdict(lambda :self.base%0)
|
||||
self.files = []
|
||||
|
||||
|
||||
for i, tree in enumerate(self.trees):
|
||||
root = tree.getroot()
|
||||
self.files.append(self.base%i)
|
||||
@ -367,7 +367,7 @@ class Splitter(object):
|
||||
for elem in root.xpath('//*[@%s or @%s]'%(SPLIT_ATTR, SPLIT_POINT_ATTR)):
|
||||
elem.attrib.pop(SPLIT_ATTR, None)
|
||||
elem.attrib.pop(SPLIT_POINT_ATTR, '0')
|
||||
|
||||
|
||||
for current, tree in zip(self.files, self.trees):
|
||||
for a in tree.getroot().xpath('//a[@href]'):
|
||||
href = a.get('href').strip()
|
||||
@ -375,10 +375,10 @@ class Splitter(object):
|
||||
anchor = href[1:]
|
||||
file = self.anchor_map[anchor]
|
||||
if file != current:
|
||||
a.set('href', file+href)
|
||||
a.set('href', file+href)
|
||||
open(content(current), 'wb').\
|
||||
write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print))
|
||||
|
||||
|
||||
os.remove(content(self.path))
|
||||
|
||||
|
||||
@ -391,12 +391,12 @@ class Splitter(object):
|
||||
id_map = {}
|
||||
for item in items:
|
||||
id_map[item.get('id')] = opf.replace_manifest_item(item, new_items)
|
||||
|
||||
|
||||
for id in id_map.keys():
|
||||
opf.replace_spine_items_by_idref(id, id_map[id])
|
||||
|
||||
|
||||
for ref in opf.iterguide():
|
||||
href = ref.get('href', '')
|
||||
href = ref.get('href', '')
|
||||
if href.startswith('content/'+self.path):
|
||||
href = href.split('#')
|
||||
frag = None
|
||||
@ -408,8 +408,8 @@ class Splitter(object):
|
||||
new_file = self.anchor_map[frag]
|
||||
ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag)))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def fix_content_links(html_files, changes, opts):
|
||||
split_files = [f.path for f in changes]
|
||||
anchor_maps = [f.anchor_map for f in changes]
|
||||
@ -420,7 +420,7 @@ def fix_content_links(html_files, changes, opts):
|
||||
files[i:i+1] = changes[j].files
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
|
||||
for htmlfile in files:
|
||||
changed = False
|
||||
root = html.fromstring(open(content(htmlfile), 'rb').read())
|
||||
@ -439,7 +439,7 @@ def fix_content_links(html_files, changes, opts):
|
||||
frag = ('#'+anchor) if anchor else ''
|
||||
a.set('href', newf+frag)
|
||||
changed = True
|
||||
|
||||
|
||||
if changed:
|
||||
open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print))
|
||||
|
||||
@ -448,7 +448,7 @@ def fix_ncx(path, changes):
|
||||
anchor_maps = [f.anchor_map for f in changes]
|
||||
tree = etree.parse(path)
|
||||
changed = False
|
||||
for content in tree.getroot().xpath('//x:content[@src]',
|
||||
for content in tree.getroot().xpath('//x:content[@src]',
|
||||
namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}):
|
||||
href = content.get('src')
|
||||
if not href.startswith('#'):
|
||||
@ -481,21 +481,21 @@ def find_html_files(opf):
|
||||
if os.path.exists(content(f)):
|
||||
html_files.append(f)
|
||||
return html_files
|
||||
|
||||
|
||||
|
||||
def split(pathtoopf, opts, stylesheet_map):
|
||||
pathtoopf = os.path.abspath(pathtoopf)
|
||||
opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
|
||||
|
||||
|
||||
with CurrentDir(os.path.dirname(pathtoopf)):
|
||||
html_files = find_html_files(opf)
|
||||
changes = [Splitter(f, opts, stylesheet_map, opf) for f in html_files]
|
||||
changes = [c for c in changes if c.was_split]
|
||||
|
||||
|
||||
fix_content_links(html_files, changes, opts)
|
||||
for item in opf.itermanifest():
|
||||
if item.get('media-type', '') == 'application/x-dtbncx+xml':
|
||||
fix_ncx(item.get('href'), changes)
|
||||
break
|
||||
break
|
||||
|
||||
open(pathtoopf, 'wb').write(opf.render())
|
||||
|
30
src/calibre/ebooks/html/__init__.py
Normal file
30
src/calibre/ebooks/html/__init__.py
Normal file
@ -0,0 +1,30 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import with_statement
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
|
||||
from lxml.etree import tostring as _tostring
|
||||
|
||||
def tostring(root, strip_comments=False, pretty_print=False):
    '''
    Serialize processed XHTML.

    Sets the XHTML/xlink namespace declarations on the root (and the SVG
    namespace on any ``svg`` elements), serializes with lxml and prepends
    an XML declaration. Optionally removes all XML comments.
    '''
    # Pin down the document namespaces before serializing.
    root.set('xmlns', 'http://www.w3.org/1999/xhtml')
    root.set('{http://www.w3.org/1999/xhtml}xlink', 'http://www.w3.org/1999/xlink')
    for elem in root.iter():
        # Tags may be Clark-notation ('{uri}local'); compare only the local name.
        localname = elem.tag.rpartition('}')[-1]
        if localname.lower() == 'svg':
            elem.set('xmlns', 'http://www.w3.org/2000/svg')

    serialized = _tostring(root, encoding='utf-8', pretty_print=pretty_print)
    if strip_comments:
        serialized = re.compile(r'<!--.*?-->', re.DOTALL).sub('', serialized)

    return '<?xml version="1.0" encoding="utf-8" ?>\n' + serialized
|
||||
|
||||
|
342
src/calibre/ebooks/html/input.py
Normal file
342
src/calibre/ebooks/html/input.py
Normal file
@ -0,0 +1,342 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import with_statement
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
'''
|
||||
Input plugin for HTML or OPF ebooks.
|
||||
'''
|
||||
|
||||
import os, re, sys, cStringIO
|
||||
from urlparse import urlparse, urlunparse
|
||||
from urllib import unquote
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.customize.conversion import OptionRecommendation
|
||||
from calibre import unicode_path
|
||||
|
||||
class Link(object):
    '''
    Represents a link in a HTML file.

    Resolves local (filesystem) link targets to absolute paths and keeps
    the fragment (anchor) separately.
    '''

    @classmethod
    def url_to_local_path(cls, url, base):
        # *url* is a ParseResult; rebuild the path portion (path + params +
        # query) without scheme/netloc/fragment, then unquote %-escapes.
        path = urlunparse(('', '', url.path, url.params, url.query, ''))
        path = unquote(path)
        if os.path.isabs(path):
            return path
        # Relative URLs are resolved against the directory of the referrer.
        return os.path.abspath(os.path.join(base, path))

    def __init__(self, url, base):
        '''
        :param url: The url this link points to. Must be an unquoted unicode string.
        :param base: The base directory that relative URLs are with respect to.
                     Must be a unicode string.
        '''
        assert isinstance(url, unicode) and isinstance(base, unicode)
        self.url = url
        self.parsed_url = urlparse(self.url)
        # Only 'file' or scheme-less URLs can refer to the local filesystem.
        self.is_local = self.parsed_url.scheme in ('', 'file')
        # A local URL with no path (e.g. '#anchor') points inside the same file.
        self.is_internal = self.is_local and not bool(self.parsed_url.path)
        self.path = None  # absolute local path, or None for remote/internal links
        self.fragment = unquote(self.parsed_url.fragment)
        if self.is_local and not self.is_internal:
            self.path = self.url_to_local_path(self.parsed_url, base)

    def __hash__(self):
        # Remote/internal links hash by URL; local ones by resolved path, so
        # different URLs pointing at the same file collapse together.
        if self.path is None:
            return hash(self.url)
        return hash(self.path)

    def __eq__(self, other):
        # Compares equal to other Links with the same path, and also to a
        # plain path string (getattr falls back to *other* itself).
        return self.path == getattr(other, 'path', other)

    def __str__(self):
        return u'Link: %s --> %s'%(self.url, self.path)
|
||||
|
||||
|
||||
class IgnoreFile(Exception):
    '''
    Raised while following links to signal that a file should be skipped.
    '''

    def __init__(self, msg, errno):
        Exception.__init__(self, msg)
        self.errno = errno
        # errno 2 == ENOENT: the target file is missing rather than unreadable.
        self.doesnt_exist = (errno == 2)
|
||||
|
||||
class HTMLFile(object):
    '''
    Contains basic information about an HTML file. This
    includes a list of links to other files as well as
    the encoding of each file. Also tries to detect if the file is not a HTML
    file in which case :member:`is_binary` is set to True.

    The encoding of the file is available as :member:`encoding`.
    '''

    # Cheap heuristics applied to raw bytes -- no real HTML parser here.
    HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
    TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
    # Matches <a href=...> with double-quoted, single-quoted or bare values.
    LINK_PAT = re.compile(
        r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
        re.DOTALL|re.IGNORECASE)

    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
        '''
        :param level: The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML.
        :param referrer: The :class:`HTMLFile` that first refers to this file.
        '''
        self.path = unicode_path(path_to_html_file, abs=True)
        # Default title is the file name; replaced below if a <title> is found.
        self.title = os.path.splitext(os.path.basename(self.path))[0]
        self.base = os.path.dirname(self.path)
        self.level = level
        self.referrer = referrer
        self.links = []  # list of Link objects found in this file

        try:
            with open(self.path, 'rb') as f:
                src = f.read()
        except IOError, err:
            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
            # An unreadable root file is fatal; unreadable linked files are
            # merely skipped via IgnoreFile.
            if level == 0:
                raise IOError(msg)
            raise IgnoreFile(msg, err.errno)

        # Only sniff the first 1KB for an <html> tag to classify the file.
        self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
        if not self.is_binary:
            if encoding is None:
                # Auto-detect encoding from a 4KB prefix.
                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
                self.encoding = encoding
            else:
                self.encoding = encoding
            # NOTE(review): binary files never get a self.encoding attribute;
            # callers appear to filter out is_binary files first -- confirm.

            src = src.decode(encoding, 'replace')
            match = self.TITLE_PAT.search(src)
            self.title = match.group(1) if match is not None else self.title
            self.find_links(src)

    def __eq__(self, other):
        # Equal to other HTMLFiles with the same path, and to bare path
        # strings (getattr falls back to *other* itself).
        return self.path == getattr(other, 'path', other)

    def __str__(self):
        return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)

    def __repr__(self):
        return str(self)

    def find_links(self, src):
        # Scan decoded source for <a href=...> targets; keep each unique
        # link once, in first-seen order.
        for match in self.LINK_PAT.finditer(src):
            url = None
            # Exactly one of the three alternation groups matched.
            for i in ('url1', 'url2', 'url3'):
                url = match.group(i)
                if url:
                    break
            link = self.resolve(url)
            if link not in self.links:
                self.links.append(link)

    def resolve(self, url):
        # Resolve *url* relative to this file's directory.
        return Link(url, self.base)
return Link(url, self.base)
|
||||
|
||||
|
||||
def depth_first(root, flat, visited=None):
    '''
    Yield HTML files reachable from *root* in depth first order.

    :param root:    The :class:`HTMLFile` to start from. It is yielded first.
    :param flat:    Flat list of all known :class:`HTMLFile` objects. Links are
                    resolved against it, so files absent from it (e.g. pruned
                    by ``max_levels``) are silently skipped.
    :param visited: Set of already-yielded files, used by the recursive calls.
                    Leave as ``None`` at the top level.
    '''
    # Fix: the previous version used a mutable default argument
    # (visited=set([])), so the visited set leaked between independent
    # top-level calls and a second traversal yielded only the root.
    if visited is None:
        visited = set()
    yield root
    visited.add(root)
    for link in root.links:
        if link.path is not None and link not in visited:
            try:
                index = flat.index(link)
            except ValueError:  # Can happen if max_levels is used
                continue
            hf = flat[index]
            if hf not in visited:
                yield hf
                visited.add(hf)
                # Recurse, sharing the same visited set to avoid cycles.
                for hf in depth_first(hf, flat, visited):
                    if hf not in visited:
                        yield hf
                        visited.add(hf)
|
||||
|
||||
|
||||
def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
    '''
    Recursively traverse all links in the HTML file.

    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
                       implies that no links in the root HTML file are followed.
    :param encoding:   Specify character encoding of HTML files. If `None` it is
                       auto-detected.
    :return: A pair of lists (breadth_first, depth_first). Each list contains
             :class:`HTMLFile` objects.
    '''
    assert max_levels >= 0
    level = 0
    # ``flat`` accumulates every discovered file in breadth-first order.
    flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
    next_level = list(flat)
    while level < max_levels and len(next_level) > 0:
        level += 1
        nl = []  # files discovered at this level
        for hf in next_level:
            rejects = []
            for link in hf.links:
                # Skip non-local links and files already seen (HTMLFile
                # compares equal to a plain path string).
                if link.path is None or link.path in flat:
                    continue
                try:
                    nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
                    if nf.is_binary:
                        raise IgnoreFile('%s is a binary file'%nf.path, -1)
                    nl.append(nf)
                    flat.append(nf)
                except IgnoreFile, err:
                    # Drop links to unreadable/binary targets from the
                    # referring file as well.
                    rejects.append(link)
                    if not err.doesnt_exist or verbose > 1:
                        print repr(err)
            for link in rejects:
                hf.links.remove(link)

        next_level = list(nl)
    # depth_first() recurses once per file; raise the recursion limit to
    # survive pathologically deep link chains, then restore it.
    orec = sys.getrecursionlimit()
    sys.setrecursionlimit(500000)
    try:
        return flat, list(depth_first(flat[0], flat))
    finally:
        sys.setrecursionlimit(orec)
|
||||
|
||||
|
||||
def opf_traverse(opf_reader, verbose=0, encoding=None):
    '''
    Return a list of :class:`HTMLFile` objects in the order specified by the
    `<spine>` element of the OPF.

    :param opf_reader: An :class:`calibre.ebooks.metadata.opf2.OPF` instance.
    :param encoding: Specify character encoding of HTML files. If `None` it is
                     auto-detected.
    '''
    if not opf_reader.spine:
        raise ValueError('OPF does not have a spine')
    flat = []
    # Spine items first, preserving spine order.
    for path in opf_reader.spine.items():
        path = os.path.abspath(path)
        if path not in flat:
            flat.append(os.path.abspath(path))
    # Then any HTML manifest items not already in the spine.
    for item in opf_reader.manifest:
        if 'html' in item.mime_type:
            path = os.path.abspath(item.path)
            if path not in flat:
                flat.append(path)
    for i, path in enumerate(flat):
        if not os.path.exists(path):
            # Workaround for OPFs where '&' in file names was stored
            # unescaped: retry with '&' -> '%26' and patch the manifest
            # hrefs to match.
            path = path.replace('&', '%26')
            if os.path.exists(path):
                flat[i] = path
                for item in opf_reader.itermanifest():
                    item.set('href', item.get('href').replace('&', '%26'))
    ans = []
    for path in flat:
        if os.path.exists(path):
            ans.append(HTMLFile(path, 0, encoding, verbose))
        else:
            print 'WARNING: OPF spine item %s does not exist'%path
    # Non-HTML (binary) files cannot be part of the reading order.
    ans = [f for f in ans if not f.is_binary]
    return ans
|
||||
|
||||
def search_for_opf(dir):
    '''
    Return an :class:`OPF` built from the first file in *dir* whose name
    ends in ``.opf`` (case-insensitive), or None if no OPF file is found.
    '''
    for entry in os.listdir(dir):
        if not entry.lower().endswith('.opf'):
            continue
        return OPF(open(os.path.join(dir, entry), 'rb'), dir)
|
||||
|
||||
def get_filelist(htmlfile, dir, opts, log):
    '''
    Build list of files referenced by html file or try to detect and use an
    OPF file instead.

    :return: A pair (opf, filelist) where opf may be None and filelist is a
             list of :class:`HTMLFile` objects.
    '''
    print 'Building file list...'
    opf = search_for_opf(dir)
    filelist = None
    if opf is not None:
        try:
            filelist = opf_traverse(opf, verbose=opts.verbose,
                    encoding=opts.input_encoding)
        except:
            # Deliberate best-effort: a broken OPF falls back to link
            # traversal below instead of aborting the conversion.
            pass
    if not filelist:
        # No (usable) OPF: follow links starting from the HTML file itself.
        # traverse() returns (breadth_first, depth_first) lists.
        filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
                verbose=opts.verbose,
                encoding=opts.input_encoding)\
                [0 if opts.breadth_first else 1]
    if opts.verbose:
        log.debug('\tFound files...')
        for f in filelist:
            log.debug('\t\t', f)
    return opf, filelist
|
||||
|
||||
|
||||
class HTMLInput(InputFormatPlugin):
    '''
    Input plugin for HTML or OPF ebooks. Builds a file list (from the OPF
    spine or by following links), writes a metadata.opf/toc.ncx pair into
    the current directory and returns an OEBBook created from it.
    '''

    name        = 'HTML Input'
    author      = 'Kovid Goyal'
    description = 'Convert HTML and OPF files to an OEB'
    file_types  = set(['opf', 'html', 'htm', 'xhtml', 'xhtm'])

    options = set([
        OptionRecommendation(name='breadth_first',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Traverse links in HTML files breadth first. Normally, '
                   'they are traversed depth first.'
                   )
        ),

        OptionRecommendation(name='max_levels',
            recommended_value=5, level=OptionRecommendation.LOW,
            help=_('Maximum levels of recursion when following links in '
                   'HTML files. Must be non-negative. 0 implies that no '
                   'links in the root HTML file are followed. Default is '
                   '%default.'
                   )
        ),

    ])

    def convert(self, stream, opts, file_ext, log,
            accelerators):
        '''
        Convert an HTML or OPF file to an OEBBook.

        All output (metadata.opf, toc.ncx) is written to the current
        working directory, per the InputFormatPlugin contract.
        '''
        # Resolve relative links against the input file's directory when known.
        basedir = os.getcwd()
        if hasattr(stream, 'name'):
            basedir = os.path.dirname(stream.name)
        if file_ext == 'opf':
            # Input is an OPF: the spine defines the file list directly.
            opf = OPF(stream, basedir)
            filelist = opf_traverse(opf, verbose=opts.verbose,
                    encoding=opts.input_encoding)
            mi = MetaInformation(opf)
        else:
            # Input is HTML: look for a sibling OPF or follow links.
            opf, filelist = get_filelist(stream.name, basedir, opts, log)
            mi = MetaInformation(opf)
            # Merge any metadata embedded in the HTML itself.
            mi.smart_update(get_metadata(stream, 'html'))

        # Build a fresh OPF (manifest + spine) in the current directory.
        mi = OPFCreator(os.getcwdu(), mi)
        mi.guide = None
        entries = [(f.path, 'application/xhtml+xml') for f in filelist]
        mi.create_manifest(entries)
        mi.create_spine([f.path for f in filelist])

        tocbuf = cStringIO.StringIO()
        mi.render(open('metadata.opf', 'wb'), tocbuf, 'toc.ncx')
        toc = tocbuf.getvalue()
        if toc:
            open('toc.ncx', 'wb').write(toc)

        # Local import to avoid a circular dependency with the plumber.
        from calibre.ebooks.conversion.plumber import create_oebbook
        return create_oebbook(log, os.path.abspath('metadata.opf'))
|
||||
|
||||
|
||||
|
||||
|
@ -683,26 +683,6 @@ class OPF(object):
|
||||
|
||||
return property(fget=fget, fset=fset)
|
||||
|
||||
@dynamic_property
|
||||
def title_sort(self):
|
||||
|
||||
def fget(self):
|
||||
matches = self.title_path(self.metadata)
|
||||
if matches:
|
||||
for match in matches:
|
||||
ans = match.get('{%s}file-as'%self.NAMESPACES['opf'], None)
|
||||
if not ans:
|
||||
ans = match.get('file-as', None)
|
||||
if ans:
|
||||
return ans
|
||||
|
||||
def fset(self, val):
|
||||
matches = self.title_path(self.metadata)
|
||||
if matches:
|
||||
matches[0].set('file-as', unicode(val))
|
||||
|
||||
return property(fget=fget, fset=fset)
|
||||
|
||||
@dynamic_property
|
||||
def tags(self):
|
||||
|
||||
@ -943,9 +923,10 @@ class OPFCreator(MetaInformation):
|
||||
from calibre.resources import opf_template
|
||||
from calibre.utils.genshi.template import MarkupTemplate
|
||||
template = MarkupTemplate(opf_template)
|
||||
toc = getattr(self, 'toc', None)
|
||||
if self.manifest:
|
||||
self.manifest.set_basedir(self.base_path)
|
||||
if ncx_manifest_entry is not None:
|
||||
if ncx_manifest_entry is not None and toc is not None:
|
||||
if not os.path.isabs(ncx_manifest_entry):
|
||||
ncx_manifest_entry = os.path.join(self.base_path, ncx_manifest_entry)
|
||||
remove = [i for i in self.manifest if i.id == 'ncx']
|
||||
@ -965,7 +946,6 @@ class OPFCreator(MetaInformation):
|
||||
opf = template.generate(__appname__=__appname__, mi=self, __version__=__version__).render('xml')
|
||||
opf_stream.write(opf)
|
||||
opf_stream.flush()
|
||||
toc = getattr(self, 'toc', None)
|
||||
if toc is not None and ncx_stream is not None:
|
||||
toc.render(ncx_stream, self.application_id)
|
||||
ncx_stream.flush()
|
||||
@ -1030,19 +1010,8 @@ class OPFTest(unittest.TestCase):
|
||||
self.opf.smart_update(MetaInformation(self.opf))
|
||||
self.testReading()
|
||||
|
||||
def testCreator(self):
|
||||
opf = OPFCreator(os.getcwd(), self.opf)
|
||||
buf = cStringIO.StringIO()
|
||||
opf.render(buf)
|
||||
raw = buf.getvalue()
|
||||
self.testReading(opf=OPF(cStringIO.StringIO(raw), os.getcwd()))
|
||||
|
||||
def testSmartUpdate(self):
|
||||
self.opf.smart_update(self.opf)
|
||||
self.testReading()
|
||||
|
||||
def suite():
|
||||
return unittest.TestLoader().loadTestsFromTestCase(OPFTest)
|
||||
|
||||
def test():
|
||||
unittest.TextTestRunner(verbosity=2).run(suite())
|
||||
unittest.TextTestRunner(verbosity=2).run(suite())
|
||||
|
@ -29,5 +29,5 @@ class MOBIInput(InputFormatPlugin):
|
||||
with open(f, 'wb') as q:
|
||||
q.write(html.tostring(root, encoding='utf-8', method='xml',
|
||||
include_meta_content_type=False))
|
||||
accelerators['pagebreaks'] = {f: '//div[@class="mbp_pagebreak"]'}
|
||||
accelerators['pagebreaks'] = {f: '//*[@class="mbp_pagebreak"]'}
|
||||
return mr.created_opf_path
|
||||
|
@ -522,7 +522,7 @@ class MobiReader(object):
|
||||
else:
|
||||
raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type))
|
||||
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
|
||||
self.mobi_html = self.mobi_html.replace('\r ', '\n\n ')
|
||||
self.mobi_html = self.mobi_html.replace('\r ', '\n\n').replace('\0', '')
|
||||
return processed_records
|
||||
|
||||
|
||||
|
@ -151,7 +151,7 @@ def resolve_base_href(root):
|
||||
return
|
||||
make_links_absolute(root, base_href, resolve_base_href=False)
|
||||
|
||||
def rewrite_links(root, link_repl_func, resolve_base_href=True):
|
||||
def rewrite_links(root, link_repl_func, resolve_base_href=False):
|
||||
'''
|
||||
Rewrite all the links in the document. For each link
|
||||
``link_repl_func(link)`` will be called, and the return value
|
||||
|
@ -6,9 +6,16 @@ __license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, shutil
|
||||
import os
|
||||
from urllib import unquote as urlunquote
|
||||
from functools import partial
|
||||
|
||||
from calibre.ebooks.oeb.base import OEB_DOCS
|
||||
from lxml import etree
|
||||
import cssutils
|
||||
|
||||
from calibre.constants import islinux
|
||||
from calibre.ebooks.oeb.base import OEB_DOCS, urlnormalize, urldefrag, \
|
||||
rewrite_links
|
||||
|
||||
class Package(object):
|
||||
|
||||
@ -29,18 +36,69 @@ class Package(object):
|
||||
self.new_base_path = os.path.abspath(base)
|
||||
|
||||
def rewrite_links_in(self, item):
|
||||
new_items = []
|
||||
return new_items
|
||||
base = os.path.join(self.new_base_path, *item.href.split('/'))
|
||||
base = os.path.dirname(base)
|
||||
|
||||
if etree.iselement(item.data):
|
||||
self.rewrite_links_in_xml(item.data, base)
|
||||
elif hasattr(item.data, 'cssText'):
|
||||
self.rewrite_links_in_css(item.data, base)
|
||||
|
||||
def link_replacer(self, link_, base=''):
|
||||
link = urlnormalize(link_)
|
||||
link, frag = urldefrag(link)
|
||||
link = urlunquote(link).replace('/', os.sep)
|
||||
if base and not os.path.isabs(link):
|
||||
link = os.path.join(base, link)
|
||||
link = os.path.abspath(link)
|
||||
if not islinux:
|
||||
link = link.lower()
|
||||
if link not in self.map:
|
||||
return link_
|
||||
nlink = os.path.relpath(self.map[link], base)
|
||||
if frag:
|
||||
nlink = '#'.join(nlink, frag)
|
||||
return nlink.replace(os.sep, '/')
|
||||
|
||||
def rewrite_links_in_css(self, sheet, base):
|
||||
repl = partial(self.link_replacer, base=base)
|
||||
cssutils.replaceUrls(sheet, repl)
|
||||
|
||||
def rewrite_links_in_xml(self, root, base):
|
||||
repl = partial(self.link_replacer, base=base)
|
||||
rewrite_links(root, repl)
|
||||
|
||||
def move_manifest_item(self, item):
|
||||
item.data # Make sure the data has been loaded and cached
|
||||
old_abspath = os.path.join(self.old_base_path, *item.href.split('/'))
|
||||
bname = item.href.split('/')[-1]
|
||||
new_href = 'content/' + \
|
||||
('resources/' if item.media_type in OEB_DOCS else '')+bname
|
||||
old_abspath = os.path.join(self.old_base_path,
|
||||
*(urldefrag(item.href)[0].split('/')))
|
||||
old_abspath = os.path.abspath(old_abspath)
|
||||
bname = item.href.split('/')[-1].partition('#')[0]
|
||||
new_href = 'content/resources/'
|
||||
if item.media_type in OEB_DOCS:
|
||||
new_href = 'content/'
|
||||
elif item.href.lower().endswith('.ncx'):
|
||||
new_href = ''
|
||||
new_href += bname
|
||||
|
||||
new_abspath = os.path.join(self.new_base_path, *new_href.split('/'))
|
||||
new_abspath = os.path.abspath(new_abspath)
|
||||
item.href = new_href
|
||||
if not islinux:
|
||||
old_abspath, new_abspath = old_abspath.lower(), new_abspath.lower()
|
||||
if old_abspath != new_abspath:
|
||||
self.map[old_abspath] = new_abspath
|
||||
|
||||
def rewrite_links_in_toc(self, toc):
|
||||
if toc.href:
|
||||
toc.href = self.link_replacer(toc.href, base=self.new_base_path)
|
||||
|
||||
for x in toc:
|
||||
self.rewrite_links_in_toc(x)
|
||||
|
||||
def __call__(self, oeb, context):
|
||||
self.map = {}
|
||||
self.log = self.oeb.log
|
||||
self.old_base_path = os.path.abspath(oeb.container.rootdir)
|
||||
|
||||
for item in self.oeb.manifest:
|
||||
@ -49,4 +107,9 @@ class Package(object):
|
||||
for item in self.oeb.manifest:
|
||||
self.rewrite_links_in(item)
|
||||
|
||||
if getattr(oeb.toc, 'nodes', False):
|
||||
self.rewrite_links_in_toc(oeb.toc)
|
||||
|
||||
if hasattr(oeb, 'guide'):
|
||||
for ref in oeb.guide.values():
|
||||
ref.href = self.link_replacer(ref.href, base=self.new_base_path)
|
||||
|
@ -6,11 +6,12 @@ from __future__ import with_statement
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
||||
|
||||
from itertools import chain
|
||||
from urlparse import urldefrag
|
||||
|
||||
import cssutils
|
||||
|
||||
from calibre.ebooks.oeb.base import CSS_MIME, OEB_DOCS
|
||||
from calibre.ebooks.oeb.base import LINK_SELECTORS, CSSURL_RE
|
||||
from calibre.ebooks.oeb.base import urlnormalize
|
||||
from calibre.ebooks.oeb.base import urlnormalize, iterlinks
|
||||
|
||||
class ManifestTrimmer(object):
|
||||
@classmethod
|
||||
@ -44,16 +45,15 @@ class ManifestTrimmer(object):
|
||||
if (item.media_type in OEB_DOCS or
|
||||
item.media_type[-4:] in ('/xml', '+xml')) and \
|
||||
item.data is not None:
|
||||
hrefs = [sel(item.data) for sel in LINK_SELECTORS]
|
||||
for href in chain(*hrefs):
|
||||
hrefs = [r[2] for r in iterlinks(item.data)]
|
||||
for href in hrefs:
|
||||
href = item.abshref(urlnormalize(href))
|
||||
if href in oeb.manifest.hrefs:
|
||||
found = oeb.manifest.hrefs[href]
|
||||
if found not in used:
|
||||
new.add(found)
|
||||
elif item.media_type == CSS_MIME:
|
||||
for match in CSSURL_RE.finditer(item.data.cssText):
|
||||
href = match.group('url')
|
||||
for href in cssutils.getUrls(item.data):
|
||||
href = item.abshref(urlnormalize(href))
|
||||
if href in oeb.manifest.hrefs:
|
||||
found = oeb.manifest.hrefs[href]
|
||||
|
@ -22,9 +22,6 @@ entry_points = {
|
||||
'web2disk = calibre.web.fetch.simple:main',
|
||||
'feeds2disk = calibre.web.feeds.main:main',
|
||||
'calibre-server = calibre.library.server:main',
|
||||
'feeds2lrf = calibre.ebooks.lrf.feeds.convert_from:main',
|
||||
'feeds2epub = calibre.ebooks.epub.from_feeds:main',
|
||||
'feeds2mobi = calibre.ebooks.mobi.from_feeds:main',
|
||||
'web2lrf = calibre.ebooks.lrf.web.convert_from:main',
|
||||
'lrf2lrs = calibre.ebooks.lrf.lrfparser:main',
|
||||
'lrs2lrf = calibre.ebooks.lrf.lrs.convert_from:main',
|
||||
@ -154,10 +151,7 @@ def setup_completion(fatal_errors):
|
||||
from calibre.ebooks.lrf.pdf.reflow import option_parser as pdfhtmlop
|
||||
from calibre.web.feeds.main import option_parser as feeds2disk
|
||||
from calibre.web.feeds.recipes import titles as feed_titles
|
||||
from calibre.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
|
||||
from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop
|
||||
from calibre.ebooks.epub.from_feeds import option_parser as feeds2epub
|
||||
from calibre.ebooks.mobi.from_feeds import option_parser as feeds2mobi
|
||||
from calibre.ebooks.epub.from_comic import option_parser as comic2epub
|
||||
from calibre.ebooks.metadata.fetch import option_parser as fem_op
|
||||
from calibre.gui2.main import option_parser as guiop
|
||||
@ -192,9 +186,6 @@ def setup_completion(fatal_errors):
|
||||
f.write(opts_and_exts('comic2mobi', comic2epub, ['cbz', 'cbr']))
|
||||
f.write(opts_and_exts('comic2pdf', comic2epub, ['cbz', 'cbr']))
|
||||
f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
|
||||
f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
|
||||
f.write(opts_and_words('feeds2epub', feeds2epub, feed_titles))
|
||||
f.write(opts_and_words('feeds2mobi', feeds2mobi, feed_titles))
|
||||
f.write(opts_and_words('fetch-ebook-metadata', fem_op, []))
|
||||
f.write(opts_and_words('calibre-smtp', smtp_op, []))
|
||||
f.write('''
|
||||
|
Loading…
x
Reference in New Issue
Block a user