Untested implementation of HTML input. Uses a new transform that 'packages' an OEB book into a folder structure (the same folder structure that was used in the old codebase for EPUB output). This may have broken other things, so use with care.

2025-07-09 03:04:10 -04:00 · 2009-04-08 17:44:29 -07:00 · 2009-04-08 17:44:29 -07:00 · 093b98a9f1
commit 093b98a9f1
parent b2bfab32cf
17 changed files with 609 additions and 206 deletions
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@ -122,8 +122,9 @@ class InputFormatPlugin(Plugin):
    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        This method must be implemented in sub-classes. It must return
-        the path to the created OPF file. All output should be contained in
+        the path to the created OPF file or an :class:`OEBBook` instance.
-        the current directory. If this plugin creates files outside the current
+        All output should be contained in the current directory.
        If this plugin creates files outside the current
        directory they must be deleted/marked for deletion before this method
        returns.
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -299,21 +299,15 @@ OptionRecommendation(name='language',
        # Create an OEBBook from the input file. The input plugin does all the
        # heavy lifting.
        from calibre.ebooks.oeb.reader import OEBReader
        from calibre.ebooks.oeb.base import OEBBook
        accelerators = {}
        tdir = PersistentTemporaryDirectory('_plumber')
-        opfpath = self.input_plugin(open(self.input, 'rb'), self.opts,
+        self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts,
                                    self.input_fmt, self.log,
                                    accelerators, tdir)
-        html_preprocessor = HTMLPreProcessor()
+        if not hasattr(self.oeb, 'manifest'):
-        self.reader = OEBReader()
+            self.oeb = create_oebbook(self.log, self.oeb)
        self.oeb = OEBBook(self.log, html_preprocessor=html_preprocessor)
        # Read OEB Book into OEBBook
        self.log.info('Parsing all content...')
        self.reader(self.oeb, opfpath)
        self.opts.source = self.opts.input_profile
        self.opts.dest = self.opts.output_profile
@ -340,7 +334,20 @@ OptionRecommendation(name='language',
        trimmer(self.oeb, self.opts)
        self.log.info('Creating %s...'%self.output_plugin.name)
-        self.output_plugin.convert(self.oeb, self.output, self.input_plugin, self.opts,
+        self.output_plugin.convert(self.oeb, self.output, self.input_plugin,
-                self.log)
+                self.opts, self.log)
 def create_oebbook(log, opfpath):
    '''
    Parse the OPF file at `opfpath` and return the resulting OEBBook.

    :param log:     Logger passed to the new OEBBook; also used to report
                    progress.
    :param opfpath: Path to the OPF file describing the book.
    :return:        The OEBBook instance handed to the reader.
    '''
    # These imports are deliberately local to the function, as before.
    from calibre.ebooks.oeb.reader import OEBReader
    from calibre.ebooks.oeb.base import OEBBook
    preprocessor = HTMLPreProcessor()
    parse = OEBReader()
    book = OEBBook(log, html_preprocessor=preprocessor)
    log.info('Parsing all content...')
    # The reader is invoked for its effect on `book`; its return value is unused
    parse(book, opfpath)
    return book
--- a/src/calibre/ebooks/epub/init.py
+++ b/src/calibre/ebooks/epub/init.py
@ -10,7 +10,7 @@ import sys, textwrap, re, os, uuid
 from itertools import cycle
 from calibre.utils.config import Config, StringConfig
 from calibre.utils.zipfile import ZipFile, ZIP_STORED
-from calibre.ebooks.html import config as common_config, tostring
+from calibre.ebooks.html import tostring
 from lxml import etree
 class DefaultProfile(object):
--- a/src/calibre/ebooks/epub/fonts.py
+++ b/src/calibre/ebooks/epub/fonts.py
@ -14,7 +14,7 @@ from lxml.cssselect import CSSSelector
 from lxml import etree
 from lxml.html import HtmlElement
-from calibre.ebooks.html import fromstring
+from calibre.ebooks.html_old import fromstring
 from calibre.ebooks.epub import rules
 from cssutils import CSSParser
--- a/src/calibre/ebooks/epub/from_html.py
+++ b/src/calibre/ebooks/epub/from_html.py
@ -38,7 +38,7 @@ from lxml.etree import XPath
 from lxml import html, etree
 from PyQt4.Qt import QApplication, QPixmap
-from calibre.ebooks.html import Processor, merge_metadata, get_filelist,\
+from calibre.ebooks.html_old import Processor, merge_metadata, get_filelist,\
    opf_traverse, create_metadata, rebase_toc, Link, parser
 from calibre.ebooks.epub import config as common_config, tostring
 from calibre.ptempfile import TemporaryDirectory
--- a/src/calibre/ebooks/epub/iterator.py
+++ b/src/calibre/ebooks/epub/iterator.py
@ -16,7 +16,7 @@ from calibre.ebooks.epub import config
 from calibre.ebooks.metadata.opf2 import OPF
 from calibre.ptempfile import TemporaryDirectory
 from calibre.ebooks.chardet import xml_to_unicode
-from calibre.ebooks.html import create_dir
+from calibre.ebooks.html_old import create_dir
 from calibre.utils.zipfile import safe_replace, ZipFile
 from calibre.utils.config import DynamicConfig
--- a/src/calibre/ebooks/epub/split.py
+++ b/src/calibre/ebooks/epub/split.py
@ -7,7 +7,7 @@ __docformat__ = 'restructuredtext en'
 Split the flows in an epub file to conform to size limitations.
 '''
-import os, math, logging, functools, collections, re, copy, sys
+import os, math, functools, collections, re, copy, sys
 from lxml.etree import XPath as _XPath
 from lxml import etree, html
--- a/src/calibre/ebooks/html/init.py
+++ b/src/calibre/ebooks/html/init.py
@ -0,0 +1,30 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 import re
 from lxml.etree import tostring as _tostring
 def tostring(root, strip_comments=False, pretty_print=False):
    '''
    Serialize processed XHTML.

    Note that the tree is modified: namespace attributes are set on `root`
    and on every ``svg`` element before serialization.

    :param root:           Root element of the tree to serialize.
    :param strip_comments: If True, remove ``<!-- ... -->`` comments from the
                           serialized output.
    :param pretty_print:   Passed through to the lxml serializer.
    :return:               UTF-8 encoded markup, prefixed with an XML
                           declaration.
    '''
    root.set('xmlns', 'http://www.w3.org/1999/xhtml')
    root.set('{http://www.w3.org/1999/xhtml}xlink', 'http://www.w3.org/1999/xlink')
    for x in root.iter():
        # iter() also yields comments and processing instructions, whose tag
        # is not a string; calling rpartition on it would raise
        if not isinstance(x.tag, basestring):
            continue
        if x.tag.rpartition('}')[-1].lower() == 'svg':
            x.set('xmlns', 'http://www.w3.org/2000/svg')
    ans = _tostring(root, encoding='utf-8', pretty_print=pretty_print)
    if strip_comments:
        # DOTALL so that comments spanning several lines are removed as well
        ans = re.compile(r'<!--.*?-->', re.DOTALL).sub('', ans)
    ans = '<?xml version="1.0" encoding="utf-8" ?>\n'+ans
    return ans
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@ -0,0 +1,342 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 '''
 Input plugin for HTML or OPF ebooks.
 '''
 import os, re, sys, cStringIO
 from urlparse import urlparse, urlunparse
 from urllib import unquote
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.ebooks.metadata.meta import get_metadata
 from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
 from calibre.ebooks.metadata import MetaInformation
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.customize.conversion import OptionRecommendation
 from calibre import unicode_path
 class Link(object):
    '''
    Represents a link in a HTML file.

    Links that resolve to a file on disk are identified by :attr:`path`.
    All other links (non local URLs and pure fragment links) have a `path` of
    None and are identified by :attr:`url` instead.
    '''

    @classmethod
    def url_to_local_path(cls, url, base):
        '''
        Convert a parsed URL into an absolute filesystem path.

        :param url:  A parsed URL; only its path, params and query components
                     are used.
        :param base: Directory that relative paths are resolved against.
        '''
        path = urlunparse(('', '', url.path, url.params, url.query, ''))
        path = unquote(path)
        if os.path.isabs(path):
            return path
        return os.path.abspath(os.path.join(base, path))

    def __init__(self, url, base):
        '''
        :param url:  The url this link points to. Must be an unquoted unicode string.
        :param base: The base directory that relative URLs are with respect to.
                     Must be a unicode string.
        '''
        assert isinstance(url, unicode) and isinstance(base, unicode)
        self.url         = url
        self.parsed_url  = urlparse(self.url)
        self.is_local    = self.parsed_url.scheme in ('', 'file')
        # A local link without a path component only carries a fragment
        self.is_internal = self.is_local and not bool(self.parsed_url.path)
        self.path        = None
        self.fragment    = unquote(self.parsed_url.fragment)
        if self.is_local and not self.is_internal:
            self.path = self.url_to_local_path(self.parsed_url, base)

    def __hash__(self):
        if self.path is None:
            return hash(self.url)
        return hash(self.path)

    def __eq__(self, other):
        if self.path is None:
            # Keep equality consistent with __hash__: links without a local
            # path are distinguished by URL. Comparing only `path` would make
            # every such link equal to every other one.
            return isinstance(other, Link) and other.path is None and \
                    self.url == other.url
        # `other` may be a Link, any object with a `path`, or a plain path
        return self.path == getattr(other, 'path', other)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __str__(self):
        return u'Link: %s --> %s'%(self.url, self.path)
 class IgnoreFile(Exception):
    '''
    Signals that a file should be left out while collecting HTML files.

    :attr errno:        The error number supplied by the raiser.
    :attr doesnt_exist: True exactly when `errno` equals 2 (presumably
                        ENOENT, i.e. the file is missing).
    '''

    def __init__(self, msg, errno):
        super(IgnoreFile, self).__init__(msg)
        self.errno = errno
        self.doesnt_exist = (2 == errno)
 class HTMLFile(object):
    '''
    Contains basic information about an HTML file. This
    includes a list of links to other files as well as
    the encoding of each file. Also tries to detect if the file is not a HTML
    file in which case :member:`is_binary` is set to True.
    The encoding of the file is available as :member:`encoding`.
    '''
    HTML_PAT  = re.compile(r'<\s*html', re.IGNORECASE)
    TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
    LINK_PAT  = re.compile(
    r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
    re.DOTALL|re.IGNORECASE)
    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
        '''
        :param level: The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML.
        :param referrer: The :class:`HTMLFile` that first refers to this file.
        '''
        self.path     = unicode_path(path_to_html_file, abs=True)
        self.title    = os.path.splitext(os.path.basename(self.path))[0]
        self.base     = os.path.dirname(self.path)
        self.level    = level
        self.referrer = referrer
        self.links    = []
        try:
            with open(self.path, 'rb') as f:
                src = f.read()
        except IOError, err:
            msg = 'Could not read from file: %s with error: %s'%(self.path, unicode(err))
            if level == 0:
                raise IOError(msg)
            raise IgnoreFile(msg, err.errno)
        self.is_binary = not bool(self.HTML_PAT.search(src[:1024]))
        if not self.is_binary:
            if encoding is None:
                encoding = xml_to_unicode(src[:4096], verbose=verbose)[-1]
                self.encoding = encoding
            else:
                self.encoding = encoding
            src = src.decode(encoding, 'replace')
            match = self.TITLE_PAT.search(src)
            self.title = match.group(1) if match is not None else self.title
            self.find_links(src)
    def __eq__(self, other):
        return self.path == getattr(other, 'path', other)
    def __str__(self):
        return u'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)
    def __repr__(self):
        return str(self)
    def find_links(self, src):
        for match in self.LINK_PAT.finditer(src):
            url = None
            for i in ('url1', 'url2', 'url3'):
                url = match.group(i)
                if url:
                    break
            link = self.resolve(url)
            if link not in self.links:
                self.links.append(link)
    def resolve(self, url):
        return Link(url, self.base)
 def depth_first(root, flat, visited=None):
    '''
    Yield `root` and then the files reachable from it, in depth first order.

    :param root:    The file to start from. Must have a `links` attribute.
    :param flat:    List of all known files. A link is followed only if it
                    compares equal to an entry of this list.
    :param visited: Set of files that have already been yielded. Used by the
                    recursion; callers normally leave it as None, in which
                    case every call starts with a fresh, empty set.
    '''
    if visited is None:
        # A set created in the signature would be shared by all calls
        visited = set()
    yield root
    visited.add(root)
    for link in root.links:
        if link.path is not None and link not in visited:
            try:
                index = flat.index(link)
            except ValueError: # Can happen if max_levels is used
                continue
            hf = flat[index]
            if hf not in visited:
                yield hf
                visited.add(hf)
                # The recursive generator yields hf itself first, hence the
                # visited check on everything it produces
                for hf in depth_first(hf, flat, visited):
                    if hf not in visited:
                        yield hf
                        visited.add(hf)
 def traverse(path_to_html_file, max_levels=sys.maxint, verbose=0, encoding=None):
    '''
    Recursively traverse all links in the HTML file.
    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
                       implies that no links in the root HTML file are followed.
    :param encoding:   Specify character encoding of HTML files. If `None` it is
                       auto-detected.
    :return:           A pair of lists (breadth_first, depth_first). Each list contains
                       :class:`HTMLFile` objects.
    '''
    assert max_levels >= 0
    level = 0
    flat =  [HTMLFile(path_to_html_file, level, encoding, verbose)]
    next_level = list(flat)
    while level < max_levels and len(next_level) > 0:
        level += 1
        nl = []
        for hf in next_level:
            rejects = []
            for link in hf.links:
                if link.path is None or link.path in flat:
                    continue
                try:
                    nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
                    if nf.is_binary:
                        raise IgnoreFile('%s is a binary file'%nf.path, -1)
                    nl.append(nf)
                    flat.append(nf)
                except IgnoreFile, err:
                    rejects.append(link)
                    if not err.doesnt_exist or verbose > 1:
                        print repr(err)
            for link in rejects:
                hf.links.remove(link)
        next_level = list(nl)
    orec = sys.getrecursionlimit()
    sys.setrecursionlimit(500000)
    try:
        return flat, list(depth_first(flat[0], flat))
    finally:
        sys.setrecursionlimit(orec)
 def opf_traverse(opf_reader, verbose=0, encoding=None):
    '''
    Return a list of :class:`HTMLFile` objects in the order specified by the
    `<spine>` element of the OPF.
    :param opf_reader: An :class:`calibre.ebooks.metadata.opf2.OPF` instance.
    :param encoding:   Specify character encoding of HTML files. If `None` it is
                       auto-detected.
    '''
    if not opf_reader.spine:
        raise ValueError('OPF does not have a spine')
    flat = []
    for path in opf_reader.spine.items():
        path = os.path.abspath(path)
        if path not in flat:
            flat.append(os.path.abspath(path))
    for item in opf_reader.manifest:
        if 'html' in item.mime_type:
            path = os.path.abspath(item.path)
            if path not in flat:
                flat.append(path)
    for i, path in enumerate(flat):
        if not os.path.exists(path):
            path = path.replace('&', '%26')
            if os.path.exists(path):
                flat[i] = path
                for item in opf_reader.itermanifest():
                    item.set('href', item.get('href').replace('&', '%26'))
    ans = []
    for path in flat:
        if os.path.exists(path):
            ans.append(HTMLFile(path, 0, encoding, verbose))
        else:
            print 'WARNING: OPF spine item %s does not exist'%path
    ans = [f for f in ans if not f.is_binary]
    return ans
 def search_for_opf(dir):
    '''
    Return an OPF object for the first entry of `dir` whose name ends in
    '.opf' (case insensitive), or None if there is no such entry.
    '''
    opf_names = (name for name in os.listdir(dir)
                    if name.lower().endswith('.opf'))
    for name in opf_names:
        return OPF(open(os.path.join(dir, name), 'rb'), dir)
    return None
 def get_filelist(htmlfile, dir, opts, log):
    '''
    Build list of files referenced by html file or try to detect and use an
    OPF file instead.
    '''
    print 'Building file list...'
    opf = search_for_opf(dir)
    filelist = None
    if opf is not None:
        try:
            filelist = opf_traverse(opf, verbose=opts.verbose,
                    encoding=opts.input_encoding)
        except:
            pass
    if not filelist:
        filelist = traverse(htmlfile, max_levels=int(opts.max_levels),
                            verbose=opts.verbose,
                            encoding=opts.input_encoding)\
                    [0 if opts.breadth_first else 1]
    if opts.verbose:
        log.debug('\tFound files...')
        for f in filelist:
            log.debug('\t\t', f)
    return opf, filelist
 class HTMLInput(InputFormatPlugin):
    # Input plugin that turns a HTML file (plus the files it links to) or an
    # OPF file into an OEB book.
    name        = 'HTML Input'
    author      = 'Kovid Goyal'
    description = 'Convert HTML and OPF files to an OEB'
    file_types  = set(['opf', 'html', 'htm', 'xhtml', 'xhtm'])
    options = set([
        OptionRecommendation(name='breadth_first',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Traverse links in HTML files breadth first. Normally, '
                    'they are traversed depth first.'
                   )
        ),
        OptionRecommendation(name='max_levels',
            recommended_value=5, level=OptionRecommendation.LOW,
            help=_('Maximum levels of recursion when following links in '
                   'HTML files. Must be non-negative. 0 implies that no '
                   'links in the root HTML file are followed. Default is '
                   '%default.'
                   )
        ),
    ])

    def convert(self, stream, opts, file_ext, log,
                accelerators):
        '''
        Write ``metadata.opf`` (and ``toc.ncx`` if a table of contents was
        rendered) into the current directory, listing the HTML files that
        make up the book, and return the OEBBook created from that OPF.

        :param stream:   The opened input file. For OPF input it is parsed
                         directly; otherwise its `name` is used as the root
                         HTML file.
        :param opts:     Conversion options.
        :param file_ext: Extension of the input file; 'opf' selects OPF input.
        :param log:      Logger.
        '''
        basedir = os.getcwd()
        if hasattr(stream, 'name'):
            basedir = os.path.dirname(stream.name)
        if file_ext == 'opf':
            opf = OPF(stream, basedir)
            filelist = opf_traverse(opf, verbose=opts.verbose,
                    encoding=opts.input_encoding)
            mi = MetaInformation(opf)
        else:
            opf, filelist = get_filelist(stream.name, basedir, opts, log)
            mi = MetaInformation(opf)
            mi.smart_update(get_metadata(stream, 'html'))

        mi = OPFCreator(os.getcwdu(), mi)
        mi.guide = None
        entries = [(f.path, 'application/xhtml+xml') for f in filelist]
        mi.create_manifest(entries)
        mi.create_spine([f.path for f in filelist])

        tocbuf = cStringIO.StringIO()
        # Close the files explicitly: metadata.opf is read back right below,
        # so its contents must be on disk by then
        with open('metadata.opf', 'wb') as opf_stream:
            mi.render(opf_stream, tocbuf, 'toc.ncx')
        toc = tocbuf.getvalue()
        if toc:
            with open('toc.ncx', 'wb') as ncx_stream:
                ncx_stream.write(toc)

        from calibre.ebooks.conversion.plumber import create_oebbook
        return create_oebbook(log, os.path.abspath('metadata.opf'))
--- a/src/calibre/ebooks/html_old.py
+++ b/src/calibre/ebooks/html_old.py
--- a/src/calibre/ebooks/metadata/opf2.py
+++ b/src/calibre/ebooks/metadata/opf2.py
@ -683,26 +683,6 @@ class OPF(object):
        return property(fget=fget, fset=fset)
    @dynamic_property
    def title_sort(self):
        # Sort form of the title, kept in the `file-as` attribute of the
        # title element(s) located by self.title_path.

        def fget(self):
            # First non-empty value wins; the attribute in the OPF namespace
            # is preferred over the plain one. Returns None if nothing found.
            for node in self.title_path(self.metadata) or ():
                value = node.get('{%s}file-as'%self.NAMESPACES['opf'], None)
                value = value or node.get('file-as', None)
                if value:
                    return value

        def fset(self, val):
            # Only the first title element is updated
            nodes = self.title_path(self.metadata)
            if nodes:
                nodes[0].set('file-as', unicode(val))

        return property(fget=fget, fset=fset)
    @dynamic_property
    def tags(self):
@ -943,9 +923,10 @@ class OPFCreator(MetaInformation):
        from calibre.resources import opf_template
        from calibre.utils.genshi.template import MarkupTemplate
        template = MarkupTemplate(opf_template)
        toc = getattr(self, 'toc', None)
        if self.manifest:
            self.manifest.set_basedir(self.base_path)
-            if ncx_manifest_entry is not None:
+            if ncx_manifest_entry is not None and toc is not None:
                if not os.path.isabs(ncx_manifest_entry):
                    ncx_manifest_entry = os.path.join(self.base_path, ncx_manifest_entry)
                remove = [i for i in self.manifest if i.id == 'ncx']
@ -965,7 +946,6 @@ class OPFCreator(MetaInformation):
        opf = template.generate(__appname__=__appname__, mi=self, __version__=__version__).render('xml')
        opf_stream.write(opf)
        opf_stream.flush()
        toc = getattr(self, 'toc', None)
        if toc is not None and ncx_stream is not None:
            toc.render(ncx_stream, self.application_id)
            ncx_stream.flush()
@ -1030,17 +1010,6 @@ class OPFTest(unittest.TestCase):
        self.opf.smart_update(MetaInformation(self.opf))
        self.testReading()
    def testCreator(self):
        # Round trip: render the OPF through OPFCreator, parse the rendered
        # output again and run the reading checks on the result.
        creator = OPFCreator(os.getcwd(), self.opf)
        rendered = cStringIO.StringIO()
        creator.render(rendered)
        reparsed = OPF(cStringIO.StringIO(rendered.getvalue()), os.getcwd())
        self.testReading(opf=reparsed)
    def testSmartUpdate(self):
        # Updating an OPF from itself must leave the values checked by
        # testReading intact.
        book = self.opf
        book.smart_update(book)
        self.testReading()
 def suite():
    '''Return a test suite holding all the tests of OPFTest.'''
    loader = unittest.TestLoader()
    return loader.loadTestsFromTestCase(OPFTest)
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@ -29,5 +29,5 @@ class MOBIInput(InputFormatPlugin):
            with open(f, 'wb') as q:
                q.write(html.tostring(root, encoding='utf-8', method='xml',
                    include_meta_content_type=False))
-            accelerators['pagebreaks'] = {f: '//div[@class="mbp_pagebreak"]'}
+            accelerators['pagebreaks'] = {f: '//*[@class="mbp_pagebreak"]'}
        return mr.created_opf_path
--- a/src/calibre/ebooks/mobi/reader.py
+++ b/src/calibre/ebooks/mobi/reader.py
@ -522,7 +522,7 @@ class MobiReader(object):
        else:
            raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type))
        if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
-            self.mobi_html = self.mobi_html.replace('\r ', '\n\n ')
+            self.mobi_html = self.mobi_html.replace('\r ', '\n\n').replace('\0', '')
        return processed_records
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -151,7 +151,7 @@ def resolve_base_href(root):
        return
    make_links_absolute(root, base_href, resolve_base_href=False)
-def rewrite_links(root, link_repl_func, resolve_base_href=True):
+def rewrite_links(root, link_repl_func, resolve_base_href=False):
    '''
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called, and the return value
--- a/src/calibre/ebooks/oeb/transforms/package.py
+++ b/src/calibre/ebooks/oeb/transforms/package.py
@ -6,9 +6,16 @@ __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
-import os, shutil
+import os
 from urllib import unquote as urlunquote
 from functools import partial
-from calibre.ebooks.oeb.base import OEB_DOCS
+from lxml import etree
 import cssutils
 from calibre.constants import islinux
 from calibre.ebooks.oeb.base import OEB_DOCS, urlnormalize, urldefrag, \
                                    rewrite_links
 class Package(object):
@ -29,18 +36,69 @@ class Package(object):
        self.new_base_path = os.path.abspath(base)
    def rewrite_links_in(self, item):
-        new_items = []
+        base = os.path.join(self.new_base_path, *item.href.split('/'))
-        return new_items
+        base = os.path.dirname(base)
        if etree.iselement(item.data):
            self.rewrite_links_in_xml(item.data, base)
        elif hasattr(item.data, 'cssText'):
            self.rewrite_links_in_css(item.data, base)
    def link_replacer(self, link_, base=''):
        '''
        Return the replacement for the link `link_`.

        The link is resolved to an absolute path (relative to `base` if it is
        not absolute already) and looked up in `self.map`. Links that are not
        in the map are returned unchanged. Otherwise the result is the mapped
        location expressed relative to `base`, using '/' as separator, with
        the original fragment (if any) appended.

        :param link_: The link as it appears in the document.
        :param base:  Directory of the document that contains the link.
        '''
        link = urlnormalize(link_)
        link, frag = urldefrag(link)
        link = urlunquote(link).replace('/', os.sep)
        if base and not os.path.isabs(link):
            link = os.path.join(base, link)
        link = os.path.abspath(link)
        if not islinux:
            # The keys of self.map are lower cased on these platforms as well
            link = link.lower()
        if link not in self.map:
            return link_
        # Normalise separators before the fragment is appended, so that the
        # fragment itself is never altered
        nlink = os.path.relpath(self.map[link], base).replace(os.sep, '/')
        if frag:
            # str.join takes a single iterable, not separate arguments
            nlink = '#'.join((nlink, frag))
        return nlink
    def rewrite_links_in_css(self, sheet, base):
        '''
        Rewrite the URLs of the CSS stylesheet `sheet` with
        :meth:`link_replacer`, resolving relative links against `base`.
        '''
        cssutils.replaceUrls(sheet, partial(self.link_replacer, base=base))
    def rewrite_links_in_xml(self, root, base):
        '''
        Rewrite the links of the XML tree `root` with :meth:`link_replacer`,
        resolving relative links against `base`.
        '''
        rewrite_links(root, partial(self.link_replacer, base=base))
    def move_manifest_item(self, item):
        item.data # Make sure the data has been loaded and cached
-        old_abspath = os.path.join(self.old_base_path, *item.href.split('/'))
+        old_abspath = os.path.join(self.old_base_path,
-        bname = item.href.split('/')[-1]
+                *(urldefrag(item.href)[0].split('/')))
-        new_href = 'content/' + \
+        old_abspath = os.path.abspath(old_abspath)
-                ('resources/' if item.media_type in OEB_DOCS else '')+bname
+        bname = item.href.split('/')[-1].partition('#')[0]
        new_href = 'content/resources/'
        if item.media_type in OEB_DOCS:
            new_href = 'content/'
        elif item.href.lower().endswith('.ncx'):
            new_href = ''
        new_href += bname
        new_abspath = os.path.join(self.new_base_path, *new_href.split('/'))
        new_abspath = os.path.abspath(new_abspath)
        item.href   = new_href
        if not islinux:
            old_abspath, new_abspath = old_abspath.lower(), new_abspath.lower()
        if old_abspath != new_abspath:
            self.map[old_abspath] = new_abspath
    def rewrite_links_in_toc(self, toc):
        '''
        Recursively rewrite the href of `toc` and of all its descendants with
        :meth:`link_replacer`, relative to the new base path. Nodes with an
        empty href are left untouched.
        '''
        href = toc.href
        if href:
            toc.href = self.link_replacer(href, base=self.new_base_path)
        for child in toc:
            self.rewrite_links_in_toc(child)
    def __call__(self, oeb, context):
        self.map = {}
        self.log = self.oeb.log
        self.old_base_path = os.path.abspath(oeb.container.rootdir)
        for item in self.oeb.manifest:
@ -49,4 +107,9 @@ class Package(object):
        for item in self.oeb.manifest:
            self.rewrite_links_in(item)
        if getattr(oeb.toc, 'nodes', False):
            self.rewrite_links_in_toc(oeb.toc)
        if hasattr(oeb, 'guide'):
            for ref in oeb.guide.values():
                ref.href = self.link_replacer(ref.href, base=self.new_base_path)
--- a/src/calibre/ebooks/oeb/transforms/trimmanifest.py
+++ b/src/calibre/ebooks/oeb/transforms/trimmanifest.py
@ -6,11 +6,12 @@ from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
 from itertools import chain
 from urlparse import urldefrag
 import cssutils
 from calibre.ebooks.oeb.base import CSS_MIME, OEB_DOCS
-from calibre.ebooks.oeb.base import LINK_SELECTORS, CSSURL_RE
+from calibre.ebooks.oeb.base import urlnormalize, iterlinks
 from calibre.ebooks.oeb.base import urlnormalize
 class ManifestTrimmer(object):
    @classmethod
@ -44,16 +45,15 @@ class ManifestTrimmer(object):
                if (item.media_type in OEB_DOCS or
                    item.media_type[-4:] in ('/xml', '+xml')) and \
                   item.data is not None:
-                    hrefs = [sel(item.data) for sel in LINK_SELECTORS]
+                    hrefs = [r[2] for r in iterlinks(item.data)]
-                    for href in chain(*hrefs):
+                    for href in hrefs:
                        href = item.abshref(urlnormalize(href))
                        if href in oeb.manifest.hrefs:
                            found = oeb.manifest.hrefs[href]
                            if found not in used:
                                new.add(found)
                elif item.media_type == CSS_MIME:
-                    for match in CSSURL_RE.finditer(item.data.cssText):
+                    for href in cssutils.getUrls(item.data):
                        href = match.group('url')
                        href = item.abshref(urlnormalize(href))
                        if href in oeb.manifest.hrefs:
                            found = oeb.manifest.hrefs[href]
--- a/src/calibre/linux.py
+++ b/src/calibre/linux.py
@ -22,9 +22,6 @@ entry_points = {
             'web2disk           = calibre.web.fetch.simple:main',
             'feeds2disk         = calibre.web.feeds.main:main',
             'calibre-server     = calibre.library.server:main',
             'feeds2lrf          = calibre.ebooks.lrf.feeds.convert_from:main',
             'feeds2epub         = calibre.ebooks.epub.from_feeds:main',
             'feeds2mobi         = calibre.ebooks.mobi.from_feeds:main',
             'web2lrf            = calibre.ebooks.lrf.web.convert_from:main',
             'lrf2lrs            = calibre.ebooks.lrf.lrfparser:main',
             'lrs2lrf            = calibre.ebooks.lrf.lrs.convert_from:main',
@ -154,10 +151,7 @@ def setup_completion(fatal_errors):
        from calibre.ebooks.lrf.pdf.reflow import option_parser as pdfhtmlop
        from calibre.web.feeds.main import option_parser as feeds2disk
        from calibre.web.feeds.recipes import titles as feed_titles
        from calibre.ebooks.lrf.feeds.convert_from import option_parser as feeds2lrf
        from calibre.ebooks.lrf.comic.convert_from import option_parser as comicop
        from calibre.ebooks.epub.from_feeds import option_parser as feeds2epub
        from calibre.ebooks.mobi.from_feeds import option_parser as feeds2mobi
        from calibre.ebooks.epub.from_comic import option_parser as comic2epub
        from calibre.ebooks.metadata.fetch import option_parser as fem_op
        from calibre.gui2.main import option_parser as guiop
@ -192,9 +186,6 @@ def setup_completion(fatal_errors):
        f.write(opts_and_exts('comic2mobi', comic2epub, ['cbz', 'cbr']))
        f.write(opts_and_exts('comic2pdf', comic2epub, ['cbz', 'cbr']))
        f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
        f.write(opts_and_words('feeds2lrf', feeds2lrf, feed_titles))
        f.write(opts_and_words('feeds2epub', feeds2epub, feed_titles))
        f.write(opts_and_words('feeds2mobi', feeds2mobi, feed_titles))
        f.write(opts_and_words('fetch-ebook-metadata', fem_op, []))
        f.write(opts_and_words('calibre-smtp', smtp_op, []))
        f.write('''