Make iterating over links in XML and CSS documents more robust

2025-11-15 02:53:02 -05:00 · 2009-04-08 13:35:51 -07:00 · 2009-04-08 13:35:51 -07:00 · b2bfab32cf
commit b2bfab32cf
parent 1d7e56c9d8
5 changed files with 190 additions and 27 deletions
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -7,14 +7,16 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
 __docformat__ = 'restructuredtext en'
-import os, re, uuid
+import os, re, uuid, logging
 from mimetypes import types_map
 from collections import defaultdict
 from itertools import count
 from urlparse import urldefrag, urlparse, urlunparse
 from urllib import unquote as urlunquote
-import logging
+from urlparse import urljoin
 from lxml import etree, html
 import calibre
 from cssutils import CSSParser
 from calibre.translations.dynamic import translate
@ -77,16 +79,117 @@ def XLINK(name):
 def CALIBRE(name):
    return '{%s}%s' % (CALIBRE_NS, name)
-def LINK_SELECTORS():
+_css_url_re = re.compile(r'url\((.*?)\)', re.I)
-    results = []
+_css_import_re = re.compile(r'@import "(.*?)"')
-    for expr in ('h:head/h:link/@href', 'h:body//h:a/@href',
+_archive_re = re.compile(r'[^ ]+')
-                 'h:body//h:img/@src', 'h:body//h:object/@data',
+
-                 'h:body//*/@xl:href', '//ncx:content/@src',
+def iterlinks(root):
-                 'o2:page/@href'):
+    '''
-        results.append(etree.XPath(expr, namespaces=XPNSMAP))
+    Iterate over all links in a OEB Document.
-    return results
+
    :param root: A valid lxml.etree element.
    '''
    assert etree.iselement(root)
    # Attribute names treated as links: lxml.html's link attributes plus
    # the namespaced xlink href attribute.
    link_attrs = set(html.defs.link_attrs)
    link_attrs.add(XLINK('href'))
    # Every yielded tuple is (element, attribute name or None, link, pos).
    # The attribute name is None when the link sits in the element's text;
    # pos is the offset of the link within that attribute value or text.
    for el in root.iter():
        attribs = el.attrib
        if el.tag == XHTML('object'):
            codebase = None
            ## <object> tags have attributes that are relative to
            ## codebase
            if 'codebase' in attribs:
                codebase = el.get('codebase')
                yield (el, 'codebase', codebase, 0)
            # NOTE: when a codebase is present the yielded link is the
            # joined URL, so it can differ from the raw attribute value.
            for attrib in 'classid', 'data':
                if attrib in attribs:
                    value = el.get(attrib)
                    if codebase is not None:
                        value = urljoin(codebase, value)
                    yield (el, attrib, value, 0)
            if 'archive' in attribs:
                # 'archive' can hold several links; _archive_re matches
                # each run of non-space characters and match.start() gives
                # its offset inside the attribute value.
                for match in _archive_re.finditer(el.get('archive')):
                    value = match.group(0)
                    if codebase is not None:
                        value = urljoin(codebase, value)
                    yield (el, 'archive', value, match.start())
        else:
            for attr in attribs:
                if attr in link_attrs:
                    yield (el, attr, attribs[attr], 0)
        # url(...) and @import "..." references inside the text of a
        # <style> element; all url() matches are yielded before any
        # @import match, so the offsets are not globally ascending.
        if el.tag == XHTML('style') and el.text:
            for match in _css_url_re.finditer(el.text):
                yield (el, None, match.group(1), match.start(1))
            for match in _css_import_re.finditer(el.text):
                yield (el, None, match.group(1), match.start(1))
        # url(...) references inside an inline style attribute.
        if 'style' in attribs:
            for match in _css_url_re.finditer(attribs['style']):
                yield (el, 'style', match.group(1), match.start(1))
 def make_links_absolute(root, base_url, resolve_base_href=True):
    '''
    Make all links in the document absolute, given the
    ``base_url`` for the document (the full URL where the document
    came from).

    :param root: Root element of the document; handed unchanged to
        ``rewrite_links()``.
    :param base_url: URL that every link is joined against with
        ``urljoin``.
    :param resolve_base_href: Forwarded to ``rewrite_links()``.
        ``resolve_base_href()`` passes ``False`` here; the default keeps
        the behaviour of calls that omit the argument.
    '''
    def link_repl(href):
        return urljoin(base_url, href)
    rewrite_links(root, link_repl, resolve_base_href=resolve_base_href)
 def resolve_base_href(root):
    '''
    Apply the document's ``<base href>`` to its links and drop the tag.

    Every ``base`` element carrying an ``href`` (with or without the
    XHTML namespace) is removed from the tree.  If several are present
    the ``href`` of the last one wins.  When a non-empty ``href`` was
    found, all links in the document are joined against it.

    :param root: Element the XPath search and the link rewriting start
        from.
    '''
    base_href = None
    basetags = root.xpath('//base[@href]|//h:base[@href]',
            namespaces=XPNSMAP)
    for b in basetags:
        base_href = b.get('href')
        drop_tree = getattr(b, 'drop_tree', None)
        if drop_tree is not None:
            drop_tree()
            continue
        # Plain lxml.etree elements have no drop_tree() (it is an
        # lxml.html addition), so remove the element by hand while
        # keeping its tail text in the document.
        parent = b.getparent()
        if parent is None:
            continue
        if b.tail:
            previous = b.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + b.tail
            else:
                parent.text = (parent.text or '') + b.tail
        parent.remove(b)
    if not base_href:
        return
    def link_repl(href):
        return urljoin(base_href, href)
    # Call rewrite_links() directly with resolve_base_href=False so that
    # it does not call back into this function.
    rewrite_links(root, link_repl, resolve_base_href=False)
 def rewrite_links(root, link_repl_func, resolve_base_href=True):
    '''
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.

    :param root: Element whose links, as reported by ``iterlinks()``,
        are rewritten in place.
    :param link_repl_func: Called with each link, stripped of
        surrounding whitespace.
    :param resolve_base_href: When true, the module-level
        ``resolve_base_href()`` function is run on ``root`` first.
    '''
    if resolve_base_href:
        # The parameter hides the module-level function of the same
        # name, so the function has to be looked up explicitly.
        globals()['resolve_base_href'](root)
    # Collect the links up front: the offsets reported by iterlinks()
    # refer to the text as it was before any rewriting.
    links = list(iterlinks(root))
    edits = {}       # (el, attrib) -> [(original offset, length change)]
    removed = set()  # (el, attrib) pairs whose content was removed
    for el, attrib, link, pos in links:
        key = (el, attrib)
        if key in removed:
            # Nothing is left to rewrite in this attribute or text.
            continue
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        if new_link is None:
            # Remove the attribute or element content
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            removed.add(key)
            continue
        # Earlier replacements in the same text move everything behind
        # them; shift the reported offset by their accumulated change.
        applied = edits.setdefault(key, [])
        start = pos + sum(delta for at, delta in applied if at < pos)
        end = start + len(link)
        if attrib is None:
            cur = el.text
            el.text = cur[:start] + new_link + cur[end:]
        else:
            cur = el.attrib[attrib]
            el.attrib[attrib] = cur[:start] + new_link + cur[end:]
        applied.append((pos, len(new_link) - len(link)))
 LINK_SELECTORS = LINK_SELECTORS()
 EPUB_MIME      = types_map['.epub']
 XHTML_MIME     = types_map['.xhtml']
@ -199,7 +302,7 @@ def urlnormalize(href):
    characters URL quoted.
    """
    parts = urlparse(href)
-    if not parts.scheme:
+    if not parts.scheme or parts.scheme == 'file':
        path, frag = urldefrag(href)
        parts = ('', '', path, '', '', frag)
    parts = (part.replace('\\', '/') for part in parts)
@ -724,7 +827,7 @@ class Manifest(object):
            if isinstance(data, unicode):
                return data.encode('utf-8')
            return str(data)
-            
+
        def __unicode__(self):
            data = self.data
            if isinstance(data, etree._Element):
@ -778,8 +881,13 @@ class Manifest(object):
            """Convert the URL provided in :param:`href` from a reference
            relative to this manifest item to a book-absolute reference.
            """
-            if urlparse(href).scheme:
+            purl = urlparse(href)
            scheme = purl.scheme
            if scheme and scheme != 'file':
                return href
            purl = list(purl)
            purl[0] = ''
            href = urlunparse(purl)
            path, frag = urldefrag(href)
            if not path:
                return '#'.join((self.href, frag))
--- a/src/calibre/ebooks/oeb/output.py
+++ b/src/calibre/ebooks/oeb/output.py
@ -22,6 +22,7 @@ class OEBOutput(OutputFormatPlugin):
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME
        from calibre.ebooks.html import tostring as html_tostring
        with CurrentDir(output_path):
            results = oeb_book.to_opf2(page_map=True)
            for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
@ -42,9 +43,8 @@ class OEBOutput(OutputFormatPlugin):
                    if hasattr(raw, 'cssText'):
                        raw = raw.cssText
                    else:
-                        raw = etree.tostring(raw, encoding='utf-8',
+                        raw = html_tostring(raw,
                                pretty_print=opts.pretty_print)
                        raw = '<?xml version="1.0" encoding="utf-8" ?>\n'+raw
                if isinstance(raw, unicode):
                    raw = raw.encode('utf-8')
                with open(path, 'wb') as f:
--- a/src/calibre/ebooks/oeb/reader.py
+++ b/src/calibre/ebooks/oeb/reader.py
@ -7,18 +7,21 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
 import sys, os, uuid, copy
-from itertools import izip, chain
+from itertools import izip
 from urlparse import urldefrag, urlparse
 from urllib import unquote as urlunquote
 from mimetypes import guess_type
 from collections import defaultdict
 from lxml import etree
 import cssutils
 from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \
    DC_NSES, OPF
 from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \
    PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME
-from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, CSSURL_RE, \
+from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \
-    ENTITY_RE, LINK_SELECTORS, MS_COVER_TYPE
+    ENTITY_RE, MS_COVER_TYPE, iterlinks
 from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath, \
                                    urlnormalize, BINARY_MIME, \
                                    OEBError, OEBBook, DirContainer
@ -191,8 +194,8 @@ class OEBReader(object):
                if (item.media_type in OEB_DOCS or
                    item.media_type[-4:] in ('/xml', '+xml')) and \
                   item.data is not None:
-                    hrefs = [sel(item.data) for sel in LINK_SELECTORS]
+                    hrefs = [r[2] for r in iterlinks(item.data)]
-                    for href in chain(*hrefs):
+                    for href in hrefs:
                        href, _ = urldefrag(href)
                        if not href:
                            continue
@ -201,8 +204,8 @@ class OEBReader(object):
                        if not scheme and href not in known:
                            new.add(href)
                elif item.media_type in OEB_STYLES:
-                    for match in CSSURL_RE.finditer(item.data.cssText):
+                    for url in cssutils.getUrls(item.data):
-                        href, _ = urldefrag(match.group('url'))
+                        href, _ = urldefrag(url)
                        href = item.abshref(urlnormalize(href))
                        scheme = urlparse(href).scheme
                        if not scheme and href not in known:
--- a/src/calibre/ebooks/oeb/transforms/package.py
+++ b/src/calibre/ebooks/oeb/transforms/package.py
@ -0,0 +1,52 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 import os, shutil
 from calibre.ebooks.oeb.base import OEB_DOCS
 class Package(object):
    '''
    Move all the parts of an OEB into a folder structure rooted
    at the specified folder. All links in recognized content types
    are processed, the linked to resources are copied into the local
    folder tree and all references to those resources are updated.

    The created folder structure is

    Base directory(OPF, NCX) -- content (XHTML) -- resources (CSS, Images, etc)

    NOTE(review): copying the files and rewriting the links are not
    implemented yet; only the new hrefs are computed.
    '''
    def __init__(self, base='.'):
        ':param base: The base folder at which the OEB will be rooted'
        self.new_base_path = os.path.abspath(base)
    def rewrite_links_in(self, item):
        '''
        Placeholder for rewriting the links inside ``item``.

        :return: Currently always an empty list.
        '''
        new_items = []
        return new_items
    def move_manifest_item(self, item):
        '''
        Compute the href ``item`` gets in the new folder structure.

        Items whose media type is in ``OEB_DOCS`` are placed in
        ``content/``; every other item goes to ``content/resources/``,
        matching the layout given in the class docstring.

        :return: The new href, built from the last path component of
            ``item.href``.
        '''
        item.data # Make sure the data has been loaded and cached
        bname = item.href.split('/')[-1]
        folder = 'content/'
        if item.media_type not in OEB_DOCS:
            folder += 'resources/'
        # TODO: copy the file from its location under
        # self.old_base_path into self.new_base_path.
        return folder + bname
    def __call__(self, oeb, context):
        '''
        Run the transform on ``oeb``.

        :param oeb: Book object; its ``container.rootdir`` and
            ``manifest`` are read.
        :param context: Unused here.
        '''
        # Keep the book on the instance; the loops below need it.
        self.oeb = oeb
        # NOTE(review): the map presumably is meant to hold
        # old href -> new href; nothing reads it yet.
        self.map = {}
        self.old_base_path = os.path.abspath(oeb.container.rootdir)
        for item in oeb.manifest:
            self.map[item.href] = self.move_manifest_item(item)
        for item in oeb.manifest:
            self.rewrite_links_in(item)
--- a/src/calibre/ebooks/oeb/writer.py
+++ b/src/calibre/ebooks/oeb/writer.py
@ -6,9 +6,9 @@ from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
-import sys, os, logging
+import os
 from calibre.ebooks.oeb.base import OPF_MIME, xml2str
-from calibre.ebooks.oeb.base import DirContainer, OEBBook
+from calibre.ebooks.oeb.base import DirContainer, OEBError
 __all__ = ['OEBWriter']
@ -18,7 +18,7 @@ class OEBWriter(object):
    TRANSFORMS = []
    """List of transforms to apply to content written with this Writer."""
-    
+
    def __init__(self, version='2.0', page_map=False, pretty_print=False):
        self.version = version
        self.page_map = page_map
@ -46,7 +46,7 @@ class OEBWriter(object):
        pretty_print = opts.pretty_print
        return cls(version=version, page_map=page_map,
                   pretty_print=pretty_print)
-    
+
    def __call__(self, oeb, path):
        """Read the book in the :class:`OEBBook` object :param:`oeb` to a file
        at :param:`path`.