Make iterating over links in XML and CSS documents more robust

2025-08-11 09:13:57 -04:00 · 2009-04-08 13:35:51 -07:00 · 2009-04-08 13:35:51 -07:00 · b2bfab32cf
commit b2bfab32cf
parent 1d7e56c9d8
5 changed files with 190 additions and 27 deletions
--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@ -7,14 +7,16 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
 __docformat__ = 'restructuredtext en'

-import os, re, uuid
+import os, re, uuid, logging
 from mimetypes import types_map
 from collections import defaultdict
 from itertools import count
 from urlparse import urldefrag, urlparse, urlunparse
 from urllib import unquote as urlunquote
-import logging
+from urlparse import urljoin
+
 from lxml import etree, html
+
 import calibre
 from cssutils import CSSParser
 from calibre.translations.dynamic import translate
@ -77,16 +79,117 @@ def XLINK(name):
 def CALIBRE(name):
    return '{%s}%s' % (CALIBRE_NS, name)

-def LINK_SELECTORS():
-    results = []
-    for expr in ('h:head/h:link/@href', 'h:body//h:a/@href',
-                 'h:body//h:img/@src', 'h:body//h:object/@data',
-                 'h:body//*/@xl:href', '//ncx:content/@src',
-                 'o2:page/@href'):
-        results.append(etree.XPath(expr, namespaces=XPNSMAP))
-    return results
+# Matches CSS ``url(...)`` references, case-insensitively. Group 1 is the raw
+# text between the parentheses, including any quotes or whitespace.
+_css_url_re = re.compile(r'url\((.*?)\)', re.I)
+# Matches ``@import "..."`` rules. Group 1 is the text between the double
+# quotes; single-quoted imports are not matched by this pattern.
+_css_import_re = re.compile(r'@import "(.*?)"')
+# Matches each run of non-space characters. Used to walk the entries of the
+# ``archive`` attribute of <object> elements while keeping their offsets.
+_archive_re = re.compile(r'[^ ]+')
+
+def iterlinks(root):
+    '''
+    Iterate over all links in an OEB Document.
+
+    Yields ``(element, attribute, link, pos)`` tuples, where ``element``
+    is the element the link was found on, ``attribute`` is the name of the
+    attribute holding the link (``None`` when the link was found in the
+    text of a ``<style>`` element), ``link`` is the link itself and ``pos``
+    is the offset of the link inside the attribute value or element text
+    (``0`` when the link is the whole attribute value).
+
+    :param root: A valid lxml.etree element.
+    '''
+    assert etree.iselement(root)
+    # Attribute names treated as links: lxml's HTML link attributes plus
+    # the namespaced xlink:href
+    link_attrs = set(html.defs.link_attrs)
+    link_attrs.add(XLINK('href'))
+
+    for el in root.iter():
+        attribs = el.attrib
+
+        if el.tag == XHTML('object'):
+            codebase = None
+            ## <object> tags have attributes that are relative to
+            ## codebase
+            if 'codebase' in attribs:
+                codebase = el.get('codebase')
+                yield (el, 'codebase', codebase, 0)
+            for attrib in 'classid', 'data':
+                if attrib in attribs:
+                    value = el.get(attrib)
+                    if codebase is not None:
+                        # Note that the yielded value is the joined URL,
+                        # not the literal attribute text
+                        value = urljoin(codebase, value)
+                    yield (el, attrib, value, 0)
+            if 'archive' in attribs:
+                # archive holds several entries; yield each one together
+                # with its offset inside the attribute value
+                for match in _archive_re.finditer(el.get('archive')):
+                    value = match.group(0)
+                    if codebase is not None:
+                        value = urljoin(codebase, value)
+                    yield (el, 'archive', value, match.start())
+        else:
+            # Every other element: any attribute that is a known link
+            # attribute is yielded whole
+            for attr in attribs:
+                if attr in link_attrs:
+                    yield (el, attr, attribs[attr], 0)
+
+
+        # Links embedded in CSS: the text of <style> elements ...
+        if el.tag == XHTML('style') and el.text:
+            for match in _css_url_re.finditer(el.text):
+                yield (el, None, match.group(1), match.start(1))
+            for match in _css_import_re.finditer(el.text):
+                yield (el, None, match.group(1), match.start(1))
+        # ... and url() references inside inline style attributes
+        if 'style' in attribs:
+            for match in _css_url_re.finditer(attribs['style']):
+                yield (el, 'style', match.group(1), match.start(1))
+
+def make_links_absolute(root, base_url, resolve_base_href=True):
+    '''
+    Make all links in the document absolute, given the
+    ``base_url`` for the document (the full URL where the document
+    came from)
+
+    :param root: The element whose links are rewritten in place.
+    :param base_url: The URL every link is joined with via ``urljoin``.
+    :param resolve_base_href: Passed through unchanged to
+        ``rewrite_links()``. Defaults to True, which is what
+        ``rewrite_links()`` uses when the argument is omitted, so existing
+        two-argument calls behave as before.
+    '''
+    def link_repl(href):
+        return urljoin(base_url, href)
+    rewrite_links(root, link_repl, resolve_base_href=resolve_base_href)
+
+def resolve_base_href(root):
+    '''
+    Remove every ``<base href="...">`` element (with or without the XHTML
+    namespace) found in the document and, if a non-empty href was found,
+    make all links in the document absolute relative to it. When several
+    base elements are present, the href of the last one is used.
+
+    :param root: A valid lxml.etree element.
+    '''
+    base_href = None
+    basetags = root.xpath('//base[@href]|//h:base[@href]',
+            namespaces=XPNSMAP)
+    for b in basetags:
+        base_href = b.get('href')
+        if hasattr(b, 'drop_tree'):
+            # lxml.html elements can remove themselves
+            b.drop_tree()
+        else:
+            # Plain lxml.etree elements have no drop_tree(): remove the
+            # element by hand, keeping its tail text in the document
+            parent = b.getparent()
+            if parent is not None:
+                if b.tail:
+                    previous = b.getprevious()
+                    if previous is None:
+                        parent.text = (parent.text or '') + b.tail
+                    else:
+                        previous.tail = (previous.tail or '') + b.tail
+                parent.remove(b)
+    if not base_href:
+        return
+    # The base elements have just been removed, so rewrite the links
+    # directly and tell rewrite_links() not to look for them again
+    rewrite_links(root, lambda href: urljoin(base_href, href),
+            resolve_base_href=False)
+
+# ``rewrite_links`` has a boolean parameter that is also called
+# ``resolve_base_href``; keep a second reference to the function so that it
+# can still be called from inside ``rewrite_links``.
+_resolve_base_href = resolve_base_href
+
+def rewrite_links(root, link_repl_func, resolve_base_href=True):
+    '''
+    Rewrite all the links in the document.  For each link
+    ``link_repl_func(link)`` will be called, and the return value
+    will replace the old link.
+
+    Note that links may not be absolute (unless you first called
+    ``make_links_absolute()``), and may be internal (e.g.,
+    ``'#anchor'``).  They can also be values like
+    ``'mailto:email'`` or ``'javascript:expr'``.
+
+    If the ``link_repl_func`` returns None, the attribute or
+    tag text will be removed completely.
+
+    :param root: The element whose links are rewritten in place.
+    :param link_repl_func: Called with each link, stripped of surrounding
+        whitespace. Returns the replacement link, or None to remove it.
+    :param resolve_base_href: If true (the default), ``<base href>``
+        elements are resolved and removed before the links are rewritten.
+    '''
+    if resolve_base_href:
+        # Here the bare name is the boolean parameter, not the function,
+        # so the function has to be reached through the alias
+        _resolve_base_href(root)
+    # NOTE(review): ``pos`` is relative to the text as iterlinks() scanned
+    # it. If several links share one attribute or <style> text and an
+    # earlier replacement changes its length, later offsets look stale --
+    # TODO confirm and handle.
+    for el, attrib, link, pos in iterlinks(root):
+        new_link = link_repl_func(link.strip())
+        if new_link == link:
+            continue
+        if new_link is None:
+            # Remove the attribute or element content
+            if attrib is None:
+                el.text = ''
+            else:
+                del el.attrib[attrib]
+            continue
+        if attrib is None:
+            # Link inside element text: splice the new link in at pos
+            new = el.text[:pos] + new_link + el.text[pos+len(link):]
+            el.text = new
+        else:
+            cur = el.attrib[attrib]
+            if not pos and len(cur) == len(link):
+                # Most common case
+                el.attrib[attrib] = new_link
+            else:
+                # The link is only part of the attribute value
+                new = cur[:pos] + new_link + cur[pos+len(link):]
+                el.attrib[attrib] = new

-LINK_SELECTORS = LINK_SELECTORS()

 EPUB_MIME      = types_map['.epub']
 XHTML_MIME     = types_map['.xhtml']
@ -199,7 +302,7 @@ def urlnormalize(href):
    characters URL quoted.
    """
    parts = urlparse(href)
-    if not parts.scheme:
+    if not parts.scheme or parts.scheme == 'file':
        path, frag = urldefrag(href)
        parts = ('', '', path, '', '', frag)
    parts = (part.replace('\\', '/') for part in parts)
@ -778,8 +881,13 @@ class Manifest(object):
            """Convert the URL provided in :param:`href` from a reference
            relative to this manifest item to a book-absolute reference.
            """
-            if urlparse(href).scheme:
+            purl = urlparse(href)
+            scheme = purl.scheme
+            if scheme and scheme != 'file':
                return href
+            purl = list(purl)
+            purl[0] = ''
+            href = urlunparse(purl)
            path, frag = urldefrag(href)
            if not path:
                return '#'.join((self.href, frag))
--- a/src/calibre/ebooks/oeb/output.py
+++ b/src/calibre/ebooks/oeb/output.py
@ -22,6 +22,7 @@ class OEBOutput(OutputFormatPlugin):
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME
+        from calibre.ebooks.html import tostring as html_tostring
        with CurrentDir(output_path):
            results = oeb_book.to_opf2(page_map=True)
            for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
@ -42,9 +43,8 @@ class OEBOutput(OutputFormatPlugin):
                    if hasattr(raw, 'cssText'):
                        raw = raw.cssText
                    else:
-                        raw = etree.tostring(raw, encoding='utf-8',
+                        raw = html_tostring(raw,
                                pretty_print=opts.pretty_print)
-                        raw = '<?xml version="1.0" encoding="utf-8" ?>\n'+raw
                if isinstance(raw, unicode):
                    raw = raw.encode('utf-8')
                with open(path, 'wb') as f:
--- a/src/calibre/ebooks/oeb/reader.py
+++ b/src/calibre/ebooks/oeb/reader.py
@ -7,18 +7,21 @@ __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

 import sys, os, uuid, copy
-from itertools import izip, chain
+from itertools import izip
 from urlparse import urldefrag, urlparse
 from urllib import unquote as urlunquote
 from mimetypes import guess_type
 from collections import defaultdict
+
 from lxml import etree
+import cssutils
+
 from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \
    DC_NSES, OPF
 from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \
    PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME
-from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, CSSURL_RE, \
-    ENTITY_RE, LINK_SELECTORS, MS_COVER_TYPE
+from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \
+    ENTITY_RE, MS_COVER_TYPE, iterlinks
 from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath, \
                                    urlnormalize, BINARY_MIME, \
                                    OEBError, OEBBook, DirContainer
@ -191,8 +194,8 @@ class OEBReader(object):
                if (item.media_type in OEB_DOCS or
                    item.media_type[-4:] in ('/xml', '+xml')) and \
                   item.data is not None:
-                    hrefs = [sel(item.data) for sel in LINK_SELECTORS]
-                    for href in chain(*hrefs):
+                    hrefs = [r[2] for r in iterlinks(item.data)]
+                    for href in hrefs:
                        href, _ = urldefrag(href)
                        if not href:
                            continue
@ -201,8 +204,8 @@ class OEBReader(object):
                        if not scheme and href not in known:
                            new.add(href)
                elif item.media_type in OEB_STYLES:
-                    for match in CSSURL_RE.finditer(item.data.cssText):
-                        href, _ = urldefrag(match.group('url'))
+                    for url in cssutils.getUrls(item.data):
+                        href, _ = urldefrag(url)
                        href = item.abshref(urlnormalize(href))
                        scheme = urlparse(href).scheme
                        if not scheme and href not in known:
--- a/src/calibre/ebooks/oeb/transforms/package.py
+++ b/src/calibre/ebooks/oeb/transforms/package.py
@ -0,0 +1,52 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import with_statement
+
+__license__   = 'GPL v3'
+__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+import os, shutil
+
+from calibre.ebooks.oeb.base import OEB_DOCS
+
+class Package(object):
+
+    '''
+    Move all the parts of an OEB into a folder structure rooted
+    at the specified folder. All links in recognized content types
+    are processed, the linked-to resources are copied into the local
+    folder tree and all references to those resources are updated.
+
+    The created folder structure is
+
+    Base directory(OPF, NCX) -- content (XHTML) -- resources (CSS, Images, etc)
+
+    '''
+
+    def __init__(self, base='.'):
+        ':param base: The base folder at which the OEB will be rooted'
+        self.new_base_path = os.path.abspath(base)
+
+    def rewrite_links_in(self, item):
+        '''
+        Placeholder: currently performs no rewriting and always returns
+        an empty list.
+
+        :param item: A manifest item.
+        '''
+        new_items = []
+        return new_items
+
+    def move_manifest_item(self, item):
+        '''
+        Compute the current absolute path of ``item`` and the href it
+        should get in the new folder structure. Nothing is moved or
+        recorded yet.
+
+        :param item: A manifest item; its ``data``, ``href`` and
+            ``media_type`` attributes are read.
+        '''
+        item.data # Make sure the data has been loaded and cached
+        # TODO: old_abspath and new_href are computed but not used yet
+        old_abspath = os.path.join(self.old_base_path, *item.href.split('/'))
+        bname = item.href.split('/')[-1]
+        # Documents (XHTML) live directly in content/; everything else
+        # goes into content/resources/, as described in the class docstring
+        new_href = 'content/' + \
+                ('resources/' if item.media_type not in OEB_DOCS else '')+bname
+
+    def __call__(self, oeb, context):
+        '''
+        Run the transform over every item in the manifest of ``oeb``.
+
+        :param oeb: The book to process; ``oeb.container.rootdir`` and
+            ``oeb.manifest`` are read.
+        :param context: Not used by this method.
+        '''
+        self.map = {}
+        # Keep a reference to the book: the loops below read self.oeb
+        self.oeb = oeb
+        self.old_base_path = os.path.abspath(oeb.container.rootdir)
+
+        for item in self.oeb.manifest:
+            self.move_manifest_item(item)
+
+        for item in self.oeb.manifest:
+            self.rewrite_links_in(item)
+
+
--- a/src/calibre/ebooks/oeb/writer.py
+++ b/src/calibre/ebooks/oeb/writer.py
@ -6,9 +6,9 @@ from __future__ import with_statement
 __license__   = 'GPL v3'
 __copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

-import sys, os, logging
+import os
 from calibre.ebooks.oeb.base import OPF_MIME, xml2str
-from calibre.ebooks.oeb.base import DirContainer, OEBBook
+from calibre.ebooks.oeb.base import DirContainer, OEBError

 __all__ = ['OEBWriter']