Make iterating over links in XML and CSS documents more robust

This commit is contained in:
Kovid Goyal 2009-04-08 13:35:51 -07:00
parent 1d7e56c9d8
commit b2bfab32cf
5 changed files with 190 additions and 27 deletions

View File

@ -7,14 +7,16 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
__docformat__ = 'restructuredtext en'
import os, re, uuid
import os, re, uuid, logging
from mimetypes import types_map
from collections import defaultdict
from itertools import count
from urlparse import urldefrag, urlparse, urlunparse
from urllib import unquote as urlunquote
import logging
from urlparse import urljoin
from lxml import etree, html
import calibre
from cssutils import CSSParser
from calibre.translations.dynamic import translate
@ -77,16 +79,117 @@ def XLINK(name):
def CALIBRE(name):
    '''
    Qualify ``name`` with the calibre namespace.

    The result has the ``{namespace}name`` form, with the module level
    ``CALIBRE_NS`` used as the namespace.
    '''
    qualified = '{%s}%s' % (CALIBRE_NS, name)
    return qualified
def LINK_SELECTORS():
    '''
    Build the compiled XPath selectors that locate link attributes.

    Each expression is compiled with ``etree.XPath`` using the module
    level ``XPNSMAP`` namespace map.

    :return: A list of compiled selectors, in the order listed below.
    '''
    expressions = (
        'h:head/h:link/@href',
        'h:body//h:a/@href',
        'h:body//h:img/@src',
        'h:body//h:object/@data',
        'h:body//*/@xl:href',
        '//ncx:content/@src',
        'o2:page/@href',
    )
    return [etree.XPath(expr, namespaces=XPNSMAP) for expr in expressions]
# Captures whatever sits between the parentheses of a CSS ``url(...)``
# token (case-insensitive). Group 1 includes any quotes or whitespace
# that surround the URL.
_css_url_re = re.compile(r'url\((.*?)\)', re.I)
# Captures the target of a CSS ``@import "..."`` rule. Only the
# double-quoted form with exactly one space after ``@import`` matches.
_css_import_re = re.compile(r'@import "(.*?)"')
# Matches each run of non-space characters; iterlinks() uses it to walk
# the entries of the space separated ``archive`` attribute of <object>.
_archive_re = re.compile(r'[^ ]+')
def _unquote_css_link(match):
    '''
    Return ``(link, start)`` for a match of one of the CSS link patterns.

    Group 1 of the ``url(...)`` pattern contains everything between the
    parentheses, so ``url( "a.png" )`` would otherwise be reported as
    ``' "a.png" '``. Surrounding whitespace and one pair of matching
    quotes are removed and ``start`` is moved forward accordingly, so
    that ``text[start:start+len(link)] == link`` still holds.
    '''
    raw = match.group(1)
    start = match.start(1)
    unpadded = raw.lstrip()
    start += len(raw) - len(unpadded)
    link = unpadded.rstrip()
    if len(link) > 1 and link[0] in ('"', "'") and link[-1] == link[0]:
        link = link[1:-1]
        start += 1
    return link, start


def iterlinks(root):
    '''
    Iterate over all links in an OEB Document.

    Yields ``(element, attribute, link, pos)`` tuples. ``attribute`` is
    the name of the attribute holding the link, or ``None`` when the link
    was found in the text of a ``<style>`` element. ``pos`` is the offset
    of the link inside the attribute value or element text.

    For ``<object>`` elements the ``classid``, ``data`` and ``archive``
    values are joined onto ``codebase`` when that attribute is present,
    so the yielded link can differ from the literal attribute text.

    :param root: A valid lxml.etree element.
    '''
    assert etree.iselement(root)
    link_attrs = set(html.defs.link_attrs)
    link_attrs.add(XLINK('href'))
    # The qualified tag names do not change while iterating.
    object_tag = XHTML('object')
    style_tag = XHTML('style')

    for el in root.iter():
        attribs = el.attrib
        tag = el.tag

        if tag == object_tag:
            codebase = None
            ## <object> tags have attributes that are relative to
            ## codebase
            if 'codebase' in attribs:
                codebase = el.get('codebase')
                yield (el, 'codebase', codebase, 0)
            for attrib in ('classid', 'data'):
                if attrib in attribs:
                    value = el.get(attrib)
                    if codebase is not None:
                        value = urljoin(codebase, value)
                    yield (el, attrib, value, 0)
            if 'archive' in attribs:
                # archive is a space separated list of URLs
                for match in _archive_re.finditer(el.get('archive')):
                    value = match.group(0)
                    if codebase is not None:
                        value = urljoin(codebase, value)
                    yield (el, 'archive', value, match.start())
        else:
            for attr in attribs:
                if attr in link_attrs:
                    yield (el, attr, attribs[attr], 0)

        if tag == style_tag and el.text:
            # url(...) references first, then @import "..." rules
            for pattern in (_css_url_re, _css_import_re):
                for match in pattern.finditer(el.text):
                    link, start = _unquote_css_link(match)
                    yield (el, None, link, start)

        if 'style' in attribs:
            for match in _css_url_re.finditer(attribs['style']):
                link, start = _unquote_css_link(match)
                yield (el, 'style', link, start)
def make_links_absolute(root, base_url, resolve_base_href=True):
    '''
    Make all links in the document absolute, given the
    ``base_url`` for the document (the full URL where the document
    came from)

    :param root: The element whose links are rewritten in place.
    :param base_url: URL every link is joined onto with ``urljoin``.
    :param resolve_base_href: Passed through to ``rewrite_links()``.
        ``resolve_base_href()`` calls this function with
        ``resolve_base_href=False``, so the keyword has to be accepted
        here; the default matches what ``rewrite_links()`` used before.
    '''
    def link_repl(href):
        return urljoin(base_url, href)
    rewrite_links(root, link_repl, resolve_base_href=resolve_base_href)
def resolve_base_href(root):
    '''
    Apply the document's ``<base href="...">`` to its links.

    Every ``base`` element carrying an ``href`` (with or without the
    ``h:`` namespace prefix from ``XPNSMAP``) is removed from the tree.
    If several are present the ``href`` of the last one is used. When a
    non-empty ``href`` was found, all links in ``root`` are joined onto
    it; otherwise the links are left untouched.

    :param root: The element to process; it is modified in place.
    '''
    base_href = None
    basetags = root.xpath('//base[@href]|//h:base[@href]',
                          namespaces=XPNSMAP)
    for b in basetags:
        base_href = b.get('href')
        # NOTE(review): drop_tree() comes from lxml.html elements --
        # confirm that root is never a plain lxml.etree element here.
        b.drop_tree()
    if not base_href:
        return

    def link_repl(href):
        return urljoin(base_href, href)

    # The base tags are already gone, so tell rewrite_links() not to
    # resolve them again. rewrite_links() is called directly because the
    # two-argument make_links_absolute(root, base_url) form does not take
    # a resolve_base_href keyword.
    rewrite_links(root, link_repl, resolve_base_href=False)
def _resolve_document_base(root):
    '''
    Call the module level ``resolve_base_href()`` function on ``root``.

    ``rewrite_links()`` has a parameter that is also called
    ``resolve_base_href``. Inside its body that name refers to the
    boolean argument, not to the function, so the call goes through
    this helper.
    '''
    resolve_base_href(root)


def rewrite_links(root, link_repl_func, resolve_base_href=True):
    '''
    Rewrite all the links in the document. For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``). They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.

    :param root: The element whose links are rewritten in place.
    :param link_repl_func: Callable that receives the whitespace-stripped
        link and returns the replacement, or None to remove it.
    :param resolve_base_href: If true, ``resolve_base_href(root)`` is
        run before any link is rewritten.
    '''
    if resolve_base_href:
        _resolve_document_base(root)

    # Collect every link before changing anything, so that all reported
    # positions refer to the same, unmodified text.
    links = list(iterlinks(root))

    # (element, attribute) -> [(original position, length change), ...]
    # One text can hold several links (CSS, archive lists). Replacing one
    # of them moves the links behind it, so later positions have to be
    # shifted by the length changes applied in front of them.
    applied = {}
    # Targets whose attribute/text was removed; links that were found in
    # them have nothing left to be rewritten in.
    removed = set()

    for el, attrib, link, pos in links:
        target = (el, attrib)
        if target in removed:
            continue
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        if new_link is None:
            # Remove the attribute or element content
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            removed.add(target)
            continue

        earlier = applied.setdefault(target, [])
        at = pos + sum(delta for start, delta in earlier if start < pos)
        if attrib is None:
            cur = el.text
        else:
            cur = el.attrib[attrib]
        if not at and len(cur) == len(link):
            # Most common case: the link is the whole value
            new = new_link
        else:
            new = cur[:at] + new_link + cur[at+len(link):]
        if attrib is None:
            el.text = new
        else:
            el.attrib[attrib] = new
        earlier.append((pos, len(new_link) - len(link)))
# Replace the factory function with its result: from here on the name
# LINK_SELECTORS is bound to the list of compiled XPath selectors.
LINK_SELECTORS = LINK_SELECTORS()
# MIME types looked up by file extension in mimetypes.types_map.
EPUB_MIME = types_map['.epub']
XHTML_MIME = types_map['.xhtml']
@ -199,7 +302,7 @@ def urlnormalize(href):
characters URL quoted.
"""
parts = urlparse(href)
if not parts.scheme:
if not parts.scheme or parts.scheme == 'file':
path, frag = urldefrag(href)
parts = ('', '', path, '', '', frag)
parts = (part.replace('\\', '/') for part in parts)
@ -778,8 +881,13 @@ class Manifest(object):
"""Convert the URL provided in :param:`href` from a reference
relative to this manifest item to a book-absolute reference.
"""
if urlparse(href).scheme:
purl = urlparse(href)
scheme = purl.scheme
if scheme and scheme != 'file':
return href
purl = list(purl)
purl[0] = ''
href = urlunparse(purl)
path, frag = urldefrag(href)
if not path:
return '#'.join((self.href, frag))

View File

@ -22,6 +22,7 @@ class OEBOutput(OutputFormatPlugin):
if not os.path.exists(output_path):
os.makedirs(output_path)
from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME
from calibre.ebooks.html import tostring as html_tostring
with CurrentDir(output_path):
results = oeb_book.to_opf2(page_map=True)
for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
@ -42,9 +43,8 @@ class OEBOutput(OutputFormatPlugin):
if hasattr(raw, 'cssText'):
raw = raw.cssText
else:
raw = etree.tostring(raw, encoding='utf-8',
raw = html_tostring(raw,
pretty_print=opts.pretty_print)
raw = '<?xml version="1.0" encoding="utf-8" ?>\n'+raw
if isinstance(raw, unicode):
raw = raw.encode('utf-8')
with open(path, 'wb') as f:

View File

@ -7,18 +7,21 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys, os, uuid, copy
from itertools import izip, chain
from itertools import izip
from urlparse import urldefrag, urlparse
from urllib import unquote as urlunquote
from mimetypes import guess_type
from collections import defaultdict
from lxml import etree
import cssutils
from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \
DC_NSES, OPF
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \
PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME
from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, CSSURL_RE, \
ENTITY_RE, LINK_SELECTORS, MS_COVER_TYPE
from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \
ENTITY_RE, MS_COVER_TYPE, iterlinks
from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath, \
urlnormalize, BINARY_MIME, \
OEBError, OEBBook, DirContainer
@ -191,8 +194,8 @@ class OEBReader(object):
if (item.media_type in OEB_DOCS or
item.media_type[-4:] in ('/xml', '+xml')) and \
item.data is not None:
hrefs = [sel(item.data) for sel in LINK_SELECTORS]
for href in chain(*hrefs):
hrefs = [r[2] for r in iterlinks(item.data)]
for href in hrefs:
href, _ = urldefrag(href)
if not href:
continue
@ -201,8 +204,8 @@ class OEBReader(object):
if not scheme and href not in known:
new.add(href)
elif item.media_type in OEB_STYLES:
for match in CSSURL_RE.finditer(item.data.cssText):
href, _ = urldefrag(match.group('url'))
for url in cssutils.getUrls(item.data):
href, _ = urldefrag(url)
href = item.abshref(urlnormalize(href))
scheme = urlparse(href).scheme
if not scheme and href not in known:

View File

@ -0,0 +1,52 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, shutil
from calibre.ebooks.oeb.base import OEB_DOCS
class Package(object):

    '''
    Move all the parts of an OEB into a folder structure rooted
    at the specified folder. All links in recognized content types
    are processed, the linked to resources are copied into the local
    folder tree and all references to those resources are updated.

    The created folder structure is

    Base directory(OPF, NCX) -- content (XHTML) -- resources (CSS, Images, etc)

    As written, the moving and link rewriting steps are stubs: new
    locations are computed but nothing is copied or rewritten yet.
    '''

    def __init__(self, base='.'):
        ':param base: The base folder at which the OEB will be rooted'
        self.new_base_path = os.path.abspath(base)

    def rewrite_links_in(self, item):
        '''
        Rewrite the links inside ``item``.

        :return: A list of new items; currently always empty.
        '''
        new_items = []
        return new_items

    def move_manifest_item(self, item):
        '''
        Work out where ``item`` lives now and where it should go.

        Documents (media type in ``OEB_DOCS``) belong directly in
        ``content/``; everything else goes to ``content/resources/``,
        as described in the class docstring.
        '''
        item.data # Make sure the data has been loaded and cached
        # Both paths are computed but not consumed yet.
        old_abspath = os.path.join(self.old_base_path, *item.href.split('/'))
        bname = item.href.split('/')[-1]
        new_href = 'content/' + \
            ('' if item.media_type in OEB_DOCS else 'resources/') + bname

    def __call__(self, oeb, context):
        '''
        Process every item in the manifest of ``oeb``.

        :param oeb: The book to package; ``oeb.container.rootdir`` and
            ``oeb.manifest`` are read.
        :param context: Not used by this method.
        '''
        self.map = {}
        # Keep a reference to the book: the loops below (and any helper
        # that needs the book) read it from self.oeb.
        self.oeb = oeb
        self.old_base_path = os.path.abspath(oeb.container.rootdir)

        for item in self.oeb.manifest:
            self.move_manifest_item(item)

        for item in self.oeb.manifest:
            self.rewrite_links_in(item)

View File

@ -6,9 +6,9 @@ from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
import sys, os, logging
import os
from calibre.ebooks.oeb.base import OPF_MIME, xml2str
from calibre.ebooks.oeb.base import DirContainer, OEBBook
from calibre.ebooks.oeb.base import DirContainer, OEBError
__all__ = ['OEBWriter']