# Matches CSS ``url(...)`` references; group 1 is the raw text between the
# parentheses (any quotes are part of the group).
_css_url_re = re.compile(r'url\((.*?)\)', re.I)
# Matches ``@import "..."``; only the double-quoted form is recognised.
_css_import_re = re.compile(r'@import "(.*?)"')
# Splits the space separated ``archive`` attribute of <object> into entries.
_archive_re = re.compile(r'[^ ]+')


def iterlinks(root):
    '''
    Iterate over all links in an OEB Document.

    Yields ``(element, attribute, link, pos)`` tuples. ``attribute`` is
    ``None`` when the link was found in the text of a ``<style>`` element
    rather than in an attribute. ``pos`` is the offset of the link inside
    the attribute value (or element text).

    For ``<object>`` elements that carry a ``codebase`` attribute, the
    yielded ``classid``/``data``/``archive`` links are already joined with
    that codebase, so they are not necessarily the literal attribute text.

    :param root: A valid lxml.etree element.
    '''
    assert etree.iselement(root)
    link_attrs = set(html.defs.link_attrs)
    link_attrs.add(XLINK('href'))

    for el in root.iter():
        attribs = el.attrib

        if el.tag == XHTML('object'):
            codebase = None
            ## <object> tags have attributes that are relative to
            ## codebase
            if 'codebase' in attribs:
                codebase = el.get('codebase')
                yield (el, 'codebase', codebase, 0)
            for attrib in 'classid', 'data':
                if attrib in attribs:
                    value = el.get(attrib)
                    if codebase is not None:
                        value = urljoin(codebase, value)
                    yield (el, attrib, value, 0)
            if 'archive' in attribs:
                for match in _archive_re.finditer(el.get('archive')):
                    value = match.group(0)
                    if codebase is not None:
                        value = urljoin(codebase, value)
                    yield (el, 'archive', value, match.start())
        else:
            for attr in attribs:
                if attr in link_attrs:
                    yield (el, attr, attribs[attr], 0)

        if el.tag == XHTML('style') and el.text:
            for match in _css_url_re.finditer(el.text):
                yield (el, None, match.group(1), match.start(1))
            for match in _css_import_re.finditer(el.text):
                yield (el, None, match.group(1), match.start(1))
        if 'style' in attribs:
            for match in _css_url_re.finditer(attribs['style']):
                yield (el, 'style', match.group(1), match.start(1))


def make_links_absolute(root, base_url, resolve_base_href=True):
    '''
    Make all links in the document absolute, given the
    ``base_url`` for the document (the full URL where the document
    came from).

    :param root: The lxml element whose subtree is rewritten in place.
    :param base_url: URL every link is joined against with ``urljoin``.
    :param resolve_base_href: Forwarded to :func:`rewrite_links`; when
        true, ``<base href>`` tags are resolved and removed first.
    '''
    def link_repl(href):
        return urljoin(base_url, href)
    rewrite_links(root, link_repl, resolve_base_href=resolve_base_href)


def _remove_element(el):
    '''
    Remove ``el`` (with its children and text) from its tree, keeping its
    tail text attached to the previous sibling or the parent.
    '''
    drop_tree = getattr(el, 'drop_tree', None)
    if drop_tree is not None:
        # lxml.html elements already implement exactly this
        drop_tree()
        return
    # Plain lxml.etree elements have no drop_tree(); do it by hand.
    parent = el.getparent()
    if parent is None:
        return
    if el.tail:
        previous = el.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + el.tail
        else:
            parent.text = (parent.text or '') + el.tail
    parent.remove(el)


def resolve_base_href(root):
    '''
    Remove every ``<base href="...">`` tag (namespaced or not) from the
    document and make all links absolute relative to the ``href`` of the
    last such tag. Does nothing else if no non-empty ``href`` was found.
    '''
    base_href = None
    basetags = root.xpath('//base[@href]|//h:base[@href]',
            namespaces=XPNSMAP)
    for b in basetags:
        base_href = b.get('href')
        _remove_element(b)
    if not base_href:
        return
    # The base tags are gone already, no need to look for them again
    make_links_absolute(root, base_href, resolve_base_href=False)

# Alias for use inside functions whose ``resolve_base_href`` *parameter*
# shadows the function of the same name.
_resolve_base_href = resolve_base_href


def rewrite_links(root, link_repl_func, resolve_base_href=True):
    '''
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.

    :param root: The lxml element whose subtree is rewritten in place.
    :param link_repl_func: Callable receiving the whitespace-stripped link.
    :param resolve_base_href: When true, ``<base href>`` tags are resolved
        and removed before the links are rewritten.
    '''
    if resolve_base_href:
        _resolve_base_href(root)

    # (element, attribute) -> [(original offset, length delta), ...] for
    # the replacements already applied to that text.
    edits = {}
    # (element, attribute) pairs whose text/attribute has been removed.
    removed = set()

    # Snapshot the links first: the offsets then all refer to the original
    # text, and the tree is not mutated while the generator walks it.
    for el, attrib, link, pos in list(iterlinks(root)):
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        key = (el, attrib)
        if key in removed:
            # Nothing left to rewrite in this attribute/text
            continue
        if new_link is None:
            # Remove the attribute or element content
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            removed.add(key)
            continue
        # Earlier replacements in the same text may have changed its
        # length; shift the original offset accordingly.
        start = pos + sum(delta for p, delta in edits.get(key, ())
                          if p < pos)
        end = start + len(link)
        if attrib is None:
            cur = el.text
            el.text = cur[:start] + new_link + cur[end:]
        else:
            cur = el.attrib[attrib]
            el.attrib[attrib] = cur[:start] + new_link + cur[end:]
        edits.setdefault(key, []).append((pos, len(new_link) - len(link)))
""" - if urlparse(href).scheme: + purl = urlparse(href) + scheme = purl.scheme + if scheme and scheme != 'file': return href + purl = list(purl) + purl[0] = '' + href = urlunparse(purl) path, frag = urldefrag(href) if not path: return '#'.join((self.href, frag)) diff --git a/src/calibre/ebooks/oeb/output.py b/src/calibre/ebooks/oeb/output.py index fc1366fbcd..ea986f49fa 100644 --- a/src/calibre/ebooks/oeb/output.py +++ b/src/calibre/ebooks/oeb/output.py @@ -22,6 +22,7 @@ class OEBOutput(OutputFormatPlugin): if not os.path.exists(output_path): os.makedirs(output_path) from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME + from calibre.ebooks.html import tostring as html_tostring with CurrentDir(output_path): results = oeb_book.to_opf2(page_map=True) for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME): @@ -42,9 +43,8 @@ class OEBOutput(OutputFormatPlugin): if hasattr(raw, 'cssText'): raw = raw.cssText else: - raw = etree.tostring(raw, encoding='utf-8', + raw = html_tostring(raw, pretty_print=opts.pretty_print) - raw = '\n'+raw if isinstance(raw, unicode): raw = raw.encode('utf-8') with open(path, 'wb') as f: diff --git a/src/calibre/ebooks/oeb/reader.py b/src/calibre/ebooks/oeb/reader.py index f4430ac07c..0c5a4ad97c 100644 --- a/src/calibre/ebooks/oeb/reader.py +++ b/src/calibre/ebooks/oeb/reader.py @@ -7,18 +7,21 @@ __license__ = 'GPL v3' __copyright__ = '2008, Marshall T. 
Vandegrift ' import sys, os, uuid, copy -from itertools import izip, chain +from itertools import izip from urlparse import urldefrag, urlparse from urllib import unquote as urlunquote from mimetypes import guess_type from collections import defaultdict + from lxml import etree +import cssutils + from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \ DC_NSES, OPF from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \ PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME -from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, CSSURL_RE, \ - ENTITY_RE, LINK_SELECTORS, MS_COVER_TYPE +from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \ + ENTITY_RE, MS_COVER_TYPE, iterlinks from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath, \ urlnormalize, BINARY_MIME, \ OEBError, OEBBook, DirContainer @@ -191,8 +194,8 @@ class OEBReader(object): if (item.media_type in OEB_DOCS or item.media_type[-4:] in ('/xml', '+xml')) and \ item.data is not None: - hrefs = [sel(item.data) for sel in LINK_SELECTORS] - for href in chain(*hrefs): + hrefs = [r[2] for r in iterlinks(item.data)] + for href in hrefs: href, _ = urldefrag(href) if not href: continue @@ -201,8 +204,8 @@ class OEBReader(object): if not scheme and href not in known: new.add(href) elif item.media_type in OEB_STYLES: - for match in CSSURL_RE.finditer(item.data.cssText): - href, _ = urldefrag(match.group('url')) + for url in cssutils.getUrls(item.data): + href, _ = urldefrag(url) href = item.abshref(urlnormalize(href)) scheme = urlparse(href).scheme if not scheme and href not in known: diff --git a/src/calibre/ebooks/oeb/transforms/package.py b/src/calibre/ebooks/oeb/transforms/package.py new file mode 100644 index 0000000000..d8fb485dde --- /dev/null +++ b/src/calibre/ebooks/oeb/transforms/package.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' 
class Package(object):

    '''
    Move all the parts of an OEB into a folder structure rooted
    at the specified folder. All links in recognized content types
    are processed, the linked to resources are copied into the local
    folder tree and all references to those resources are updated.

    The created folder structure is

    Base directory(OPF, NCX) -- content (XHTML) -- resources (CSS, Images, etc)

    NOTE(review): only the new hrefs are computed so far; nothing is
    copied and no links are rewritten yet.
    '''

    def __init__(self, base='.'):
        ':param base: The base folder at which the OEB will be rooted'
        self.new_base_path = os.path.abspath(base)

    def rewrite_links_in(self, item):
        '''
        Placeholder for rewriting the links inside ``item``.

        :return: Currently always an empty list.
        '''
        new_items = []
        return new_items

    def move_manifest_item(self, item):
        '''
        Compute where ``item`` lives in the new folder structure.

        Documents (media type in ``OEB_DOCS``) go directly into
        ``content/``; everything else goes into ``content/resources/``.
        Only the last component of the old href is kept.

        :return: The new href of the item.
        '''
        item.data # Make sure the data has been loaded and cached
        parts = item.href.split('/')
        # Not used yet. NOTE(review): presumably the source path for the
        # copy that is still to be implemented -- confirm.
        old_abspath = os.path.join(self.old_base_path, *parts)
        bname = parts[-1]
        subdir = '' if item.media_type in OEB_DOCS else 'resources/'
        new_href = 'content/' + subdir + bname
        return new_href

    def __call__(self, oeb, context):
        '''
        Run the transform on ``oeb``.

        :param oeb: The book; its ``container.rootdir`` and ``manifest``
            are read.
        :param context: Unused here.
        '''
        self.oeb = oeb
        self.map = {}
        self.old_base_path = os.path.abspath(oeb.container.rootdir)

        for item in self.oeb.manifest:
            # NOTE(review): old href -> new href; assumed to be what
            # ``self.map`` is meant to hold -- confirm. Items sharing a
            # basename would collide in the new layout.
            self.map[item.href] = self.move_manifest_item(item)

        for item in self.oeb.manifest:
            self.rewrite_links_in(item)
Vandegrift ' -import sys, os, logging +import os from calibre.ebooks.oeb.base import OPF_MIME, xml2str -from calibre.ebooks.oeb.base import DirContainer, OEBBook +from calibre.ebooks.oeb.base import DirContainer, OEBError __all__ = ['OEBWriter'] @@ -18,7 +18,7 @@ class OEBWriter(object): TRANSFORMS = [] """List of transforms to apply to content written with this Writer.""" - + def __init__(self, version='2.0', page_map=False, pretty_print=False): self.version = version self.page_map = page_map @@ -46,7 +46,7 @@ class OEBWriter(object): pretty_print = opts.pretty_print return cls(version=version, page_map=page_map, pretty_print=pretty_print) - + def __call__(self, oeb, path): """Read the book in the :class:`OEBBook` object :param:`oeb` to a file at :param:`path`.