mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Make iterating over links in XML and CSS documents more robust
This commit is contained in:
parent
1d7e56c9d8
commit
b2bfab32cf
@ -7,14 +7,16 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import os, re, uuid
|
import os, re, uuid, logging
|
||||||
from mimetypes import types_map
|
from mimetypes import types_map
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from itertools import count
|
from itertools import count
|
||||||
from urlparse import urldefrag, urlparse, urlunparse
|
from urlparse import urldefrag, urlparse, urlunparse
|
||||||
from urllib import unquote as urlunquote
|
from urllib import unquote as urlunquote
|
||||||
import logging
|
from urlparse import urljoin
|
||||||
|
|
||||||
from lxml import etree, html
|
from lxml import etree, html
|
||||||
|
|
||||||
import calibre
|
import calibre
|
||||||
from cssutils import CSSParser
|
from cssutils import CSSParser
|
||||||
from calibre.translations.dynamic import translate
|
from calibre.translations.dynamic import translate
|
||||||
@ -77,16 +79,117 @@ def XLINK(name):
|
|||||||
def CALIBRE(name):
|
def CALIBRE(name):
|
||||||
return '{%s}%s' % (CALIBRE_NS, name)
|
return '{%s}%s' % (CALIBRE_NS, name)
|
||||||
|
|
||||||
def LINK_SELECTORS():
|
_css_url_re = re.compile(r'url\((.*?)\)', re.I)
|
||||||
results = []
|
_css_import_re = re.compile(r'@import "(.*?)"')
|
||||||
for expr in ('h:head/h:link/@href', 'h:body//h:a/@href',
|
_archive_re = re.compile(r'[^ ]+')
|
||||||
'h:body//h:img/@src', 'h:body//h:object/@data',
|
|
||||||
'h:body//*/@xl:href', '//ncx:content/@src',
|
def iterlinks(root):
|
||||||
'o2:page/@href'):
|
'''
|
||||||
results.append(etree.XPath(expr, namespaces=XPNSMAP))
|
Iterate over all links in a OEB Document.
|
||||||
return results
|
|
||||||
|
:param root: A valid lxml.etree element.
|
||||||
|
'''
|
||||||
|
assert etree.iselement(root)
|
||||||
|
link_attrs = set(html.defs.link_attrs)
|
||||||
|
link_attrs.add(XLINK('href'))
|
||||||
|
|
||||||
|
for el in root.iter():
|
||||||
|
attribs = el.attrib
|
||||||
|
|
||||||
|
if el.tag == XHTML('object'):
|
||||||
|
codebase = None
|
||||||
|
## <object> tags have attributes that are relative to
|
||||||
|
## codebase
|
||||||
|
if 'codebase' in attribs:
|
||||||
|
codebase = el.get('codebase')
|
||||||
|
yield (el, 'codebase', codebase, 0)
|
||||||
|
for attrib in 'classid', 'data':
|
||||||
|
if attrib in attribs:
|
||||||
|
value = el.get(attrib)
|
||||||
|
if codebase is not None:
|
||||||
|
value = urljoin(codebase, value)
|
||||||
|
yield (el, attrib, value, 0)
|
||||||
|
if 'archive' in attribs:
|
||||||
|
for match in _archive_re.finditer(el.get('archive')):
|
||||||
|
value = match.group(0)
|
||||||
|
if codebase is not None:
|
||||||
|
value = urljoin(codebase, value)
|
||||||
|
yield (el, 'archive', value, match.start())
|
||||||
|
else:
|
||||||
|
for attr in attribs:
|
||||||
|
if attr in link_attrs:
|
||||||
|
yield (el, attr, attribs[attr], 0)
|
||||||
|
|
||||||
|
|
||||||
|
if el.tag == XHTML('style') and el.text:
|
||||||
|
for match in _css_url_re.finditer(el.text):
|
||||||
|
yield (el, None, match.group(1), match.start(1))
|
||||||
|
for match in _css_import_re.finditer(el.text):
|
||||||
|
yield (el, None, match.group(1), match.start(1))
|
||||||
|
if 'style' in attribs:
|
||||||
|
for match in _css_url_re.finditer(attribs['style']):
|
||||||
|
yield (el, 'style', match.group(1), match.start(1))
|
||||||
|
|
||||||
|
def make_links_absolute(root, base_url, resolve_base_href=True):
    '''
    Make all links in the document absolute, given the
    ``base_url`` for the document (the full URL where the document
    came from)

    :param root: The lxml element whose links are rewritten in place.
    :param base_url: URL every link is joined against with ``urljoin``.
    :param resolve_base_href: If true, any ``<base href>`` tag is applied
        and removed first (see :func:`resolve_base_href`).
    '''
    def link_repl(href):
        return urljoin(base_url, href)
    rewrite_links(root, link_repl, resolve_base_href=resolve_base_href)


def _remove_keeping_tail(el):
    '''
    Remove ``el`` from its parent while keeping the text that follows it.

    Equivalent to ``lxml.html``'s ``drop_tree()``, but also usable on plain
    ``lxml.etree`` elements, which do not provide that method.
    '''
    parent = el.getparent()
    if parent is None:
        return
    if el.tail:
        # parent.remove() would discard the tail together with the element,
        # so hand it to whatever precedes the element first.
        previous = el.getprevious()
        if previous is None:
            parent.text = (parent.text or '') + el.tail
        else:
            previous.tail = (previous.tail or '') + el.tail
    parent.remove(el)


def resolve_base_href(root):
    '''
    Find any ``<base href>`` tag in the document, make all links absolute
    relative to it and remove the tag. If several such tags exist, the
    ``href`` of the last one is used. Does nothing if there is no usable
    ``href``.
    '''
    base_href = None
    basetags = root.xpath('//base[@href]|//h:base[@href]',
            namespaces=XPNSMAP)
    for b in basetags:
        base_href = b.get('href')
        _remove_keeping_tail(b)
    if not base_href:
        return
    # The base tags are gone already, so do not look for them again.
    make_links_absolute(root, base_href, resolve_base_href=False)


# rewrite_links() has a parameter called ``resolve_base_href`` which hides the
# function of the same name inside its body; keep an alias it can call.
_resolve_base_href = resolve_base_href


def rewrite_links(root, link_repl_func, resolve_base_href=True):
    '''
    Rewrite all the links in the document. For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``). They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.

    :param root: The lxml element whose links are rewritten in place.
    :param link_repl_func: Callable taking the stripped link text and
        returning the replacement, or None to remove it.
    :param resolve_base_href: If true, apply and remove any ``<base href>``
        tag before rewriting.
    '''
    if resolve_base_href:
        _resolve_base_href(root)

    # Pass 1: call link_repl_func once per link, in document order, and group
    # the requested changes by the text/attribute they apply to. Nothing is
    # modified yet, so every offset still refers to the original value.
    order = []
    edits = {}
    for el, attrib, link, pos in list(iterlinks(root)):
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        key = (el, attrib)
        if key not in edits:
            edits[key] = []
            order.append(key)
        edits[key].append((pos, link, new_link))

    # Pass 2: apply the changes.
    for key in order:
        el, attrib = key
        changes = edits[key]
        if any(new is None for _pos, _old, new in changes):
            # Remove the attribute or element content
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            continue
        if attrib is None:
            cur = el.text
        else:
            cur = el.attrib[attrib]
        # Work from the end of the string towards the start so that a
        # replacement of different length never shifts an offset that is
        # still to be used.
        changes.sort(key=lambda change: change[0], reverse=True)
        for pos, link, new_link in changes:
            cur = cur[:pos] + new_link + cur[pos+len(link):]
        if attrib is None:
            el.text = cur
        else:
            el.attrib[attrib] = cur
|
||||||
|
|
||||||
LINK_SELECTORS = LINK_SELECTORS()
|
|
||||||
|
|
||||||
EPUB_MIME = types_map['.epub']
|
EPUB_MIME = types_map['.epub']
|
||||||
XHTML_MIME = types_map['.xhtml']
|
XHTML_MIME = types_map['.xhtml']
|
||||||
@ -199,7 +302,7 @@ def urlnormalize(href):
|
|||||||
characters URL quoted.
|
characters URL quoted.
|
||||||
"""
|
"""
|
||||||
parts = urlparse(href)
|
parts = urlparse(href)
|
||||||
if not parts.scheme:
|
if not parts.scheme or parts.scheme == 'file':
|
||||||
path, frag = urldefrag(href)
|
path, frag = urldefrag(href)
|
||||||
parts = ('', '', path, '', '', frag)
|
parts = ('', '', path, '', '', frag)
|
||||||
parts = (part.replace('\\', '/') for part in parts)
|
parts = (part.replace('\\', '/') for part in parts)
|
||||||
@ -724,7 +827,7 @@ class Manifest(object):
|
|||||||
if isinstance(data, unicode):
|
if isinstance(data, unicode):
|
||||||
return data.encode('utf-8')
|
return data.encode('utf-8')
|
||||||
return str(data)
|
return str(data)
|
||||||
|
|
||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
data = self.data
|
data = self.data
|
||||||
if isinstance(data, etree._Element):
|
if isinstance(data, etree._Element):
|
||||||
@ -778,8 +881,13 @@ class Manifest(object):
|
|||||||
"""Convert the URL provided in :param:`href` from a reference
|
"""Convert the URL provided in :param:`href` from a reference
|
||||||
relative to this manifest item to a book-absolute reference.
|
relative to this manifest item to a book-absolute reference.
|
||||||
"""
|
"""
|
||||||
if urlparse(href).scheme:
|
purl = urlparse(href)
|
||||||
|
scheme = purl.scheme
|
||||||
|
if scheme and scheme != 'file':
|
||||||
return href
|
return href
|
||||||
|
purl = list(purl)
|
||||||
|
purl[0] = ''
|
||||||
|
href = urlunparse(purl)
|
||||||
path, frag = urldefrag(href)
|
path, frag = urldefrag(href)
|
||||||
if not path:
|
if not path:
|
||||||
return '#'.join((self.href, frag))
|
return '#'.join((self.href, frag))
|
||||||
|
@ -22,6 +22,7 @@ class OEBOutput(OutputFormatPlugin):
|
|||||||
if not os.path.exists(output_path):
|
if not os.path.exists(output_path):
|
||||||
os.makedirs(output_path)
|
os.makedirs(output_path)
|
||||||
from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME
|
from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME
|
||||||
|
from calibre.ebooks.html import tostring as html_tostring
|
||||||
with CurrentDir(output_path):
|
with CurrentDir(output_path):
|
||||||
results = oeb_book.to_opf2(page_map=True)
|
results = oeb_book.to_opf2(page_map=True)
|
||||||
for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
|
for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
|
||||||
@ -42,9 +43,8 @@ class OEBOutput(OutputFormatPlugin):
|
|||||||
if hasattr(raw, 'cssText'):
|
if hasattr(raw, 'cssText'):
|
||||||
raw = raw.cssText
|
raw = raw.cssText
|
||||||
else:
|
else:
|
||||||
raw = etree.tostring(raw, encoding='utf-8',
|
raw = html_tostring(raw,
|
||||||
pretty_print=opts.pretty_print)
|
pretty_print=opts.pretty_print)
|
||||||
raw = '<?xml version="1.0" encoding="utf-8" ?>\n'+raw
|
|
||||||
if isinstance(raw, unicode):
|
if isinstance(raw, unicode):
|
||||||
raw = raw.encode('utf-8')
|
raw = raw.encode('utf-8')
|
||||||
with open(path, 'wb') as f:
|
with open(path, 'wb') as f:
|
||||||
|
@ -7,18 +7,21 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
||||||
|
|
||||||
import sys, os, uuid, copy
|
import sys, os, uuid, copy
|
||||||
from itertools import izip, chain
|
from itertools import izip
|
||||||
from urlparse import urldefrag, urlparse
|
from urlparse import urldefrag, urlparse
|
||||||
from urllib import unquote as urlunquote
|
from urllib import unquote as urlunquote
|
||||||
from mimetypes import guess_type
|
from mimetypes import guess_type
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
import cssutils
|
||||||
|
|
||||||
from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \
|
from calibre.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \
|
||||||
DC_NSES, OPF
|
DC_NSES, OPF
|
||||||
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \
|
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \
|
||||||
PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME
|
PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME
|
||||||
from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, CSSURL_RE, \
|
from calibre.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \
|
||||||
ENTITY_RE, LINK_SELECTORS, MS_COVER_TYPE
|
ENTITY_RE, MS_COVER_TYPE, iterlinks
|
||||||
from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath, \
|
from calibre.ebooks.oeb.base import namespace, barename, qname, XPath, xpath, \
|
||||||
urlnormalize, BINARY_MIME, \
|
urlnormalize, BINARY_MIME, \
|
||||||
OEBError, OEBBook, DirContainer
|
OEBError, OEBBook, DirContainer
|
||||||
@ -191,8 +194,8 @@ class OEBReader(object):
|
|||||||
if (item.media_type in OEB_DOCS or
|
if (item.media_type in OEB_DOCS or
|
||||||
item.media_type[-4:] in ('/xml', '+xml')) and \
|
item.media_type[-4:] in ('/xml', '+xml')) and \
|
||||||
item.data is not None:
|
item.data is not None:
|
||||||
hrefs = [sel(item.data) for sel in LINK_SELECTORS]
|
hrefs = [r[2] for r in iterlinks(item.data)]
|
||||||
for href in chain(*hrefs):
|
for href in hrefs:
|
||||||
href, _ = urldefrag(href)
|
href, _ = urldefrag(href)
|
||||||
if not href:
|
if not href:
|
||||||
continue
|
continue
|
||||||
@ -201,8 +204,8 @@ class OEBReader(object):
|
|||||||
if not scheme and href not in known:
|
if not scheme and href not in known:
|
||||||
new.add(href)
|
new.add(href)
|
||||||
elif item.media_type in OEB_STYLES:
|
elif item.media_type in OEB_STYLES:
|
||||||
for match in CSSURL_RE.finditer(item.data.cssText):
|
for url in cssutils.getUrls(item.data):
|
||||||
href, _ = urldefrag(match.group('url'))
|
href, _ = urldefrag(url)
|
||||||
href = item.abshref(urlnormalize(href))
|
href = item.abshref(urlnormalize(href))
|
||||||
scheme = urlparse(href).scheme
|
scheme = urlparse(href).scheme
|
||||||
if not scheme and href not in known:
|
if not scheme and href not in known:
|
||||||
|
52
src/calibre/ebooks/oeb/transforms/package.py
Normal file
52
src/calibre/ebooks/oeb/transforms/package.py
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
from __future__ import with_statement
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import os, shutil
|
||||||
|
|
||||||
|
from calibre.ebooks.oeb.base import OEB_DOCS
|
||||||
|
|
||||||
|
class Package(object):

    '''
    Move all the parts of an OEB into a folder structure rooted
    at the specified folder. All links in recognized content types
    are processed, the linked to resources are copied into the local
    folder tree and all references to those resources are updated.

    The created folder structure is

    Base directory(OPF, NCX) -- content (XHTML) -- resources (CSS, Images, etc)

    '''

    def __init__(self, base='.'):
        ':param base: The base folder at which the OEB will be rooted'
        self.new_base_path = os.path.abspath(base)

    def rewrite_links_in(self, item):
        '''
        Placeholder for link rewriting in ``item``; currently changes
        nothing and returns an empty list.
        '''
        new_items = []
        return new_items

    def move_manifest_item(self, item):
        '''
        Compute where ``item`` belongs in the new folder structure.

        :return: The new href: ``content/<name>`` for documents and
            ``content/resources/<name>`` for everything else.
        '''
        item.data # Make sure the data has been loaded and cached
        # NOTE(review): the old location is computed but nothing is moved or
        # copied yet.
        old_abspath = os.path.join(self.old_base_path, *item.href.split('/'))
        bname = item.href.split('/')[-1]
        # Documents live directly in content/, everything else goes one
        # level deeper, as laid out in the class docstring.
        new_href = 'content/' + \
            ('' if item.media_type in OEB_DOCS else 'resources/')+bname
        return new_href

    def __call__(self, oeb, context):
        '''
        Run the transform over every item in the manifest of ``oeb``.
        ``context`` is not used.
        '''
        self.map = {}
        # The loops below read the book through self.oeb.
        self.oeb = oeb
        self.old_base_path = os.path.abspath(oeb.container.rootdir)

        for item in self.oeb.manifest:
            self.move_manifest_item(item)

        for item in self.oeb.manifest:
            self.rewrite_links_in(item)
|
||||||
|
|
||||||
|
|
@ -6,9 +6,9 @@ from __future__ import with_statement
|
|||||||
__license__ = 'GPL v3'
|
__license__ = 'GPL v3'
|
||||||
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
||||||
|
|
||||||
import sys, os, logging
|
import os
|
||||||
from calibre.ebooks.oeb.base import OPF_MIME, xml2str
|
from calibre.ebooks.oeb.base import OPF_MIME, xml2str
|
||||||
from calibre.ebooks.oeb.base import DirContainer, OEBBook
|
from calibre.ebooks.oeb.base import DirContainer, OEBError
|
||||||
|
|
||||||
__all__ = ['OEBWriter']
|
__all__ = ['OEBWriter']
|
||||||
|
|
||||||
@ -18,7 +18,7 @@ class OEBWriter(object):
|
|||||||
|
|
||||||
TRANSFORMS = []
|
TRANSFORMS = []
|
||||||
"""List of transforms to apply to content written with this Writer."""
|
"""List of transforms to apply to content written with this Writer."""
|
||||||
|
|
||||||
def __init__(self, version='2.0', page_map=False, pretty_print=False):
|
def __init__(self, version='2.0', page_map=False, pretty_print=False):
|
||||||
self.version = version
|
self.version = version
|
||||||
self.page_map = page_map
|
self.page_map = page_map
|
||||||
@ -46,7 +46,7 @@ class OEBWriter(object):
|
|||||||
pretty_print = opts.pretty_print
|
pretty_print = opts.pretty_print
|
||||||
return cls(version=version, page_map=page_map,
|
return cls(version=version, page_map=page_map,
|
||||||
pretty_print=pretty_print)
|
pretty_print=pretty_print)
|
||||||
|
|
||||||
def __call__(self, oeb, path):
|
def __call__(self, oeb, path):
|
||||||
"""Read the book in the :class:`OEBBook` object :param:`oeb` to a file
|
"""Read the book in the :class:`OEBBook` object :param:`oeb` to a file
|
||||||
at :param:`path`.
|
at :param:`path`.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user