This commit is contained in:
Kovid Goyal 2009-02-03 11:20:27 -08:00
commit 072a063be3
6 changed files with 186 additions and 64 deletions

View File

@ -21,6 +21,9 @@ mimetypes.add_type('application/epub+zip', '.epub')
# Register additional e-book and font MIME types with the stdlib
# ``mimetypes`` table, one (type, extension) pair per row.
for _mime_type, _extension in (
        ('text/x-sony-bbeb+xml', '.lrs'),
        ('application/x-sony-bbeb', '.lrf'),
        ('application/x-dtbncx+xml', '.ncx'),
        ('application/adobe-page-template+xml', '.xpgt'),
        ('application/x-font-opentype', '.otf'),
        ('application/x-font-truetype', '.ttf'),
):
    mimetypes.add_type(_mime_type, _extension)
# Do not leave the loop variables behind in the module namespace.
del _mime_type, _extension
def to_unicode(raw, encoding='utf-8', errors='strict'):
if isinstance(raw, unicode):

View File

@ -23,7 +23,7 @@ from urllib import unquote as urlunquote
from lxml import etree
from calibre.ebooks.lit.reader import DirectoryEntry
import calibre.ebooks.lit.maps as maps
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_CSS_MIME, \
from calibre.ebooks.oeb.base import OEB_DOCS, XHTML_MIME, OEB_STYLES, \
CSS_MIME, OPF_MIME, XML_NS, XML
from calibre.ebooks.oeb.base import namespace, barename, prefixname, \
urlnormalize, xpath
@ -474,7 +474,7 @@ class LitWriter(object):
name = '/data/' + item.id
data = item.data
secnum = 0
if not isinstance(data, basestring):
if isinstance(data, etree._Element):
self._add_folder(name)
rebin = ReBinary(data, item, self._oeb, map=HTML_MAP)
self._add_file(name + '/ahc', rebin.ahc, 0)
@ -483,6 +483,8 @@ class LitWriter(object):
data = rebin.content
name = name + '/content'
secnum = 1
elif isinstance(data, unicode):
data = data.encode('utf-8')
self._add_file(name, data, secnum)
item.size = len(data)
@ -493,7 +495,7 @@ class LitWriter(object):
if item.spine_position is not None:
key = 'linear' if item.linear else 'nonlinear'
manifest[key].append(item)
elif item.media_type == CSS_MIME:
elif item.media_type in OEB_STYLES:
manifest['css'].append(item)
elif item.media_type in LIT_IMAGES:
manifest['images'].append(item)
@ -506,6 +508,11 @@ class LitWriter(object):
data.write(pack('<I', len(items)))
for item in items:
id, media_type = item.id, item.media_type
if media_type in OEB_DOCS:
# Needs to have 'html' in media-type
media_type = XHTML_MIME
elif media_type in OEB_STYLES:
media_type = CSS_MIME
href = urlunquote(item.href)
item.offset = offset \
if state in ('linear', 'nonlinear') else 0
@ -525,7 +532,12 @@ class LitWriter(object):
pb3 = StringIO()
pb3cur = 0
bits = 0
linear = []
nonlinear = []
for item in self._oeb.spine:
dest = linear if item.linear else nonlinear
dest.append(item)
for item in chain(linear, nonlinear):
page_breaks = copy.copy(item.page_breaks)
if not item.linear:
page_breaks.insert(0, (0, []))

View File

@ -62,6 +62,16 @@ def SVG(name): return '{%s}%s' % (SVG_NS, name)
def XLINK(name): return '{%s}%s' % (XLINK_NS, name)
def CALIBRE(name): return '{%s}%s' % (CALIBRE_NS, name)
def LINK_SELECTORS():
    """Compile the XPath selectors that pick out link-bearing attributes.

    Returns a list of ``etree.XPath`` objects, one per expression and in
    the order listed, each compiled with the ``XPNSMAP`` prefix mapping.
    """
    expressions = (
        'h:head/h:link/@href', 'h:body//h:a/@href',
        'h:body//h:img/@src', 'h:body//h:object/@data',
        'h:body//*/@xl:href', '//ncx:content/@src',
        'o2:page/@href',
    )
    return [etree.XPath(expr, namespaces=XPNSMAP) for expr in expressions]
# The factory is called exactly once; afterwards the name refers to the
# list of compiled selectors rather than to the function.
LINK_SELECTORS = LINK_SELECTORS()
# MIME type strings for EPUB containers, XHTML documents and CSS stylesheets.
EPUB_MIME = 'application/epub+zip'
XHTML_MIME = 'application/xhtml+xml'
CSS_MIME = 'text/css'
@ -89,6 +99,10 @@ COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
# Matches a whole string of the form '{namespace}localname'.
QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$')
# Matches the start of a 'prefix:name' style string (anchored at the
# beginning only).
PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
# Matches a leading '<?xml ... ?>' declaration, allowing whitespace before it.
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
# Matches a CSS 'url(...)' reference. The optional opening quote is captured
# as group 'q' and must be repeated before the closing parenthesis; the
# target is captured as group 'url'.
CSSURL_RE = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''')
# lxml parser created with recover=True, for markup the strict parser rejects.
RECOVER_PARSER = etree.XMLParser(recover=True)
def element(parent, *args, **kwargs):
if parent is not None:
@ -140,14 +154,17 @@ def xml2str(root):
return etree.tostring(root, encoding='utf-8', xml_declaration=True)
# Character classes consulted by urlquote().
#
# URL_UNSAFE is a two-element list indexed by the kind of input string:
#   URL_UNSAFE[0] -- for unicode input: only ASCII characters outside
#                    URL_SAFE are candidates for quoting.
#   URL_UNSAFE[1] -- for byte-string input: every one of the 256 byte values
#                    outside URL_SAFE is a candidate for quoting.
# (The earlier set-valued URL_UNSAFE and unicode-literal URL_SAFE were
# overwritten immediately and have been dropped.)
ASCII_CHARS = set(chr(x) for x in range(128))
UNIBYTE_CHARS = set(chr(x) for x in range(256))
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-/~')
URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE]
def urlquote(href):
    """Percent-quote the URL-unsafe characters of ``href``.

    Each character found in the applicable unsafe set is replaced by
    ``%xx`` (two lower-case hex digits of ``ord(char)``); every other
    character is copied through unchanged.

    The unsafe set depends on the input type: ``URL_UNSAFE[0]`` is used for
    ``unicode`` input and ``URL_UNSAFE[1]`` for anything else (byte
    strings).

    Returns the quoted string.
    """
    # URL_UNSAFE is a two-element list, so membership must be tested against
    # the selected set, never against the list itself.
    unsafe = URL_UNSAFE[0 if isinstance(href, unicode) else 1]
    result = []
    for char in href:
        if char in unsafe:
            char = "%%%02x" % ord(char)
        result.append(char)
    return ''.join(result)
@ -185,7 +202,7 @@ class AbstractContainer(object):
class DirContainer(AbstractContainer):
def __init__(self, rootdir):
self.rootdir = rootdir
self.rootdir = unicode(rootdir)
def read(self, path):
path = os.path.join(self.rootdir, path)
@ -205,16 +222,23 @@ class DirContainer(AbstractContainer):
return os.path.isfile(urlunquote(path))
class DirWriter(object):
    """Write an OEB book into a directory as individual files."""

    def __init__(self, version='2.0', page_map=False):
        """Remember the output options.

        version  -- OPF version to emit; only its leading digit is used
                    (``'1.2'`` -> OPF 1, ``'2.0'`` -> OPF 2).
        page_map -- forwarded to ``oeb.to_opf2(page_map=...)`` when writing
                    OPF 2 metadata.
        """
        self.version = version
        self.page_map = page_map

    def dump(self, oeb, path):
        """Write every manifest item and the OPF metadata of ``oeb`` to ``path``.

        The directory is created when it does not exist yet. Manifest items
        are serialised with ``str(item)``; metadata documents with
        ``xml2str``.

        Raises OEBError when the configured version is neither 1.x nor 2.x.
        """
        # Coerce through str() so numeric versions (2, 2.0) keep working, and
        # reject bad versions before anything is written to disk.
        try:
            version = int(str(self.version)[0])
        except (IndexError, ValueError):
            version = None
        if version not in (1, 2):
            raise OEBError("Unrecognized OPF version %r" % self.version)
        if not os.path.isdir(path):
            os.mkdir(path)
        output = DirContainer(path)
        for item in oeb.manifest.values():
            output.write(item.href, str(item))
        if version == 1:
            metadata = oeb.to_opf1()
        else:
            metadata = oeb.to_opf2(page_map=self.page_map)
        for href, data in metadata.values():
            output.write(href, xml2str(data))
@ -455,7 +479,6 @@ class Manifest(object):
# Convert to Unicode and normalize line endings
data = self.oeb.decode(data)
data = XMLDECL_RE.sub('', data)
data = data.replace('\r\n', '\n').replace('\r', '\n')
# Handle broken XHTML w/ SVG (ugh)
if 'svg:' in data and SVG_NS not in data:
data = data.replace(
@ -480,7 +503,10 @@ class Manifest(object):
if elem.text:
elem.text = elem.text.strip('-')
data = etree.tostring(data, encoding=unicode)
try:
data = etree.fromstring(data)
except etree.XMLSyntaxError:
data = etree.fromstring(data, parser=RECOVER_PARSER)
# Force into the XHTML namespace
if barename(data.tag) != 'html':
raise OEBError(
@ -536,6 +562,8 @@ class Manifest(object):
data = self._force_xhtml(data)
elif self.media_type[-4:] in ('+xml', '/xml'):
data = etree.fromstring(data)
elif self.media_type in OEB_STYLES:
data = self.oeb.decode(data)
self._data = data
return data
def fset(self, value):
@ -549,6 +577,8 @@ class Manifest(object):
data = self.data
if isinstance(data, etree._Element):
return xml2str(data)
if isinstance(data, unicode):
return data.encode('utf-8')
return str(data)
def __eq__(self, other):
@ -572,7 +602,9 @@ class Manifest(object):
return cmp(skey, okey)
def relhref(self, href):
if '/' not in self.href or ':' in href:
if urlparse(href).scheme:
return href
if '/' not in self.href:
return href
base = os.path.dirname(self.href).split('/')
target, frag = urldefrag(href)
@ -588,7 +620,12 @@ class Manifest(object):
return relhref
def abshref(self, href):
if '/' not in self.href or ':' in href:
if urlparse(href).scheme:
return href
path, frag = urldefrag(href)
if not path:
return '#'.join((self.href, frag))
if '/' not in self.href:
return href
dirname = os.path.dirname(self.href)
href = os.path.join(dirname, href)
@ -615,13 +652,15 @@ class Manifest(object):
if item in self.oeb.spine:
self.oeb.spine.remove(item)
def generate(self, id, href):
href = urlnormalize(href)
def generate(self, id=None, href=None):
if id is not None:
base = id
index = 1
while id in self.ids:
id = base + str(index)
index += 1
if href is not None:
href = urlnormalize(href)
base, ext = os.path.splitext(href)
index = 1
while href in self.hrefs:
@ -996,13 +1035,11 @@ class OEBBook(object):
metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap)
ignored = (OPF('dc-metadata'), OPF('x-metadata'))
for elem in xpath(opf, 'o2:metadata//*'):
if elem.tag in ignored:
continue
if namespace(elem.tag) in DC_NSES:
tag = barename(elem.tag).lower()
elem.tag = '{%s}%s' % (DC11_NS, tag)
for name in elem.attrib:
if name in ('role', 'file-as', 'scheme', 'event'):
nsname = '{%s}%s' % (OPF2_NS, name)
elem.attrib[nsname] = elem.attrib.pop(name)
metadata.append(elem)
for element in xpath(opf, 'o2:metadata//o2:meta'):
metadata.append(element)
@ -1015,7 +1052,6 @@ class OEBBook(object):
data = self.container.read(opfpath)
data = self.decode(data)
data = XMLDECL_RE.sub('', data)
data = data.replace('\r\n', '\n').replace('\r', '\n')
try:
opf = etree.fromstring(data)
except etree.XMLSyntaxError:
@ -1078,6 +1114,45 @@ class OEBBook(object):
self.logger.warn('Title not specified')
metadata.add('title', self.translate(__('Unknown')))
def _manifest_add_missing(self):
    """Add files that manifest items reference but the manifest omits.

    Manifest items are scanned for references: XML/XHTML items through
    ``LINK_SELECTORS``, stylesheet items through ``CSSURL_RE``. Every
    scheme-less reference that is not yet known is looked up in the
    container. Existing files are added to the manifest under a generated
    id and are themselves scanned on the next pass; missing files only
    produce a warning. The loop ends when a pass adds nothing new.
    """
    manifest = self.manifest
    known = set(manifest.hrefs)
    unchecked = set(manifest.values())
    while unchecked:
        new = set()
        for item in unchecked:
            if (item.media_type in OEB_DOCS or
                item.media_type[-4:] in ('/xml', '+xml')) and \
               item.data is not None:
                hrefs = [sel(item.data) for sel in LINK_SELECTORS]
                candidates = chain(*hrefs)
            elif item.media_type in OEB_STYLES:
                candidates = (match.group('url')
                              for match in CSSURL_RE.finditer(item.data))
            else:
                continue
            for href in candidates:
                href, _ = urldefrag(href)
                # A fragment-only reference points back into the same
                # file; skip it for stylesheets too, otherwise abshref()
                # would turn it into a bogus '<item.href>#' entry.
                if not href:
                    continue
                href = item.abshref(urlnormalize(href))
                if not urlparse(href).scheme and href not in known:
                    new.add(href)
        unchecked.clear()
        for href in new:
            known.add(href)
            if not self.container.exists(href):
                self.logger.warn('Referenced file %r not found' % href)
                continue
            self.logger.warn('Referenced file %r not in manifest' % href)
            item_id, _ = manifest.generate(id='added')
            guessed = mimetypes.guess_type(href)[0]
            media_type = guessed or BINARY_MIME
            # Newly added items may reference further files of their own.
            unchecked.add(manifest.add(item_id, href, media_type))
def _manifest_from_opf(self, opf):
self.manifest = manifest = Manifest(self)
for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
@ -1100,6 +1175,40 @@ class OEBBook(object):
self.logger.warn(u'Duplicate manifest id %r' % id)
id, href = manifest.generate(id, href)
manifest.add(id, href, media_type, fallback)
self._manifest_add_missing()
def _spine_add_extra(self):
    """Append documents reachable by links from the spine as non-linear items.

    Starting from the current spine, ``<a href>`` links in each document's
    body are followed transitively. Every linked manifest item that is an
    OEB document and is in neither the spine nor the set collected so far
    is gathered, then added to the spine in sorted order with
    ``linear=False``. For OPF version 2 and above a warning is logged for
    each item added.
    """
    manifest, spine = self.manifest, self.spine
    link_hrefs = XPath('h:body//h:a/@href')
    extras = set()
    pending = set(spine)
    while pending:
        discovered = set()
        for item in pending:
            # TODO: handle fallback chains
            if item.media_type not in OEB_DOCS:
                continue
            for href in link_hrefs(item.data):
                href = urldefrag(href)[0]
                if not href:
                    continue
                href = item.abshref(urlnormalize(href))
                if href in manifest.hrefs:
                    target = manifest.hrefs[href]
                    if target.media_type in OEB_DOCS and \
                       target not in spine and target not in extras:
                        discovered.add(target)
        extras.update(discovered)
        # Only documents found in this pass need scanning in the next one.
        pending = discovered
    version = int(self.version[0])
    for item in sorted(extras):
        if version >= 2:
            self.logger.warn(
                'Spine-referenced file %r not in spine' % item.href)
        spine.add(item, linear=False)
def _spine_from_opf(self, opf):
self.spine = spine = Spine(self)
@ -1110,16 +1219,9 @@ class OEBBook(object):
continue
item = self.manifest[idref]
spine.add(item, elem.get('linear'))
extras = []
for item in self.manifest.values():
if item.media_type in OEB_DOCS \
and item not in spine:
extras.append(item)
extras.sort()
for item in extras:
spine.add(item, False)
if len(spine) == 0:
raise OEBError("Spine is empty")
self._spine_add_extra()
def _guide_from_opf(self, opf):
self.guide = guide = Guide(self)
@ -1189,12 +1291,11 @@ class OEBBook(object):
href = site.get('href')
if not title or not href:
continue
href = item.abshref(urlnormalize(href))
path, _ = urldefrag(href)
path, _ = urldefrag(urlnormalize(href))
if path not in self.manifest.hrefs:
self.logger.warn('TOC reference %r not found' % href)
continue
id = child.get('id')
id = site.get('id')
toc.add(title, href, id=id)
return True
@ -1217,12 +1318,12 @@ class OEBBook(object):
order = []
for anchor in xpath(html, './/h:a[@href]'):
href = anchor.attrib['href']
href = item.abshref(urlnormalize(href))
path, frag = urldefrag(href)
if not path:
href = '#'.join((itempath, frag))
if path not in self.manifest.hrefs:
continue
title = ' '.join(xpath(anchor, './/text()'))
title = COLLAPSE_RE.sub(' ', title.strip())
href = urlnormalize(href)
if href not in titles:
order.append(href)
titles[href].append(title)
@ -1313,7 +1414,12 @@ class OEBBook(object):
continue
name = COLLAPSE_RE.sub(' ', name.strip())
href = item.abshref(urlnormalize(href))
pages.add(name, href)
type = 'normal'
if not name:
type = 'special'
elif name.lower().strip('ivxlcdm') == '':
type = 'front'
pages.add(name, href, type=type)
return True
def _pages_from_opf(self, opf, item):
@ -1337,8 +1443,10 @@ class OEBBook(object):
if self.metadata.cover:
id = str(self.metadata.cover[0])
item = self.manifest.ids.get(id, None)
if item is not None:
if item is not None and item.media_type in OEB_IMAGES:
return item
else:
self.logger.warn('Invalid cover image @id %r' % id)
hcover = self.spine[0]
if 'cover' in self.guide:
href = self.guide['cover'].href
@ -1376,6 +1484,7 @@ class OEBBook(object):
self.metadata.add('cover', cover.id)
def _all_from_opf(self, opf):
self.version = opf.get('version', '1.2')
self._metadata_from_opf(opf)
self._manifest_from_opf(opf)
self._spine_from_opf(opf)
@ -1408,6 +1517,8 @@ class OEBBook(object):
except UnicodeDecodeError:
pass
data, _ = xml_to_unicode(data)
data = data.replace('\r\n', '\n')
data = data.replace('\r', '\n')
return data
def to_opf1(self):
@ -1447,7 +1558,8 @@ class OEBBook(object):
next += 1
selector = XPath('ncx:content/@src')
for elem in xpath(ncx, '//*[@playOrder and ./ncx:content[@src]]'):
order = playorder[selector(elem)[0]]
href = selector(elem)[0]
order = playorder.get(href, 0)
elem.attrib['playOrder'] = str(order)
return

View File

@ -172,9 +172,8 @@ class Stylizer(object):
if path not in hrefs:
return (None, None)
data = hrefs[path].data
data = self.oeb.decode(data)
data = XHTML_CSS_NAMESPACE + data
return (None, data)
return ('utf-8', data)
def flatten_rule(self, rule, href, index):
results = []

View File

@ -13,13 +13,9 @@ from urlparse import urldefrag
from lxml import etree
import cssutils
from calibre.ebooks.oeb.base import XPNSMAP, CSS_MIME, OEB_DOCS
from calibre.ebooks.oeb.base import LINK_SELECTORS, CSSURL_RE
from calibre.ebooks.oeb.base import urlnormalize
# Compiled XPath selectors for link-bearing attributes, built locally.
# NOTE(review): this rebinds the LINK_SELECTORS name imported from
# calibre.ebooks.oeb.base above -- confirm which set is intended.
LINK_SELECTORS = [
    etree.XPath(expr, namespaces=XPNSMAP)
    for expr in ('//h:link/@href', '//h:img/@src', '//h:object/@data',
                 '//*/@xl:href')
]
class ManifestTrimmer(object):
def transform(self, oeb, context):
oeb.logger.info('Trimming unused files from manifest...')
@ -53,15 +49,13 @@ class ManifestTrimmer(object):
if found not in used:
new.add(found)
elif item.media_type == CSS_MIME:
def replacer(uri):
absuri = item.abshref(urlnormalize(uri))
if absuri in oeb.manifest.hrefs:
for match in CSSURL_RE.finditer(item.data):
href = match.group('url')
href = item.abshref(urlnormalize(href))
if href in oeb.manifest.hrefs:
found = oeb.manifest.hrefs[href]
if found not in used:
new.add(found)
return uri
sheet = cssutils.parseString(item.data, href=item.href)
cssutils.replaceUrls(sheet, replacer)
used.update(new)
unchecked = new
for item in oeb.manifest.values():

View File

@ -840,7 +840,9 @@ def _readUrl(url, fetcher=None, overrideEncoding=None, parentEncoding=None):
try:
# encoding may still be wrong if encoding *is lying*!
if content is not None:
if isinstance(content, unicode):
decodedCssText = content
elif content is not None:
decodedCssText = codecs.lookup("css")[1](content, encoding=encoding)[0]
else:
decodedCssText = None