mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
Further improve OEBBook handling of bad OPF and strange URIs.
This commit is contained in:
parent
2e704247be
commit
168ef41787
@ -21,6 +21,9 @@ mimetypes.add_type('application/epub+zip', '.epub')
|
|||||||
mimetypes.add_type('text/x-sony-bbeb+xml', '.lrs')
|
mimetypes.add_type('text/x-sony-bbeb+xml', '.lrs')
|
||||||
mimetypes.add_type('application/x-sony-bbeb', '.lrf')
|
mimetypes.add_type('application/x-sony-bbeb', '.lrf')
|
||||||
mimetypes.add_type('application/x-dtbncx+xml', '.ncx')
|
mimetypes.add_type('application/x-dtbncx+xml', '.ncx')
|
||||||
|
mimetypes.add_type('application/adobe-page-template+xml', '.xpgt')
|
||||||
|
mimetypes.add_type('application/x-font-opentype', '.otf')
|
||||||
|
mimetypes.add_type('application/x-font-truetype', '.ttf')
|
||||||
|
|
||||||
def to_unicode(raw, encoding='utf-8', errors='strict'):
|
def to_unicode(raw, encoding='utf-8', errors='strict'):
|
||||||
if isinstance(raw, unicode):
|
if isinstance(raw, unicode):
|
||||||
|
@ -62,6 +62,16 @@ def SVG(name): return '{%s}%s' % (SVG_NS, name)
|
|||||||
def XLINK(name): return '{%s}%s' % (XLINK_NS, name)
|
def XLINK(name): return '{%s}%s' % (XLINK_NS, name)
|
||||||
def CALIBRE(name): return '{%s}%s' % (CALIBRE_NS, name)
|
def CALIBRE(name): return '{%s}%s' % (CALIBRE_NS, name)
|
||||||
|
|
||||||
|
def LINK_SELECTORS():
|
||||||
|
results = []
|
||||||
|
for expr in ('h:head/h:link/@href', 'h:body//h:a/@href',
|
||||||
|
'h:body//h:img/@src', 'h:body//h:object/@data',
|
||||||
|
'h:body//*/@xl:href', '//ncx:content/@src',
|
||||||
|
'o2:page/@href'):
|
||||||
|
results.append(etree.XPath(expr, namespaces=XPNSMAP))
|
||||||
|
return results
|
||||||
|
LINK_SELECTORS = LINK_SELECTORS()
|
||||||
|
|
||||||
EPUB_MIME = 'application/epub+zip'
|
EPUB_MIME = 'application/epub+zip'
|
||||||
XHTML_MIME = 'application/xhtml+xml'
|
XHTML_MIME = 'application/xhtml+xml'
|
||||||
CSS_MIME = 'text/css'
|
CSS_MIME = 'text/css'
|
||||||
@ -89,6 +99,10 @@ COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
|
|||||||
QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$')
|
QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$')
|
||||||
PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
|
PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
|
||||||
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
|
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
|
||||||
|
CSSURL_RE = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''')
|
||||||
|
|
||||||
|
RECOVER_PARSER = etree.XMLParser(recover=True)
|
||||||
|
|
||||||
|
|
||||||
def element(parent, *args, **kwargs):
|
def element(parent, *args, **kwargs):
|
||||||
if parent is not None:
|
if parent is not None:
|
||||||
@ -140,14 +154,17 @@ def xml2str(root):
|
|||||||
return etree.tostring(root, encoding='utf-8', xml_declaration=True)
|
return etree.tostring(root, encoding='utf-8', xml_declaration=True)
|
||||||
|
|
||||||
ASCII_CHARS = set(chr(x) for x in xrange(128))
|
ASCII_CHARS = set(chr(x) for x in xrange(128))
|
||||||
URL_SAFE = set(u'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
UNIBYTE_CHARS = set(chr(x) for x in xrange(256))
|
||||||
u'abcdefghijklmnopqrstuvwxyz'
|
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
||||||
u'0123456789' u'_.-/~')
|
'abcdefghijklmnopqrstuvwxyz'
|
||||||
URL_UNSAFE = ASCII_CHARS - URL_SAFE
|
'0123456789' '_.-/~')
|
||||||
|
URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE]
|
||||||
def urlquote(href):
|
def urlquote(href):
|
||||||
result = []
|
result = []
|
||||||
|
unsafe = 0 if isinstance(href, unicode) else 1
|
||||||
|
unsafe = URL_UNSAFE[unsafe]
|
||||||
for char in href:
|
for char in href:
|
||||||
if char in URL_UNSAFE:
|
if char in unsafe:
|
||||||
char = "%%%02x" % ord(char)
|
char = "%%%02x" % ord(char)
|
||||||
result.append(char)
|
result.append(char)
|
||||||
return ''.join(result)
|
return ''.join(result)
|
||||||
@ -185,7 +202,7 @@ class AbstractContainer(object):
|
|||||||
|
|
||||||
class DirContainer(AbstractContainer):
|
class DirContainer(AbstractContainer):
|
||||||
def __init__(self, rootdir):
|
def __init__(self, rootdir):
|
||||||
self.rootdir = rootdir
|
self.rootdir = unicode(rootdir)
|
||||||
|
|
||||||
def read(self, path):
|
def read(self, path):
|
||||||
path = os.path.join(self.rootdir, path)
|
path = os.path.join(self.rootdir, path)
|
||||||
@ -205,16 +222,23 @@ class DirContainer(AbstractContainer):
|
|||||||
return os.path.isfile(urlunquote(path))
|
return os.path.isfile(urlunquote(path))
|
||||||
|
|
||||||
class DirWriter(object):
|
class DirWriter(object):
|
||||||
def __init__(self, version=2.0):
|
def __init__(self, version='2.0', page_map=False):
|
||||||
self.version = version
|
self.version = version
|
||||||
|
self.page_map = page_map
|
||||||
|
|
||||||
def dump(self, oeb, path):
|
def dump(self, oeb, path):
|
||||||
|
version = int(self.version[0])
|
||||||
if not os.path.isdir(path):
|
if not os.path.isdir(path):
|
||||||
os.mkdir(path)
|
os.mkdir(path)
|
||||||
output = DirContainer(path)
|
output = DirContainer(path)
|
||||||
for item in oeb.manifest.values():
|
for item in oeb.manifest.values():
|
||||||
output.write(item.href, str(item))
|
output.write(item.href, str(item))
|
||||||
metadata = oeb.to_opf2() if self.version == 2 else oeb.to_opf1()
|
if version == 1:
|
||||||
|
metadata = oeb.to_opf1()
|
||||||
|
elif version == 2:
|
||||||
|
metadata = oeb.to_opf2(page_map=self.page_map)
|
||||||
|
else:
|
||||||
|
raise OEBError("Unrecognized OPF version %r" % self.version)
|
||||||
for href, data in metadata.values():
|
for href, data in metadata.values():
|
||||||
output.write(href, xml2str(data))
|
output.write(href, xml2str(data))
|
||||||
return
|
return
|
||||||
@ -455,7 +479,6 @@ class Manifest(object):
|
|||||||
# Convert to Unicode and normalize line endings
|
# Convert to Unicode and normalize line endings
|
||||||
data = self.oeb.decode(data)
|
data = self.oeb.decode(data)
|
||||||
data = XMLDECL_RE.sub('', data)
|
data = XMLDECL_RE.sub('', data)
|
||||||
data = data.replace('\r\n', '\n').replace('\r', '\n')
|
|
||||||
# Handle broken XHTML w/ SVG (ugh)
|
# Handle broken XHTML w/ SVG (ugh)
|
||||||
if 'svg:' in data and SVG_NS not in data:
|
if 'svg:' in data and SVG_NS not in data:
|
||||||
data = data.replace(
|
data = data.replace(
|
||||||
@ -480,7 +503,10 @@ class Manifest(object):
|
|||||||
if elem.text:
|
if elem.text:
|
||||||
elem.text = elem.text.strip('-')
|
elem.text = elem.text.strip('-')
|
||||||
data = etree.tostring(data, encoding=unicode)
|
data = etree.tostring(data, encoding=unicode)
|
||||||
data = etree.fromstring(data)
|
try:
|
||||||
|
data = etree.fromstring(data)
|
||||||
|
except etree.XMLSyntaxError:
|
||||||
|
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
||||||
# Force into the XHTML namespace
|
# Force into the XHTML namespace
|
||||||
if barename(data.tag) != 'html':
|
if barename(data.tag) != 'html':
|
||||||
raise OEBError(
|
raise OEBError(
|
||||||
@ -536,6 +562,8 @@ class Manifest(object):
|
|||||||
data = self._force_xhtml(data)
|
data = self._force_xhtml(data)
|
||||||
elif self.media_type[-4:] in ('+xml', '/xml'):
|
elif self.media_type[-4:] in ('+xml', '/xml'):
|
||||||
data = etree.fromstring(data)
|
data = etree.fromstring(data)
|
||||||
|
elif self.media_type in OEB_STYLES:
|
||||||
|
data = self.oeb.decode(data)
|
||||||
self._data = data
|
self._data = data
|
||||||
return data
|
return data
|
||||||
def fset(self, value):
|
def fset(self, value):
|
||||||
@ -549,6 +577,8 @@ class Manifest(object):
|
|||||||
data = self.data
|
data = self.data
|
||||||
if isinstance(data, etree._Element):
|
if isinstance(data, etree._Element):
|
||||||
return xml2str(data)
|
return xml2str(data)
|
||||||
|
if isinstance(data, unicode):
|
||||||
|
return data.encode('utf-8')
|
||||||
return str(data)
|
return str(data)
|
||||||
|
|
||||||
def __eq__(self, other):
|
def __eq__(self, other):
|
||||||
@ -572,7 +602,9 @@ class Manifest(object):
|
|||||||
return cmp(skey, okey)
|
return cmp(skey, okey)
|
||||||
|
|
||||||
def relhref(self, href):
|
def relhref(self, href):
|
||||||
if '/' not in self.href or ':' in href:
|
if urlparse(href).scheme:
|
||||||
|
return href
|
||||||
|
if '/' not in self.href:
|
||||||
return href
|
return href
|
||||||
base = os.path.dirname(self.href).split('/')
|
base = os.path.dirname(self.href).split('/')
|
||||||
target, frag = urldefrag(href)
|
target, frag = urldefrag(href)
|
||||||
@ -588,7 +620,12 @@ class Manifest(object):
|
|||||||
return relhref
|
return relhref
|
||||||
|
|
||||||
def abshref(self, href):
|
def abshref(self, href):
|
||||||
if '/' not in self.href or ':' in href:
|
if urlparse(href).scheme:
|
||||||
|
return href
|
||||||
|
path, frag = urldefrag(href)
|
||||||
|
if not path:
|
||||||
|
return '#'.join((self.href, frag))
|
||||||
|
if '/' not in self.href:
|
||||||
return href
|
return href
|
||||||
dirname = os.path.dirname(self.href)
|
dirname = os.path.dirname(self.href)
|
||||||
href = os.path.join(dirname, href)
|
href = os.path.join(dirname, href)
|
||||||
@ -615,18 +652,20 @@ class Manifest(object):
|
|||||||
if item in self.oeb.spine:
|
if item in self.oeb.spine:
|
||||||
self.oeb.spine.remove(item)
|
self.oeb.spine.remove(item)
|
||||||
|
|
||||||
def generate(self, id, href):
|
def generate(self, id=None, href=None):
|
||||||
href = urlnormalize(href)
|
if id is not None:
|
||||||
base = id
|
base = id
|
||||||
index = 1
|
index = 1
|
||||||
while id in self.ids:
|
while id in self.ids:
|
||||||
id = base + str(index)
|
id = base + str(index)
|
||||||
index += 1
|
index += 1
|
||||||
base, ext = os.path.splitext(href)
|
if href is not None:
|
||||||
index = 1
|
href = urlnormalize(href)
|
||||||
while href in self.hrefs:
|
base, ext = os.path.splitext(href)
|
||||||
href = base + str(index) + ext
|
index = 1
|
||||||
index += 1
|
while href in self.hrefs:
|
||||||
|
href = base + str(index) + ext
|
||||||
|
index += 1
|
||||||
return id, href
|
return id, href
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
@ -996,13 +1035,11 @@ class OEBBook(object):
|
|||||||
metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap)
|
metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap)
|
||||||
ignored = (OPF('dc-metadata'), OPF('x-metadata'))
|
ignored = (OPF('dc-metadata'), OPF('x-metadata'))
|
||||||
for elem in xpath(opf, 'o2:metadata//*'):
|
for elem in xpath(opf, 'o2:metadata//*'):
|
||||||
|
if elem.tag in ignored:
|
||||||
|
continue
|
||||||
if namespace(elem.tag) in DC_NSES:
|
if namespace(elem.tag) in DC_NSES:
|
||||||
tag = barename(elem.tag).lower()
|
tag = barename(elem.tag).lower()
|
||||||
elem.tag = '{%s}%s' % (DC11_NS, tag)
|
elem.tag = '{%s}%s' % (DC11_NS, tag)
|
||||||
for name in elem.attrib:
|
|
||||||
if name in ('role', 'file-as', 'scheme', 'event'):
|
|
||||||
nsname = '{%s}%s' % (OPF2_NS, name)
|
|
||||||
elem.attrib[nsname] = elem.attrib.pop(name)
|
|
||||||
metadata.append(elem)
|
metadata.append(elem)
|
||||||
for element in xpath(opf, 'o2:metadata//o2:meta'):
|
for element in xpath(opf, 'o2:metadata//o2:meta'):
|
||||||
metadata.append(element)
|
metadata.append(element)
|
||||||
@ -1015,7 +1052,6 @@ class OEBBook(object):
|
|||||||
data = self.container.read(opfpath)
|
data = self.container.read(opfpath)
|
||||||
data = self.decode(data)
|
data = self.decode(data)
|
||||||
data = XMLDECL_RE.sub('', data)
|
data = XMLDECL_RE.sub('', data)
|
||||||
data = data.replace('\r\n', '\n').replace('\r', '\n')
|
|
||||||
try:
|
try:
|
||||||
opf = etree.fromstring(data)
|
opf = etree.fromstring(data)
|
||||||
except etree.XMLSyntaxError:
|
except etree.XMLSyntaxError:
|
||||||
@ -1077,6 +1113,43 @@ class OEBBook(object):
|
|||||||
if not metadata.title:
|
if not metadata.title:
|
||||||
self.logger.warn('Title not specified')
|
self.logger.warn('Title not specified')
|
||||||
metadata.add('title', self.translate(__('Unknown')))
|
metadata.add('title', self.translate(__('Unknown')))
|
||||||
|
|
||||||
|
def _manifest_add_missing(self):
|
||||||
|
manifest = self.manifest
|
||||||
|
unchecked = set(manifest.values())
|
||||||
|
while unchecked:
|
||||||
|
new = set()
|
||||||
|
for item in unchecked:
|
||||||
|
if (item.media_type in OEB_DOCS or
|
||||||
|
item.media_type[-4:] in ('/xml', '+xml')) and \
|
||||||
|
item.data is not None:
|
||||||
|
hrefs = [sel(item.data) for sel in LINK_SELECTORS]
|
||||||
|
for href in chain(*hrefs):
|
||||||
|
href, _ = urldefrag(href)
|
||||||
|
if not href:
|
||||||
|
continue
|
||||||
|
href = item.abshref(urlnormalize(href))
|
||||||
|
scheme = urlparse(href).scheme
|
||||||
|
if not scheme and href not in manifest.hrefs:
|
||||||
|
new.add(href)
|
||||||
|
elif item.media_type == CSS_MIME:
|
||||||
|
for match in CSSURL_RE.finditer(item.data):
|
||||||
|
href = match.group('url')
|
||||||
|
href = item.abshref(urlnormalize(href))
|
||||||
|
scheme = urlparse(href).scheme
|
||||||
|
if not scheme and href not in manifest.hrefs:
|
||||||
|
new.add(href)
|
||||||
|
unchecked.clear()
|
||||||
|
for href in new:
|
||||||
|
if not self.container.exists(href):
|
||||||
|
self.logger.warn('Referenced file %r not found' % href)
|
||||||
|
continue
|
||||||
|
self.logger.warn('Referenced file %r not in manifest' % href)
|
||||||
|
id, _ = manifest.generate(id='added')
|
||||||
|
guessed = mimetypes.guess_type(href)[0]
|
||||||
|
media_type = guessed or BINARY_MIME
|
||||||
|
added = manifest.add(id, href, media_type)
|
||||||
|
unchecked.add(added)
|
||||||
|
|
||||||
def _manifest_from_opf(self, opf):
|
def _manifest_from_opf(self, opf):
|
||||||
self.manifest = manifest = Manifest(self)
|
self.manifest = manifest = Manifest(self)
|
||||||
@ -1100,6 +1173,40 @@ class OEBBook(object):
|
|||||||
self.logger.warn(u'Duplicate manifest id %r' % id)
|
self.logger.warn(u'Duplicate manifest id %r' % id)
|
||||||
id, href = manifest.generate(id, href)
|
id, href = manifest.generate(id, href)
|
||||||
manifest.add(id, href, media_type, fallback)
|
manifest.add(id, href, media_type, fallback)
|
||||||
|
self._manifest_add_missing()
|
||||||
|
|
||||||
|
def _spine_add_extra(self):
|
||||||
|
manifest = self.manifest
|
||||||
|
spine = self.spine
|
||||||
|
unchecked = set(spine)
|
||||||
|
selector = XPath('h:body//h:a/@href')
|
||||||
|
extras = set()
|
||||||
|
while unchecked:
|
||||||
|
new = set()
|
||||||
|
for item in unchecked:
|
||||||
|
if item.media_type not in OEB_DOCS:
|
||||||
|
# TODO: handle fallback chains
|
||||||
|
continue
|
||||||
|
for href in selector(item.data):
|
||||||
|
href, _ = urldefrag(href)
|
||||||
|
if not href:
|
||||||
|
continue
|
||||||
|
href = item.abshref(urlnormalize(href))
|
||||||
|
if href not in manifest.hrefs:
|
||||||
|
continue
|
||||||
|
found = manifest.hrefs[href]
|
||||||
|
if found.media_type not in OEB_DOCS or \
|
||||||
|
found in spine or found in extras:
|
||||||
|
continue
|
||||||
|
new.add(found)
|
||||||
|
extras.update(new)
|
||||||
|
unchecked = new
|
||||||
|
version = int(self.version[0])
|
||||||
|
for item in sorted(extras):
|
||||||
|
if version >= 2:
|
||||||
|
self.logger.warn(
|
||||||
|
'Spine-referenced file %r not in spine' % item.href)
|
||||||
|
spine.add(item, linear=False)
|
||||||
|
|
||||||
def _spine_from_opf(self, opf):
|
def _spine_from_opf(self, opf):
|
||||||
self.spine = spine = Spine(self)
|
self.spine = spine = Spine(self)
|
||||||
@ -1110,16 +1217,9 @@ class OEBBook(object):
|
|||||||
continue
|
continue
|
||||||
item = self.manifest[idref]
|
item = self.manifest[idref]
|
||||||
spine.add(item, elem.get('linear'))
|
spine.add(item, elem.get('linear'))
|
||||||
extras = []
|
|
||||||
for item in self.manifest.values():
|
|
||||||
if item.media_type in OEB_DOCS \
|
|
||||||
and item not in spine:
|
|
||||||
extras.append(item)
|
|
||||||
extras.sort()
|
|
||||||
for item in extras:
|
|
||||||
spine.add(item, False)
|
|
||||||
if len(spine) == 0:
|
if len(spine) == 0:
|
||||||
raise OEBError("Spine is empty")
|
raise OEBError("Spine is empty")
|
||||||
|
self._spine_add_extra()
|
||||||
|
|
||||||
def _guide_from_opf(self, opf):
|
def _guide_from_opf(self, opf):
|
||||||
self.guide = guide = Guide(self)
|
self.guide = guide = Guide(self)
|
||||||
@ -1189,12 +1289,11 @@ class OEBBook(object):
|
|||||||
href = site.get('href')
|
href = site.get('href')
|
||||||
if not title or not href:
|
if not title or not href:
|
||||||
continue
|
continue
|
||||||
href = item.abshref(urlnormalize(href))
|
path, _ = urldefrag(urlnormalize(href))
|
||||||
path, _ = urldefrag(href)
|
|
||||||
if path not in self.manifest.hrefs:
|
if path not in self.manifest.hrefs:
|
||||||
self.logger.warn('TOC reference %r not found' % href)
|
self.logger.warn('TOC reference %r not found' % href)
|
||||||
continue
|
continue
|
||||||
id = child.get('id')
|
id = site.get('id')
|
||||||
toc.add(title, href, id=id)
|
toc.add(title, href, id=id)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@ -1217,12 +1316,12 @@ class OEBBook(object):
|
|||||||
order = []
|
order = []
|
||||||
for anchor in xpath(html, './/h:a[@href]'):
|
for anchor in xpath(html, './/h:a[@href]'):
|
||||||
href = anchor.attrib['href']
|
href = anchor.attrib['href']
|
||||||
|
href = item.abshref(urlnormalize(href))
|
||||||
path, frag = urldefrag(href)
|
path, frag = urldefrag(href)
|
||||||
if not path:
|
if path not in self.manifest.hrefs:
|
||||||
href = '#'.join((itempath, frag))
|
continue
|
||||||
title = ' '.join(xpath(anchor, './/text()'))
|
title = ' '.join(xpath(anchor, './/text()'))
|
||||||
title = COLLAPSE_RE.sub(' ', title.strip())
|
title = COLLAPSE_RE.sub(' ', title.strip())
|
||||||
href = urlnormalize(href)
|
|
||||||
if href not in titles:
|
if href not in titles:
|
||||||
order.append(href)
|
order.append(href)
|
||||||
titles[href].append(title)
|
titles[href].append(title)
|
||||||
@ -1313,7 +1412,12 @@ class OEBBook(object):
|
|||||||
continue
|
continue
|
||||||
name = COLLAPSE_RE.sub(' ', name.strip())
|
name = COLLAPSE_RE.sub(' ', name.strip())
|
||||||
href = item.abshref(urlnormalize(href))
|
href = item.abshref(urlnormalize(href))
|
||||||
pages.add(name, href)
|
type = 'normal'
|
||||||
|
if not name:
|
||||||
|
type = 'special'
|
||||||
|
elif name.lower().strip('ivxlcdm') == '':
|
||||||
|
type = 'front'
|
||||||
|
pages.add(name, href, type=type)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _pages_from_opf(self, opf, item):
|
def _pages_from_opf(self, opf, item):
|
||||||
@ -1376,6 +1480,7 @@ class OEBBook(object):
|
|||||||
self.metadata.add('cover', cover.id)
|
self.metadata.add('cover', cover.id)
|
||||||
|
|
||||||
def _all_from_opf(self, opf):
|
def _all_from_opf(self, opf):
|
||||||
|
self.version = opf.get('version', '1.2')
|
||||||
self._metadata_from_opf(opf)
|
self._metadata_from_opf(opf)
|
||||||
self._manifest_from_opf(opf)
|
self._manifest_from_opf(opf)
|
||||||
self._spine_from_opf(opf)
|
self._spine_from_opf(opf)
|
||||||
@ -1384,7 +1489,7 @@ class OEBBook(object):
|
|||||||
self._toc_from_opf(opf, item)
|
self._toc_from_opf(opf, item)
|
||||||
self._pages_from_opf(opf, item)
|
self._pages_from_opf(opf, item)
|
||||||
self._ensure_cover_image()
|
self._ensure_cover_image()
|
||||||
|
|
||||||
def translate(self, text):
|
def translate(self, text):
|
||||||
lang = str(self.metadata.language[0])
|
lang = str(self.metadata.language[0])
|
||||||
lang = lang.split('-', 1)[0].lower()
|
lang = lang.split('-', 1)[0].lower()
|
||||||
@ -1408,6 +1513,8 @@ class OEBBook(object):
|
|||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
pass
|
pass
|
||||||
data, _ = xml_to_unicode(data)
|
data, _ = xml_to_unicode(data)
|
||||||
|
data = data.replace('\r\n', '\n')
|
||||||
|
data = data.replace('\r', '\n')
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def to_opf1(self):
|
def to_opf1(self):
|
||||||
@ -1447,7 +1554,8 @@ class OEBBook(object):
|
|||||||
next += 1
|
next += 1
|
||||||
selector = XPath('ncx:content/@src')
|
selector = XPath('ncx:content/@src')
|
||||||
for elem in xpath(ncx, '//*[@playOrder and ./ncx:content[@src]]'):
|
for elem in xpath(ncx, '//*[@playOrder and ./ncx:content[@src]]'):
|
||||||
order = playorder[selector(elem)[0]]
|
href = selector(elem)[0]
|
||||||
|
order = playorder.get(href, 0)
|
||||||
elem.attrib['playOrder'] = str(order)
|
elem.attrib['playOrder'] = str(order)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -172,7 +172,6 @@ class Stylizer(object):
|
|||||||
if path not in hrefs:
|
if path not in hrefs:
|
||||||
return (None, None)
|
return (None, None)
|
||||||
data = hrefs[path].data
|
data = hrefs[path].data
|
||||||
data = self.oeb.decode(data)
|
|
||||||
data = XHTML_CSS_NAMESPACE + data
|
data = XHTML_CSS_NAMESPACE + data
|
||||||
return (None, data)
|
return (None, data)
|
||||||
|
|
||||||
|
@ -13,13 +13,9 @@ from urlparse import urldefrag
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
import cssutils
|
import cssutils
|
||||||
from calibre.ebooks.oeb.base import XPNSMAP, CSS_MIME, OEB_DOCS
|
from calibre.ebooks.oeb.base import XPNSMAP, CSS_MIME, OEB_DOCS
|
||||||
|
from calibre.ebooks.oeb.base import LINK_SELECTORS, CSSURL_RE
|
||||||
from calibre.ebooks.oeb.base import urlnormalize
|
from calibre.ebooks.oeb.base import urlnormalize
|
||||||
|
|
||||||
LINK_SELECTORS = []
|
|
||||||
for expr in ('//h:link/@href', '//h:img/@src', '//h:object/@data',
|
|
||||||
'//*/@xl:href'):
|
|
||||||
LINK_SELECTORS.append(etree.XPath(expr, namespaces=XPNSMAP))
|
|
||||||
|
|
||||||
class ManifestTrimmer(object):
|
class ManifestTrimmer(object):
|
||||||
def transform(self, oeb, context):
|
def transform(self, oeb, context):
|
||||||
oeb.logger.info('Trimming unused files from manifest...')
|
oeb.logger.info('Trimming unused files from manifest...')
|
||||||
@ -53,15 +49,13 @@ class ManifestTrimmer(object):
|
|||||||
if found not in used:
|
if found not in used:
|
||||||
new.add(found)
|
new.add(found)
|
||||||
elif item.media_type == CSS_MIME:
|
elif item.media_type == CSS_MIME:
|
||||||
def replacer(uri):
|
for match in CSSURL_RE.finditer(item.data):
|
||||||
absuri = item.abshref(urlnormalize(uri))
|
href = match.group('url')
|
||||||
if absuri in oeb.manifest.hrefs:
|
href = item.abshref(urlnormalize(href))
|
||||||
|
if href in oeb.manifest.hrefs:
|
||||||
found = oeb.manifest.hrefs[href]
|
found = oeb.manifest.hrefs[href]
|
||||||
if found not in used:
|
if found not in used:
|
||||||
new.add(found)
|
new.add(found)
|
||||||
return uri
|
|
||||||
sheet = cssutils.parseString(item.data, href=item.href)
|
|
||||||
cssutils.replaceUrls(sheet, replacer)
|
|
||||||
used.update(new)
|
used.update(new)
|
||||||
unchecked = new
|
unchecked = new
|
||||||
for item in oeb.manifest.values():
|
for item in oeb.manifest.values():
|
||||||
|
Loading…
x
Reference in New Issue
Block a user