mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Unify handling of URIs/IRIs, storing in encoded, normalized form.
This commit is contained in:
parent
f740d20f32
commit
6e24dcddff
@ -4,7 +4,8 @@ import sys
|
|||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from types import StringTypes
|
from types import StringTypes
|
||||||
from itertools import izip, count
|
from itertools import izip, count
|
||||||
from urlparse import urldefrag
|
from urlparse import urldefrag, urlparse, urlunparse
|
||||||
|
from urllib import unquote as urlunquote
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
XML_PARSER = etree.XMLParser(
|
XML_PARSER = etree.XMLParser(
|
||||||
@ -55,6 +56,22 @@ def barename(name):
|
|||||||
def xpath(elem, expr):
|
def xpath(elem, expr):
|
||||||
return elem.xpath(expr, namespaces=XPNSMAP)
|
return elem.xpath(expr, namespaces=XPNSMAP)
|
||||||
|
|
||||||
|
# Characters that are percent-encoded wherever they occur in an href.
URL_UNSAFE = r"""`!@#$%^&*[](){}?+=;:'",<>\| """


def urlquote(href):
    """Percent-encode the URL-unsafe characters of *href*.

    Every character that appears in URL_UNSAFE is replaced by ``%xx``,
    where ``xx`` is its code point as two lower-case hex digits.  All
    other characters are copied to the result unchanged.
    """
    return ''.join(
        "%%%02x" % ord(ch) if ch in URL_UNSAFE else ch
        for ch in href)
|
||||||
|
|
||||||
|
def urlnormalize(href):
    """Return *href* in encoded, normalized form.

    The href is split with urlparse.  In every component, backslashes are
    turned into forward slashes, existing percent-escapes are decoded with
    urlunquote, and the result is re-encoded with urlquote.  The components
    are then reassembled with urlunparse.
    """
    # Decode before encoding: urlquote escapes '%' itself, so quoting an
    # already-escaped component directly would escape it a second time.
    normalized = [
        urlquote(urlunquote(component.replace('\\', '/')))
        for component in urlparse(href)]
    return urlunparse(normalized)
|
||||||
|
|
||||||
|
|
||||||
class AbstractContainer(object):
|
class AbstractContainer(object):
|
||||||
def read_xml(self, path):
|
def read_xml(self, path):
|
||||||
@ -68,12 +85,12 @@ class DirContainer(AbstractContainer):
|
|||||||
|
|
||||||
def read(self, path):
|
def read(self, path):
|
||||||
path = os.path.join(self.rootdir, path)
|
path = os.path.join(self.rootdir, path)
|
||||||
with open(path, 'rb') as f:
|
with open(urlunquote(path), 'rb') as f:
|
||||||
return f.read()
|
return f.read()
|
||||||
|
|
||||||
def write(self, path, data):
|
def write(self, path, data):
|
||||||
path = os.path.join(self.rootdir, path)
|
path = os.path.join(self.rootdir, path)
|
||||||
with open(path, 'wb') as f:
|
with open(urlunquote(path), 'wb') as f:
|
||||||
return f.write(data)
|
return f.write(data)
|
||||||
|
|
||||||
|
|
||||||
@ -178,7 +195,7 @@ class Metadata(object):
|
|||||||
return elem
|
return elem
|
||||||
|
|
||||||
def to_opf2(self, parent=None):
|
def to_opf2(self, parent=None):
|
||||||
elem = element(parent, OPF('metadata'), nsmap=self.NSMAP)
|
elem = element(parent, OPF('metadata'), nsmap=self.OPF2_NSMAP)
|
||||||
for term in self.items:
|
for term in self.items:
|
||||||
for item in self.items[term]:
|
for item in self.items[term]:
|
||||||
item.to_opf2(elem)
|
item.to_opf2(elem)
|
||||||
@ -189,7 +206,7 @@ class Manifest(object):
|
|||||||
class Item(object):
|
class Item(object):
|
||||||
def __init__(self, id, href, media_type, loader=str):
|
def __init__(self, id, href, media_type, loader=str):
|
||||||
self.id = id
|
self.id = id
|
||||||
self.href = self.path = href.replace('%20', ' ')
|
self.href = self.path = urlnormalize(href)
|
||||||
self.media_type = media_type
|
self.media_type = media_type
|
||||||
self.spine_position = None
|
self.spine_position = None
|
||||||
self.linear = True
|
self.linear = True
|
||||||
@ -235,8 +252,8 @@ class Manifest(object):
|
|||||||
|
|
||||||
def add(self, id, href, media_type):
|
def add(self, id, href, media_type):
|
||||||
item = self.Item(id, href, media_type, self.oeb.container.read)
|
item = self.Item(id, href, media_type, self.oeb.container.read)
|
||||||
self.items[id] = item
|
self.items[item.id] = item
|
||||||
self.hrefs[href] = item
|
self.hrefs[item.href] = item
|
||||||
return item
|
return item
|
||||||
|
|
||||||
def remove(self, id):
|
def remove(self, id):
|
||||||
@ -331,7 +348,7 @@ class Guide(object):
|
|||||||
def __init__(self, type, title, href):
|
def __init__(self, type, title, href):
|
||||||
self.type = type
|
self.type = type
|
||||||
self.title = title
|
self.title = title
|
||||||
self.href = href
|
self.href = urlnormalize(href)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return 'Reference(type=%r, title=%r, href=%r)' \
|
return 'Reference(type=%r, title=%r, href=%r)' \
|
||||||
@ -390,7 +407,7 @@ class Guide(object):
|
|||||||
class Toc(object):
|
class Toc(object):
|
||||||
def __init__(self, title=None, href=None, klass=None, id=None):
|
def __init__(self, title=None, href=None, klass=None, id=None):
|
||||||
self.title = title
|
self.title = title
|
||||||
self.href = href
|
self.href = urlnormalize(href) if href else href
|
||||||
self.klass = klass
|
self.klass = klass
|
||||||
self.id = id
|
self.id = id
|
||||||
self.nodes = []
|
self.nodes = []
|
||||||
@ -414,8 +431,8 @@ class Toc(object):
|
|||||||
|
|
||||||
def to_opf1(self, tour):
|
def to_opf1(self, tour):
|
||||||
for node in self.nodes:
|
for node in self.nodes:
|
||||||
element(tour, 'site',
|
element(tour, 'site', attrib={
|
||||||
attrib={'title': node.title, 'href': node.href})
|
'title': node.title, 'href': node.href})
|
||||||
node.to_opf1(tour)
|
node.to_opf1(tour)
|
||||||
return tour
|
return tour
|
||||||
|
|
||||||
@ -431,8 +448,9 @@ class Toc(object):
|
|||||||
point.attrib['id'] = self.id
|
point.attrib['id'] = self.id
|
||||||
label = etree.SubElement(point, NCX('navLabel'))
|
label = etree.SubElement(point, NCX('navLabel'))
|
||||||
etree.SubElement(label, NCX('text')).text = node.title
|
etree.SubElement(label, NCX('text')).text = node.title
|
||||||
href = node.href if depth > 1 else node.href.split('#', 1)[0]
|
href = node.href if depth > 1 else urldefrag(node.href)[0]
|
||||||
etree.SubElement(point, NCX('content'), attrib={'src': href})
|
child = etree.SubElement(point,
|
||||||
|
NCX('content'), attrib={'src': href})
|
||||||
node.to_ncx(point, playorder, depth+1)
|
node.to_ncx(point, playorder, depth+1)
|
||||||
return parent
|
return parent
|
||||||
|
|
||||||
@ -490,7 +508,8 @@ class Oeb(object):
|
|||||||
uid = opf.attrib['unique-identifier']
|
uid = opf.attrib['unique-identifier']
|
||||||
self.metadata = metadata = Metadata(self)
|
self.metadata = metadata = Metadata(self)
|
||||||
for elem in xpath(opf, '/o2:package/o2:metadata/*'):
|
for elem in xpath(opf, '/o2:package/o2:metadata/*'):
|
||||||
metadata.add(elem.tag, elem.text, elem.attrib)
|
if elem.text or elem.attrib:
|
||||||
|
metadata.add(elem.tag, elem.text, elem.attrib)
|
||||||
for item in metadata.identifier:
|
for item in metadata.identifier:
|
||||||
if item.id == uid:
|
if item.id == uid:
|
||||||
self.uid = item
|
self.uid = item
|
||||||
@ -524,7 +543,7 @@ class Oeb(object):
|
|||||||
def _toc_from_navpoint(self, toc, navpoint):
|
def _toc_from_navpoint(self, toc, navpoint):
|
||||||
children = xpath(navpoint, 'ncx:navPoint')
|
children = xpath(navpoint, 'ncx:navPoint')
|
||||||
for child in children:
|
for child in children:
|
||||||
title = xpath(child, 'ncx:navLabel/ncx:text/text()')[0]
|
title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
|
||||||
href = xpath(child, 'ncx:content/@src')[0]
|
href = xpath(child, 'ncx:content/@src')[0]
|
||||||
id = child.get('id')
|
id = child.get('id')
|
||||||
klass = child.get('class')
|
klass = child.get('class')
|
||||||
@ -564,8 +583,13 @@ class Oeb(object):
|
|||||||
item = self.manifest.hrefs[itempath]
|
item = self.manifest.hrefs[itempath]
|
||||||
html = item.data
|
html = item.data
|
||||||
if frag:
|
if frag:
|
||||||
elem = xpath(html, './/*[@id="%s"]' % frag)
|
elems = xpath(html, './/*[@id="%s"]' % frag)
|
||||||
html = elem[0] if elem else html
|
if not elems:
|
||||||
|
elems = xpath(html, './/*[@name="%s"]' % frag)
|
||||||
|
elem = elems[0] if elems else html
|
||||||
|
while elem != html and not xpath(elem, './/h:a[@href]'):
|
||||||
|
elem = elem.getparent()
|
||||||
|
html = elem
|
||||||
titles = defaultdict(list)
|
titles = defaultdict(list)
|
||||||
order = []
|
order = []
|
||||||
for anchor in xpath(html, './/h:a[@href]'):
|
for anchor in xpath(html, './/h:a[@href]'):
|
||||||
@ -574,6 +598,7 @@ class Oeb(object):
|
|||||||
if not path:
|
if not path:
|
||||||
href = '#'.join((itempath, frag))
|
href = '#'.join((itempath, frag))
|
||||||
title = ' '.join(xpath(anchor, './/text()'))
|
title = ' '.join(xpath(anchor, './/text()'))
|
||||||
|
href = urlnormalize(href)
|
||||||
if href not in titles:
|
if href not in titles:
|
||||||
order.append(href)
|
order.append(href)
|
||||||
titles[href].append(title)
|
titles[href].append(title)
|
||||||
@ -679,10 +704,13 @@ class Oeb(object):
|
|||||||
return {OPF_MIME: ('content.opf', package),
|
return {OPF_MIME: ('content.opf', package),
|
||||||
NCX_MIME: (href, ncx)}
|
NCX_MIME: (href, ncx)}
|
||||||
|
|
||||||
|
|
||||||
def main(argv=sys.argv):
|
def main(argv=sys.argv):
|
||||||
for arg in argv[1:]:
|
for arg in argv[1:]:
|
||||||
oeb = Oeb(arg)
|
oeb = Oeb(arg)
|
||||||
for name, doc in oeb.to_opf2().items():
|
for name, doc in oeb.to_opf1().values():
|
||||||
|
print etree.tostring(doc, pretty_print=True)
|
||||||
|
for name, doc in oeb.to_opf2().values():
|
||||||
print etree.tostring(doc, pretty_print=True)
|
print etree.tostring(doc, pretty_print=True)
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
@ -10,10 +10,12 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
|
|||||||
import sys, struct, cStringIO, os
|
import sys, struct, cStringIO, os
|
||||||
import functools
|
import functools
|
||||||
import re
|
import re
|
||||||
|
from urlparse import urldefrag
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from calibre.ebooks.lit import LitError
|
from calibre.ebooks.lit import LitError
|
||||||
from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
|
from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
|
||||||
import calibre.ebooks.lit.mssha1 as mssha1
|
import calibre.ebooks.lit.mssha1 as mssha1
|
||||||
|
from calibre.ebooks.lit.oeb import urlnormalize
|
||||||
from calibre.ebooks import DRMError
|
from calibre.ebooks import DRMError
|
||||||
from calibre import plugins
|
from calibre import plugins
|
||||||
lzx, lxzerror = plugins['lzx']
|
lzx, lxzerror = plugins['lzx']
|
||||||
@ -322,12 +324,12 @@ class UnBinary(object):
|
|||||||
href += c
|
href += c
|
||||||
count -= 1
|
count -= 1
|
||||||
if count == 0:
|
if count == 0:
|
||||||
doc, m, frag = href[1:].partition('#')
|
doc, frag = urldefrag(href[1:])
|
||||||
path = self.item_path(doc)
|
path = self.item_path(doc)
|
||||||
if m and frag:
|
if frag:
|
||||||
path += m + frag
|
path = '#'.join((path, frag))
|
||||||
self.buf.write((u'"%s"' % path).encode(
|
path = urlnormalize(path)
|
||||||
'ascii', 'xmlcharrefreplace'))
|
self.buf.write((u'"%s"' % path).encode('utf-8'))
|
||||||
state = 'get attr'
|
state = 'get attr'
|
||||||
return index
|
return index
|
||||||
|
|
||||||
|
@ -10,11 +10,14 @@ import re
|
|||||||
import copy
|
import copy
|
||||||
import uuid
|
import uuid
|
||||||
import functools
|
import functools
|
||||||
|
from urlparse import urldefrag
|
||||||
|
from urllib import unquote as urlunquote
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from calibre.ebooks.lit.reader import msguid, DirectoryEntry
|
from calibre.ebooks.lit.reader import msguid, DirectoryEntry
|
||||||
import calibre.ebooks.lit.maps as maps
|
import calibre.ebooks.lit.maps as maps
|
||||||
from calibre.ebooks.lit.oeb import CSS_MIME, OPF_MIME
|
from calibre.ebooks.lit.oeb import CSS_MIME, OPF_MIME
|
||||||
from calibre.ebooks.lit.oeb import Oeb, namespace, barename
|
from calibre.ebooks.lit.oeb import namespace, barename, urlnormalize
|
||||||
|
from calibre.ebooks.lit.oeb import Oeb
|
||||||
from calibre.ebooks.lit.stylizer import Stylizer
|
from calibre.ebooks.lit.stylizer import Stylizer
|
||||||
from calibre.ebooks.lit.lzxcomp import Compressor
|
from calibre.ebooks.lit.lzxcomp import Compressor
|
||||||
import calibre
|
import calibre
|
||||||
@ -173,15 +176,13 @@ class ReBinary(object):
|
|||||||
for attr, value in attrib.items():
|
for attr, value in attrib.items():
|
||||||
attr = prefixname(attr, nsrmap)
|
attr = prefixname(attr, nsrmap)
|
||||||
if attr in ('href', 'src'):
|
if attr in ('href', 'src'):
|
||||||
path, hash, frag = value.partition('#')
|
value = urlnormalize(value)
|
||||||
path = os.path.join(self.dir, path)
|
path, frag = urldefrag(value)
|
||||||
path = os.path.normpath(path)
|
|
||||||
path = path.replace('\\', '/')
|
|
||||||
prefix = unichr(3)
|
prefix = unichr(3)
|
||||||
if path in self.manifest.hrefs:
|
if path in self.manifest.hrefs:
|
||||||
prefix = unichr(2)
|
prefix = unichr(2)
|
||||||
value = self.manifest.hrefs[path].id
|
value = self.manifest.hrefs[path].id
|
||||||
if hash and frag:
|
if frag:
|
||||||
value = '#'.join((value, frag))
|
value = '#'.join((value, frag))
|
||||||
value = prefix + value
|
value = prefix + value
|
||||||
elif attr in ('id', 'name'):
|
elif attr in ('id', 'name'):
|
||||||
@ -420,7 +421,8 @@ class LitWriter(object):
|
|||||||
items.sort()
|
items.sort()
|
||||||
data.write(pack('<I', len(items)))
|
data.write(pack('<I', len(items)))
|
||||||
for item in items:
|
for item in items:
|
||||||
id, href, media_type = item.id, item.href, item.media_type
|
id, media_type = item.id, item.media_type
|
||||||
|
href = urlunquote(item.href)
|
||||||
item.offset = offset \
|
item.offset = offset \
|
||||||
if state in ('linear', 'nonlinear') else 0
|
if state in ('linear', 'nonlinear') else 0
|
||||||
data.write(pack('<I', item.offset))
|
data.write(pack('<I', item.offset))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user