Unify handling of URIs/IRIs, storing in encoded, normalized form.

This commit is contained in:
Marshall T. Vandegrift 2008-12-09 08:02:09 -05:00
parent f740d20f32
commit 6e24dcddff
3 changed files with 62 additions and 30 deletions

View File

@ -4,7 +4,8 @@ import sys
from collections import defaultdict from collections import defaultdict
from types import StringTypes from types import StringTypes
from itertools import izip, count from itertools import izip, count
from urlparse import urldefrag from urlparse import urldefrag, urlparse, urlunparse
from urllib import unquote as urlunquote
from lxml import etree from lxml import etree
XML_PARSER = etree.XMLParser( XML_PARSER = etree.XMLParser(
@ -55,6 +56,22 @@ def barename(name):
def xpath(elem, expr): def xpath(elem, expr):
return elem.xpath(expr, namespaces=XPNSMAP) return elem.xpath(expr, namespaces=XPNSMAP)
# Characters that urlquote() replaces with percent escapes.  Note that
# both '%' and '#' are members, so text handed to urlquote() must not
# already contain escapes or a fragment separator that should survive.
URL_UNSAFE = r"""`!@#$%^&*[](){}?+=;:'",<>\| """


def urlquote(href):
    """Percent-encode every URL-unsafe character in *href*.

    Each character that appears in URL_UNSAFE is replaced by '%' followed
    by its two-digit lowercase hexadecimal code point.  All other
    characters, including non-ASCII ones, are copied through unchanged.

    Because '%' itself is in URL_UNSAFE, existing percent escapes are
    escaped again; pass already-unquoted text.

    Returns the quoted string ('' for empty input).
    """
    return ''.join(
        "%%%02x" % ord(char) if char in URL_UNSAFE else char
        for char in href)
def urlnormalize(href):
    """Return *href* in normalized, percent-encoded form.

    *href* is split with urlparse().  The path, params, query and
    fragment components are each rewritten in three steps: backslashes
    become forward slashes, existing percent escapes are decoded with
    urlunquote(), and the result is re-encoded with urlquote().  The
    pieces are then reassembled with urlunparse().

    The scheme and netloc are passed through untouched.  URL_UNSAFE
    contains ':', '@', '[' and ']', which are structural delimiters in
    the authority part (port, userinfo, IPv6 literal); quoting them
    would turn 'host:8080' into 'host%3a8080'.

    *href* must be a string; callers are expected to guard against
    None themselves.
    """
    scheme, netloc, path, params, query, fragment = urlparse(href)
    # NOTE(review): the query is still unquoted and re-quoted as a whole,
    # so its '=' and '&' separators are escaped too -- confirm whether
    # external query strings need to be preserved verbatim.
    path, params, query, fragment = [
        urlquote(urlunquote(part.replace('\\', '/')))
        for part in (path, params, query, fragment)]
    return urlunparse((scheme, netloc, path, params, query, fragment))
class AbstractContainer(object): class AbstractContainer(object):
def read_xml(self, path): def read_xml(self, path):
@ -68,12 +85,12 @@ class DirContainer(AbstractContainer):
def read(self, path): def read(self, path):
path = os.path.join(self.rootdir, path) path = os.path.join(self.rootdir, path)
with open(path, 'rb') as f: with open(urlunquote(path), 'rb') as f:
return f.read() return f.read()
def write(self, path, data): def write(self, path, data):
path = os.path.join(self.rootdir, path) path = os.path.join(self.rootdir, path)
with open(path, 'wb') as f: with open(urlunquote(path), 'wb') as f:
return f.write(data) return f.write(data)
@ -178,7 +195,7 @@ class Metadata(object):
return elem return elem
def to_opf2(self, parent=None): def to_opf2(self, parent=None):
elem = element(parent, OPF('metadata'), nsmap=self.NSMAP) elem = element(parent, OPF('metadata'), nsmap=self.OPF2_NSMAP)
for term in self.items: for term in self.items:
for item in self.items[term]: for item in self.items[term]:
item.to_opf2(elem) item.to_opf2(elem)
@ -189,7 +206,7 @@ class Manifest(object):
class Item(object): class Item(object):
def __init__(self, id, href, media_type, loader=str): def __init__(self, id, href, media_type, loader=str):
self.id = id self.id = id
self.href = self.path = href.replace('%20', ' ') self.href = self.path = urlnormalize(href)
self.media_type = media_type self.media_type = media_type
self.spine_position = None self.spine_position = None
self.linear = True self.linear = True
@ -235,8 +252,8 @@ class Manifest(object):
def add(self, id, href, media_type): def add(self, id, href, media_type):
item = self.Item(id, href, media_type, self.oeb.container.read) item = self.Item(id, href, media_type, self.oeb.container.read)
self.items[id] = item self.items[item.id] = item
self.hrefs[href] = item self.hrefs[item.href] = item
return item return item
def remove(self, id): def remove(self, id):
@ -331,7 +348,7 @@ class Guide(object):
def __init__(self, type, title, href): def __init__(self, type, title, href):
self.type = type self.type = type
self.title = title self.title = title
self.href = href self.href = urlnormalize(href)
def __repr__(self): def __repr__(self):
return 'Reference(type=%r, title=%r, href=%r)' \ return 'Reference(type=%r, title=%r, href=%r)' \
@ -390,7 +407,7 @@ class Guide(object):
class Toc(object): class Toc(object):
def __init__(self, title=None, href=None, klass=None, id=None): def __init__(self, title=None, href=None, klass=None, id=None):
self.title = title self.title = title
self.href = href self.href = urlnormalize(href) if href else href
self.klass = klass self.klass = klass
self.id = id self.id = id
self.nodes = [] self.nodes = []
@ -414,8 +431,8 @@ class Toc(object):
def to_opf1(self, tour): def to_opf1(self, tour):
for node in self.nodes: for node in self.nodes:
element(tour, 'site', element(tour, 'site', attrib={
attrib={'title': node.title, 'href': node.href}) 'title': node.title, 'href': node.href})
node.to_opf1(tour) node.to_opf1(tour)
return tour return tour
@ -431,8 +448,9 @@ class Toc(object):
point.attrib['id'] = self.id point.attrib['id'] = self.id
label = etree.SubElement(point, NCX('navLabel')) label = etree.SubElement(point, NCX('navLabel'))
etree.SubElement(label, NCX('text')).text = node.title etree.SubElement(label, NCX('text')).text = node.title
href = node.href if depth > 1 else node.href.split('#', 1)[0] href = node.href if depth > 1 else urldefrag(node.href)[0]
etree.SubElement(point, NCX('content'), attrib={'src': href}) child = etree.SubElement(point,
NCX('content'), attrib={'src': href})
node.to_ncx(point, playorder, depth+1) node.to_ncx(point, playorder, depth+1)
return parent return parent
@ -490,7 +508,8 @@ class Oeb(object):
uid = opf.attrib['unique-identifier'] uid = opf.attrib['unique-identifier']
self.metadata = metadata = Metadata(self) self.metadata = metadata = Metadata(self)
for elem in xpath(opf, '/o2:package/o2:metadata/*'): for elem in xpath(opf, '/o2:package/o2:metadata/*'):
metadata.add(elem.tag, elem.text, elem.attrib) if elem.text or elem.attrib:
metadata.add(elem.tag, elem.text, elem.attrib)
for item in metadata.identifier: for item in metadata.identifier:
if item.id == uid: if item.id == uid:
self.uid = item self.uid = item
@ -524,7 +543,7 @@ class Oeb(object):
def _toc_from_navpoint(self, toc, navpoint): def _toc_from_navpoint(self, toc, navpoint):
children = xpath(navpoint, 'ncx:navPoint') children = xpath(navpoint, 'ncx:navPoint')
for child in children: for child in children:
title = xpath(child, 'ncx:navLabel/ncx:text/text()')[0] title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
href = xpath(child, 'ncx:content/@src')[0] href = xpath(child, 'ncx:content/@src')[0]
id = child.get('id') id = child.get('id')
klass = child.get('class') klass = child.get('class')
@ -564,8 +583,13 @@ class Oeb(object):
item = self.manifest.hrefs[itempath] item = self.manifest.hrefs[itempath]
html = item.data html = item.data
if frag: if frag:
elem = xpath(html, './/*[@id="%s"]' % frag) elems = xpath(html, './/*[@id="%s"]' % frag)
html = elem[0] if elem else html if not elems:
elems = xpath(html, './/*[@name="%s"]' % frag)
elem = elems[0] if elems else html
while elem != html and not xpath(elem, './/h:a[@href]'):
elem = elem.getparent()
html = elem
titles = defaultdict(list) titles = defaultdict(list)
order = [] order = []
for anchor in xpath(html, './/h:a[@href]'): for anchor in xpath(html, './/h:a[@href]'):
@ -574,6 +598,7 @@ class Oeb(object):
if not path: if not path:
href = '#'.join((itempath, frag)) href = '#'.join((itempath, frag))
title = ' '.join(xpath(anchor, './/text()')) title = ' '.join(xpath(anchor, './/text()'))
href = urlnormalize(href)
if href not in titles: if href not in titles:
order.append(href) order.append(href)
titles[href].append(title) titles[href].append(title)
@ -679,10 +704,13 @@ class Oeb(object):
return {OPF_MIME: ('content.opf', package), return {OPF_MIME: ('content.opf', package),
NCX_MIME: (href, ncx)} NCX_MIME: (href, ncx)}
def main(argv=sys.argv): def main(argv=sys.argv):
for arg in argv[1:]: for arg in argv[1:]:
oeb = Oeb(arg) oeb = Oeb(arg)
for name, doc in oeb.to_opf2().items(): for name, doc in oeb.to_opf1().values():
print etree.tostring(doc, pretty_print=True)
for name, doc in oeb.to_opf2().values():
print etree.tostring(doc, pretty_print=True) print etree.tostring(doc, pretty_print=True)
return 0 return 0

View File

@ -10,10 +10,12 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
import sys, struct, cStringIO, os import sys, struct, cStringIO, os
import functools import functools
import re import re
from urlparse import urldefrag
from lxml import etree from lxml import etree
from calibre.ebooks.lit import LitError from calibre.ebooks.lit import LitError
from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
import calibre.ebooks.lit.mssha1 as mssha1 import calibre.ebooks.lit.mssha1 as mssha1
from calibre.ebooks.lit.oeb import urlnormalize
from calibre.ebooks import DRMError from calibre.ebooks import DRMError
from calibre import plugins from calibre import plugins
lzx, lxzerror = plugins['lzx'] lzx, lxzerror = plugins['lzx']
@ -322,12 +324,12 @@ class UnBinary(object):
href += c href += c
count -= 1 count -= 1
if count == 0: if count == 0:
doc, m, frag = href[1:].partition('#') doc, frag = urldefrag(href[1:])
path = self.item_path(doc) path = self.item_path(doc)
if m and frag: if frag:
path += m + frag path = '#'.join((path, frag))
self.buf.write((u'"%s"' % path).encode( path = urlnormalize(path)
'ascii', 'xmlcharrefreplace')) self.buf.write((u'"%s"' % path).encode('utf-8'))
state = 'get attr' state = 'get attr'
return index return index

View File

@ -10,11 +10,14 @@ import re
import copy import copy
import uuid import uuid
import functools import functools
from urlparse import urldefrag
from urllib import unquote as urlunquote
from lxml import etree from lxml import etree
from calibre.ebooks.lit.reader import msguid, DirectoryEntry from calibre.ebooks.lit.reader import msguid, DirectoryEntry
import calibre.ebooks.lit.maps as maps import calibre.ebooks.lit.maps as maps
from calibre.ebooks.lit.oeb import CSS_MIME, OPF_MIME from calibre.ebooks.lit.oeb import CSS_MIME, OPF_MIME
from calibre.ebooks.lit.oeb import Oeb, namespace, barename from calibre.ebooks.lit.oeb import namespace, barename, urlnormalize
from calibre.ebooks.lit.oeb import Oeb
from calibre.ebooks.lit.stylizer import Stylizer from calibre.ebooks.lit.stylizer import Stylizer
from calibre.ebooks.lit.lzxcomp import Compressor from calibre.ebooks.lit.lzxcomp import Compressor
import calibre import calibre
@ -173,15 +176,13 @@ class ReBinary(object):
for attr, value in attrib.items(): for attr, value in attrib.items():
attr = prefixname(attr, nsrmap) attr = prefixname(attr, nsrmap)
if attr in ('href', 'src'): if attr in ('href', 'src'):
path, hash, frag = value.partition('#') value = urlnormalize(value)
path = os.path.join(self.dir, path) path, frag = urldefrag(value)
path = os.path.normpath(path)
path = path.replace('\\', '/')
prefix = unichr(3) prefix = unichr(3)
if path in self.manifest.hrefs: if path in self.manifest.hrefs:
prefix = unichr(2) prefix = unichr(2)
value = self.manifest.hrefs[path].id value = self.manifest.hrefs[path].id
if hash and frag: if frag:
value = '#'.join((value, frag)) value = '#'.join((value, frag))
value = prefix + value value = prefix + value
elif attr in ('id', 'name'): elif attr in ('id', 'name'):
@ -420,7 +421,8 @@ class LitWriter(object):
items.sort() items.sort()
data.write(pack('<I', len(items))) data.write(pack('<I', len(items)))
for item in items: for item in items:
id, href, media_type = item.id, item.href, item.media_type id, media_type = item.id, item.media_type
href = urlunquote(item.href)
item.offset = offset \ item.offset = offset \
if state in ('linear', 'nonlinear') else 0 if state in ('linear', 'nonlinear') else 0
data.write(pack('<I', item.offset)) data.write(pack('<I', item.offset))