Unify handling of URIs/IRIs, storing in encoded, normalized form.

This commit is contained in:
Marshall T. Vandegrift 2008-12-09 08:02:09 -05:00
parent f740d20f32
commit 6e24dcddff
3 changed files with 62 additions and 30 deletions

View File

@ -4,7 +4,8 @@ import sys
from collections import defaultdict
from types import StringTypes
from itertools import izip, count
from urlparse import urldefrag
from urlparse import urldefrag, urlparse, urlunparse
from urllib import unquote as urlunquote
from lxml import etree
XML_PARSER = etree.XMLParser(
@ -55,6 +56,22 @@ def barename(name):
def xpath(elem, expr):
    """Evaluate the XPath expression `expr` against `elem`.

    The call is forwarded to `elem.xpath()` with the module-level XPNSMAP
    passed as the `namespaces` mapping, and its result is returned as-is.
    """
    matches = elem.xpath(expr, namespaces=XPNSMAP)
    return matches
# Characters that urlquote() percent-encodes. '%' is itself in the list, so
# a string that already contains escapes must be decoded before quoting.
URL_UNSAFE = r"""`!@#$%^&*[](){}?+=;:'",<>\| """


def urlquote(href):
    """Percent-encode every character of `href` that is in URL_UNSAFE.

    Each listed character is replaced by '%' followed by its two-digit
    lowercase hexadecimal code; all other characters are copied through
    unchanged. Returns the re-joined string.
    """
    return ''.join(
        "%%%02x" % ord(symbol) if symbol in URL_UNSAFE else symbol
        for symbol in href)
def urlnormalize(href):
    """Return `href` in normalized, percent-encoded form.

    The href is split with urlparse() and backslashes are turned into
    forward slashes in every component. The path, params, query and
    fragment are then each unquoted and re-quoted with urlquote();
    decoding first keeps an already-escaped href from being escaped a
    second time, so 'a%20b' and 'a b' normalize to the same string.

    The scheme and network location are reassembled without re-quoting.
    Their delimiters (':' before a port, '@' after user info, '[' ']'
    around an IPv6 literal, '+' in a scheme) are in URL_UNSAFE, and
    escaping them would turn 'http://host:8080/' into
    'http://host%3a8080/'. Hrefs with neither a scheme nor a network
    location are unaffected by this exemption.
    """
    parts = [part.replace('\\', '/') for part in urlparse(href)]
    scheme, netloc = parts[:2]
    # NOTE(review): query delimiters such as '=' and '&' are still escaped
    # here because urlquote() treats them as unsafe -- confirm that is
    # wanted for external links.
    rest = [urlquote(urlunquote(part)) for part in parts[2:]]
    return urlunparse((scheme, netloc) + tuple(rest))
class AbstractContainer(object):
def read_xml(self, path):
@ -68,12 +85,12 @@ class DirContainer(AbstractContainer):
def read(self, path):
path = os.path.join(self.rootdir, path)
with open(path, 'rb') as f:
with open(urlunquote(path), 'rb') as f:
return f.read()
def write(self, path, data):
path = os.path.join(self.rootdir, path)
with open(path, 'wb') as f:
with open(urlunquote(path), 'wb') as f:
return f.write(data)
@ -178,7 +195,7 @@ class Metadata(object):
return elem
def to_opf2(self, parent=None):
elem = element(parent, OPF('metadata'), nsmap=self.NSMAP)
elem = element(parent, OPF('metadata'), nsmap=self.OPF2_NSMAP)
for term in self.items:
for item in self.items[term]:
item.to_opf2(elem)
@ -189,7 +206,7 @@ class Manifest(object):
class Item(object):
def __init__(self, id, href, media_type, loader=str):
self.id = id
self.href = self.path = href.replace('%20', ' ')
self.href = self.path = urlnormalize(href)
self.media_type = media_type
self.spine_position = None
self.linear = True
@ -235,8 +252,8 @@ class Manifest(object):
def add(self, id, href, media_type):
item = self.Item(id, href, media_type, self.oeb.container.read)
self.items[id] = item
self.hrefs[href] = item
self.items[item.id] = item
self.hrefs[item.href] = item
return item
def remove(self, id):
@ -331,7 +348,7 @@ class Guide(object):
def __init__(self, type, title, href):
self.type = type
self.title = title
self.href = href
self.href = urlnormalize(href)
def __repr__(self):
return 'Reference(type=%r, title=%r, href=%r)' \
@ -390,7 +407,7 @@ class Guide(object):
class Toc(object):
def __init__(self, title=None, href=None, klass=None, id=None):
self.title = title
self.href = href
self.href = urlnormalize(href) if href else href
self.klass = klass
self.id = id
self.nodes = []
@ -414,8 +431,8 @@ class Toc(object):
def to_opf1(self, tour):
for node in self.nodes:
element(tour, 'site',
attrib={'title': node.title, 'href': node.href})
element(tour, 'site', attrib={
'title': node.title, 'href': node.href})
node.to_opf1(tour)
return tour
@ -431,8 +448,9 @@ class Toc(object):
point.attrib['id'] = self.id
label = etree.SubElement(point, NCX('navLabel'))
etree.SubElement(label, NCX('text')).text = node.title
href = node.href if depth > 1 else node.href.split('#', 1)[0]
etree.SubElement(point, NCX('content'), attrib={'src': href})
href = node.href if depth > 1 else urldefrag(node.href)[0]
child = etree.SubElement(point,
NCX('content'), attrib={'src': href})
node.to_ncx(point, playorder, depth+1)
return parent
@ -490,7 +508,8 @@ class Oeb(object):
uid = opf.attrib['unique-identifier']
self.metadata = metadata = Metadata(self)
for elem in xpath(opf, '/o2:package/o2:metadata/*'):
metadata.add(elem.tag, elem.text, elem.attrib)
if elem.text or elem.attrib:
metadata.add(elem.tag, elem.text, elem.attrib)
for item in metadata.identifier:
if item.id == uid:
self.uid = item
@ -524,7 +543,7 @@ class Oeb(object):
def _toc_from_navpoint(self, toc, navpoint):
children = xpath(navpoint, 'ncx:navPoint')
for child in children:
title = xpath(child, 'ncx:navLabel/ncx:text/text()')[0]
title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
href = xpath(child, 'ncx:content/@src')[0]
id = child.get('id')
klass = child.get('class')
@ -564,8 +583,13 @@ class Oeb(object):
item = self.manifest.hrefs[itempath]
html = item.data
if frag:
elem = xpath(html, './/*[@id="%s"]' % frag)
html = elem[0] if elem else html
elems = xpath(html, './/*[@id="%s"]' % frag)
if not elems:
elems = xpath(html, './/*[@name="%s"]' % frag)
elem = elems[0] if elems else html
while elem != html and not xpath(elem, './/h:a[@href]'):
elem = elem.getparent()
html = elem
titles = defaultdict(list)
order = []
for anchor in xpath(html, './/h:a[@href]'):
@ -574,6 +598,7 @@ class Oeb(object):
if not path:
href = '#'.join((itempath, frag))
title = ' '.join(xpath(anchor, './/text()'))
href = urlnormalize(href)
if href not in titles:
order.append(href)
titles[href].append(title)
@ -679,10 +704,13 @@ class Oeb(object):
return {OPF_MIME: ('content.opf', package),
NCX_MIME: (href, ncx)}
def main(argv=sys.argv):
for arg in argv[1:]:
oeb = Oeb(arg)
for name, doc in oeb.to_opf2().items():
for name, doc in oeb.to_opf1().values():
print etree.tostring(doc, pretty_print=True)
for name, doc in oeb.to_opf2().values():
print etree.tostring(doc, pretty_print=True)
return 0

View File

@ -10,10 +10,12 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
import sys, struct, cStringIO, os
import functools
import re
from urlparse import urldefrag
from lxml import etree
from calibre.ebooks.lit import LitError
from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
import calibre.ebooks.lit.mssha1 as mssha1
from calibre.ebooks.lit.oeb import urlnormalize
from calibre.ebooks import DRMError
from calibre import plugins
lzx, lxzerror = plugins['lzx']
@ -322,12 +324,12 @@ class UnBinary(object):
href += c
count -= 1
if count == 0:
doc, m, frag = href[1:].partition('#')
doc, frag = urldefrag(href[1:])
path = self.item_path(doc)
if m and frag:
path += m + frag
self.buf.write((u'"%s"' % path).encode(
'ascii', 'xmlcharrefreplace'))
if frag:
path = '#'.join((path, frag))
path = urlnormalize(path)
self.buf.write((u'"%s"' % path).encode('utf-8'))
state = 'get attr'
return index

View File

@ -10,11 +10,14 @@ import re
import copy
import uuid
import functools
from urlparse import urldefrag
from urllib import unquote as urlunquote
from lxml import etree
from calibre.ebooks.lit.reader import msguid, DirectoryEntry
import calibre.ebooks.lit.maps as maps
from calibre.ebooks.lit.oeb import CSS_MIME, OPF_MIME
from calibre.ebooks.lit.oeb import Oeb, namespace, barename
from calibre.ebooks.lit.oeb import namespace, barename, urlnormalize
from calibre.ebooks.lit.oeb import Oeb
from calibre.ebooks.lit.stylizer import Stylizer
from calibre.ebooks.lit.lzxcomp import Compressor
import calibre
@ -173,15 +176,13 @@ class ReBinary(object):
for attr, value in attrib.items():
attr = prefixname(attr, nsrmap)
if attr in ('href', 'src'):
path, hash, frag = value.partition('#')
path = os.path.join(self.dir, path)
path = os.path.normpath(path)
path = path.replace('\\', '/')
value = urlnormalize(value)
path, frag = urldefrag(value)
prefix = unichr(3)
if path in self.manifest.hrefs:
prefix = unichr(2)
value = self.manifest.hrefs[path].id
if hash and frag:
if frag:
value = '#'.join((value, frag))
value = prefix + value
elif attr in ('id', 'name'):
@ -420,7 +421,8 @@ class LitWriter(object):
items.sort()
data.write(pack('<I', len(items)))
for item in items:
id, href, media_type = item.id, item.href, item.media_type
id, media_type = item.id, item.media_type
href = urlunquote(item.href)
item.offset = offset \
if state in ('linear', 'nonlinear') else 0
data.write(pack('<I', item.offset))