Unify handling of URIs/IRIs, storing in encoded, normalized form.

2025-07-09 03:04:10 -04:00 · 2008-12-09 08:02:09 -05:00 · 2008-12-09 08:02:09 -05:00 · 6e24dcddff
commit 6e24dcddff
parent f740d20f32
3 changed files with 62 additions and 30 deletions
--- a/src/calibre/ebooks/lit/oeb.py
+++ b/src/calibre/ebooks/lit/oeb.py
@ -4,7 +4,8 @@ import sys
 from collections import defaultdict
 from types import StringTypes
 from itertools import izip, count
-from urlparse import urldefrag
+from urlparse import urldefrag, urlparse, urlunparse
 from urllib import unquote as urlunquote
 from lxml import etree
 XML_PARSER = etree.XMLParser(
@ -55,6 +56,22 @@ def barename(name):
 def xpath(elem, expr):
    return elem.xpath(expr, namespaces=XPNSMAP)
 URL_UNSAFE = r"""`!@#$%^&*[](){}?+=;:'",<>\| """
 def urlquote(href):
    """Percent-encode every character of *href* that appears in URL_UNSAFE.

    Characters not listed in URL_UNSAFE (for example '/', '.', '-', '_'
    and any non-ASCII character) are copied through unchanged. Each
    escaped character becomes '%' followed by its code point in
    zero-padded lowercase hex.
    """
    return ''.join(
        "%%%02x" % ord(ch) if ch in URL_UNSAFE else ch
        for ch in href)
 def urlnormalize(href):
    """Return *href* in normalized, percent-encoded form.

    The URL is split with urlparse. In every component, backslashes are
    turned into forward slashes, existing percent-escapes are decoded
    with urlunquote, and the result is re-encoded with urlquote. The
    components are then reassembled with urlunparse.
    """
    normalized = []
    for component in urlparse(href):
        # Decode before encoding so already-escaped input is not
        # escaped a second time.
        component = urlunquote(component.replace('\\', '/'))
        normalized.append(urlquote(component))
    return urlunparse(normalized)
 class AbstractContainer(object):
    def read_xml(self, path):
@ -68,12 +85,12 @@ class DirContainer(AbstractContainer):
    def read(self, path):
        path = os.path.join(self.rootdir, path)
-        with open(path, 'rb') as f:
+        with open(urlunquote(path), 'rb') as f:
            return f.read()
    def write(self, path, data):
        path = os.path.join(self.rootdir, path)
-        with open(path, 'wb') as f:
+        with open(urlunquote(path), 'wb') as f:
            return f.write(data)
@ -178,7 +195,7 @@ class Metadata(object):
        return elem
    def to_opf2(self, parent=None):
-        elem = element(parent, OPF('metadata'), nsmap=self.NSMAP)
+        elem = element(parent, OPF('metadata'), nsmap=self.OPF2_NSMAP)
        for term in self.items:
            for item in self.items[term]:
                item.to_opf2(elem)
@ -189,7 +206,7 @@ class Manifest(object):
    class Item(object):
        def __init__(self, id, href, media_type, loader=str):
            self.id = id
-            self.href = self.path = href.replace('%20', ' ')
+            self.href = self.path = urlnormalize(href)
            self.media_type = media_type
            self.spine_position = None
            self.linear = True
@ -235,8 +252,8 @@ class Manifest(object):
    def add(self, id, href, media_type):
        item = self.Item(id, href, media_type, self.oeb.container.read)
-        self.items[id] = item
+        self.items[item.id] = item
-        self.hrefs[href] = item
+        self.hrefs[item.href] = item
        return item
    def remove(self, id):
@ -331,7 +348,7 @@ class Guide(object):
        def __init__(self, type, title, href):
            self.type = type
            self.title = title
-            self.href = href
+            self.href = urlnormalize(href)
        def __repr__(self):
            return 'Reference(type=%r, title=%r, href=%r)' \
@ -390,7 +407,7 @@ class Guide(object):
 class Toc(object):
    def __init__(self, title=None, href=None, klass=None, id=None):
        self.title = title
-        self.href = href
+        self.href = urlnormalize(href) if href else href
        self.klass = klass
        self.id = id
        self.nodes = []
@ -414,8 +431,8 @@ class Toc(object):
    def to_opf1(self, tour):
        for node in self.nodes:
-            element(tour, 'site',
+            element(tour, 'site', attrib={
-                attrib={'title': node.title, 'href': node.href})
+                'title': node.title, 'href': node.href})
            node.to_opf1(tour)
        return tour
@ -431,8 +448,9 @@ class Toc(object):
                point.attrib['id'] = self.id
            label = etree.SubElement(point, NCX('navLabel'))
            etree.SubElement(label, NCX('text')).text = node.title
-            href = node.href if depth > 1 else node.href.split('#', 1)[0]
+            href = node.href if depth > 1 else urldefrag(node.href)[0]
-            etree.SubElement(point, NCX('content'), attrib={'src': href})
+            child = etree.SubElement(point,
                NCX('content'), attrib={'src': href})
            node.to_ncx(point, playorder, depth+1)
        return parent
@ -490,7 +508,8 @@ class Oeb(object):
        uid = opf.attrib['unique-identifier']
        self.metadata = metadata = Metadata(self)        
        for elem in xpath(opf, '/o2:package/o2:metadata/*'):
-            metadata.add(elem.tag, elem.text, elem.attrib)
+            if elem.text or elem.attrib:
                metadata.add(elem.tag, elem.text, elem.attrib)
        for item in metadata.identifier:
            if item.id == uid:
                self.uid = item
@ -524,7 +543,7 @@ class Oeb(object):
    def _toc_from_navpoint(self, toc, navpoint):
        children = xpath(navpoint, 'ncx:navPoint')
        for child in children:
-            title = xpath(child, 'ncx:navLabel/ncx:text/text()')[0]
+            title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
            href = xpath(child, 'ncx:content/@src')[0]
            id = child.get('id')
            klass = child.get('class')
@ -564,8 +583,13 @@ class Oeb(object):
        item = self.manifest.hrefs[itempath]
        html = item.data
        if frag:
-            elem = xpath(html, './/*[@id="%s"]' % frag)
+            elems = xpath(html, './/*[@id="%s"]' % frag)
-            html = elem[0] if elem else html
+            if not elems:
                elems = xpath(html, './/*[@name="%s"]' % frag)
            elem = elems[0] if elems else html
            while elem != html and not xpath(elem, './/h:a[@href]'):
                elem = elem.getparent()
            html = elem
        titles = defaultdict(list)
        order = []
        for anchor in xpath(html, './/h:a[@href]'):
@ -574,6 +598,7 @@ class Oeb(object):
            if not path:
                href = '#'.join((itempath, frag))
            title = ' '.join(xpath(anchor, './/text()'))
            href = urlnormalize(href)
            if href not in titles:
                order.append(href)
            titles[href].append(title)
@ -679,10 +704,13 @@ class Oeb(object):
        return {OPF_MIME: ('content.opf', package),
                NCX_MIME: (href, ncx)}
 def main(argv=sys.argv):
    for arg in argv[1:]:
        oeb = Oeb(arg)
-        for name, doc in oeb.to_opf2().items():
+        for name, doc in oeb.to_opf1().values():
            print etree.tostring(doc, pretty_print=True)
        for name, doc in oeb.to_opf2().values():
            print etree.tostring(doc, pretty_print=True)
    return 0
--- a/src/calibre/ebooks/lit/reader.py
+++ b/src/calibre/ebooks/lit/reader.py
@ -10,10 +10,12 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
 import sys, struct, cStringIO, os
 import functools
 import re
 from urlparse import urldefrag
 from lxml import etree
 from calibre.ebooks.lit import LitError
 from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
 import calibre.ebooks.lit.mssha1 as mssha1
 from calibre.ebooks.lit.oeb import urlnormalize
 from calibre.ebooks import DRMError
 from calibre import plugins
 lzx, lxzerror = plugins['lzx']
@ -322,12 +324,12 @@ class UnBinary(object):
                href += c
                count -= 1
                if count == 0:
-                    doc, m, frag = href[1:].partition('#')
+                    doc, frag = urldefrag(href[1:])
                    path = self.item_path(doc)
-                    if m and frag:
+                    if frag:
-                        path += m + frag
+                        path = '#'.join((path, frag))
-                    self.buf.write((u'"%s"' % path).encode(
+                    path = urlnormalize(path)
-                        'ascii', 'xmlcharrefreplace'))
+                    self.buf.write((u'"%s"' % path).encode('utf-8'))
                    state = 'get attr'
        return index
--- a/src/calibre/ebooks/lit/writer.py
+++ b/src/calibre/ebooks/lit/writer.py
@ -10,11 +10,14 @@ import re
 import copy
 import uuid
 import functools
 from urlparse import urldefrag
 from urllib import unquote as urlunquote
 from lxml import etree
 from calibre.ebooks.lit.reader import msguid, DirectoryEntry
 import calibre.ebooks.lit.maps as maps
 from calibre.ebooks.lit.oeb import CSS_MIME, OPF_MIME
-from calibre.ebooks.lit.oeb import Oeb, namespace, barename
+from calibre.ebooks.lit.oeb import namespace, barename, urlnormalize
 from calibre.ebooks.lit.oeb import Oeb
 from calibre.ebooks.lit.stylizer import Stylizer
 from calibre.ebooks.lit.lzxcomp import Compressor
 import calibre
@ -173,15 +176,13 @@ class ReBinary(object):
        for attr, value in attrib.items():
            attr = prefixname(attr, nsrmap)
            if attr in ('href', 'src'):
-                path, hash, frag = value.partition('#')
+                value = urlnormalize(value)
-                path = os.path.join(self.dir, path)
+                path, frag = urldefrag(value)
                path = os.path.normpath(path)
                path = path.replace('\\', '/')
                prefix = unichr(3)
                if path in self.manifest.hrefs:
                    prefix = unichr(2)
                    value = self.manifest.hrefs[path].id
-                    if hash and frag:
+                    if frag:
                        value = '#'.join((value, frag))
                value = prefix + value
            elif attr in ('id', 'name'):
@ -420,7 +421,8 @@ class LitWriter(object):
            items.sort()
            data.write(pack('<I', len(items)))
            for item in items:
-                id, href, media_type = item.id, item.href, item.media_type
+                id, media_type = item.id, item.media_type
                href = urlunquote(item.href)
                item.offset = offset \
                    if state in ('linear', 'nonlinear') else 0
                data.write(pack('<I', item.offset))