Unify handling of URIs/IRIs, storing in encoded, normalized form.

2025-07-09 03:04:10 -04:00 · 2008-12-09 08:02:09 -05:00 · 2008-12-09 08:02:09 -05:00 · 6e24dcddff
commit 6e24dcddff
parent f740d20f32
3 changed files with 62 additions and 30 deletions
--- a/src/calibre/ebooks/lit/oeb.py
+++ b/src/calibre/ebooks/lit/oeb.py
@ -4,7 +4,8 @@ import sys
 from collections import defaultdict
 from types import StringTypes
 from itertools import izip, count
-from urlparse import urldefrag
+from urlparse import urldefrag, urlparse, urlunparse
+from urllib import unquote as urlunquote
 from lxml import etree

 XML_PARSER = etree.XMLParser(
@ -55,6 +56,22 @@ def barename(name):
 def xpath(elem, expr):
    return elem.xpath(expr, namespaces=XPNSMAP)

+URL_UNSAFE = r"""`!@#$%^&*[](){}?+=;:'",<>\| """
+def urlquote(href):
+    result = []
+    for char in href:
+        if char in URL_UNSAFE:
+            char = "%%%02x" % ord(char)
+        result.append(char)
+    return ''.join(result)
+
+def urlnormalize(href):
+    parts = urlparse(href)
+    parts = (part.replace('\\', '/') for part in parts)
+    parts = (urlunquote(part) for part in parts)
+    parts = (urlquote(part) for part in parts)
+    return urlunparse(parts)
+

 class AbstractContainer(object):
    def read_xml(self, path):
@ -68,12 +85,12 @@ class DirContainer(AbstractContainer):

    def read(self, path):
        path = os.path.join(self.rootdir, path)
-        with open(path, 'rb') as f:
+        with open(urlunquote(path), 'rb') as f:
            return f.read()

    def write(self, path, data):
        path = os.path.join(self.rootdir, path)
-        with open(path, 'wb') as f:
+        with open(urlunquote(path), 'wb') as f:
            return f.write(data)


@ -178,7 +195,7 @@ class Metadata(object):
        return elem
        
    def to_opf2(self, parent=None):
-        elem = element(parent, OPF('metadata'), nsmap=self.NSMAP)
+        elem = element(parent, OPF('metadata'), nsmap=self.OPF2_NSMAP)
        for term in self.items:
            for item in self.items[term]:
                item.to_opf2(elem)
@ -189,7 +206,7 @@ class Manifest(object):
    class Item(object):
        def __init__(self, id, href, media_type, loader=str):
            self.id = id
-            self.href = self.path = href.replace('%20', ' ')
+            self.href = self.path = urlnormalize(href)
            self.media_type = media_type
            self.spine_position = None
            self.linear = True
@ -235,8 +252,8 @@ class Manifest(object):

    def add(self, id, href, media_type):
        item = self.Item(id, href, media_type, self.oeb.container.read)
-        self.items[id] = item
-        self.hrefs[href] = item
+        self.items[item.id] = item
+        self.hrefs[item.href] = item
        return item

    def remove(self, id):
@ -331,7 +348,7 @@ class Guide(object):
        def __init__(self, type, title, href):
            self.type = type
            self.title = title
-            self.href = href
+            self.href = urlnormalize(href)

        def __repr__(self):
            return 'Reference(type=%r, title=%r, href=%r)' \
@ -390,7 +407,7 @@ class Guide(object):
 class Toc(object):
    def __init__(self, title=None, href=None, klass=None, id=None):
        self.title = title
-        self.href = href
+        self.href = urlnormalize(href) if href else href
        self.klass = klass
        self.id = id
        self.nodes = []
@ -414,8 +431,8 @@ class Toc(object):

    def to_opf1(self, tour):
        for node in self.nodes:
-            element(tour, 'site',
-                attrib={'title': node.title, 'href': node.href})
+            element(tour, 'site', attrib={
+                'title': node.title, 'href': node.href})
            node.to_opf1(tour)
        return tour
    
@ -431,8 +448,9 @@ class Toc(object):
                point.attrib['id'] = self.id
            label = etree.SubElement(point, NCX('navLabel'))
            etree.SubElement(label, NCX('text')).text = node.title
-            href = node.href if depth > 1 else node.href.split('#', 1)[0]
-            etree.SubElement(point, NCX('content'), attrib={'src': href})
+            href = node.href if depth > 1 else urldefrag(node.href)[0]
+            child = etree.SubElement(point,
+                NCX('content'), attrib={'src': href})
            node.to_ncx(point, playorder, depth+1)
        return parent

@ -490,7 +508,8 @@ class Oeb(object):
        uid = opf.attrib['unique-identifier']
        self.metadata = metadata = Metadata(self)        
        for elem in xpath(opf, '/o2:package/o2:metadata/*'):
-            metadata.add(elem.tag, elem.text, elem.attrib)
+            if elem.text or elem.attrib:
+                metadata.add(elem.tag, elem.text, elem.attrib)
        for item in metadata.identifier:
            if item.id == uid:
                self.uid = item
@ -524,7 +543,7 @@ class Oeb(object):
    def _toc_from_navpoint(self, toc, navpoint):
        children = xpath(navpoint, 'ncx:navPoint')
        for child in children:
-            title = xpath(child, 'ncx:navLabel/ncx:text/text()')[0]
+            title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
            href = xpath(child, 'ncx:content/@src')[0]
            id = child.get('id')
            klass = child.get('class')
@ -564,8 +583,13 @@ class Oeb(object):
        item = self.manifest.hrefs[itempath]
        html = item.data
        if frag:
-            elem = xpath(html, './/*[@id="%s"]' % frag)
-            html = elem[0] if elem else html
+            elems = xpath(html, './/*[@id="%s"]' % frag)
+            if not elems:
+                elems = xpath(html, './/*[@name="%s"]' % frag)
+            elem = elems[0] if elems else html
+            while elem != html and not xpath(elem, './/h:a[@href]'):
+                elem = elem.getparent()
+            html = elem
        titles = defaultdict(list)
        order = []
        for anchor in xpath(html, './/h:a[@href]'):
@ -574,6 +598,7 @@ class Oeb(object):
            if not path:
                href = '#'.join((itempath, frag))
            title = ' '.join(xpath(anchor, './/text()'))
+            href = urlnormalize(href)
            if href not in titles:
                order.append(href)
            titles[href].append(title)
@ -679,10 +704,13 @@ class Oeb(object):
        return {OPF_MIME: ('content.opf', package),
                NCX_MIME: (href, ncx)}

+
 def main(argv=sys.argv):
    for arg in argv[1:]:
        oeb = Oeb(arg)
-        for name, doc in oeb.to_opf2().items():
+        for name, doc in oeb.to_opf1().values():
+            print etree.tostring(doc, pretty_print=True)
+        for name, doc in oeb.to_opf2().values():
            print etree.tostring(doc, pretty_print=True)
    return 0

--- a/src/calibre/ebooks/lit/reader.py
+++ b/src/calibre/ebooks/lit/reader.py
@ -10,10 +10,12 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net> ' \
 import sys, struct, cStringIO, os
 import functools
 import re
+from urlparse import urldefrag
 from lxml import etree
 from calibre.ebooks.lit import LitError
 from calibre.ebooks.lit.maps import OPF_MAP, HTML_MAP
 import calibre.ebooks.lit.mssha1 as mssha1
+from calibre.ebooks.lit.oeb import urlnormalize
 from calibre.ebooks import DRMError
 from calibre import plugins
 lzx, lxzerror = plugins['lzx']
@ -322,12 +324,12 @@ class UnBinary(object):
                href += c
                count -= 1
                if count == 0:
-                    doc, m, frag = href[1:].partition('#')
+                    doc, frag = urldefrag(href[1:])
                    path = self.item_path(doc)
-                    if m and frag:
-                        path += m + frag
-                    self.buf.write((u'"%s"' % path).encode(
-                        'ascii', 'xmlcharrefreplace'))
+                    if frag:
+                        path = '#'.join((path, frag))
+                    path = urlnormalize(path)
+                    self.buf.write((u'"%s"' % path).encode('utf-8'))
                    state = 'get attr'
        return index
    
--- a/src/calibre/ebooks/lit/writer.py
+++ b/src/calibre/ebooks/lit/writer.py
@ -10,11 +10,14 @@ import re
 import copy
 import uuid
 import functools
+from urlparse import urldefrag
+from urllib import unquote as urlunquote
 from lxml import etree
 from calibre.ebooks.lit.reader import msguid, DirectoryEntry
 import calibre.ebooks.lit.maps as maps
 from calibre.ebooks.lit.oeb import CSS_MIME, OPF_MIME
-from calibre.ebooks.lit.oeb import Oeb, namespace, barename
+from calibre.ebooks.lit.oeb import namespace, barename, urlnormalize
+from calibre.ebooks.lit.oeb import Oeb
 from calibre.ebooks.lit.stylizer import Stylizer
 from calibre.ebooks.lit.lzxcomp import Compressor
 import calibre
@ -173,15 +176,13 @@ class ReBinary(object):
        for attr, value in attrib.items():
            attr = prefixname(attr, nsrmap)
            if attr in ('href', 'src'):
-                path, hash, frag = value.partition('#')
-                path = os.path.join(self.dir, path)
-                path = os.path.normpath(path)
-                path = path.replace('\\', '/')
+                value = urlnormalize(value)
+                path, frag = urldefrag(value)
                prefix = unichr(3)
                if path in self.manifest.hrefs:
                    prefix = unichr(2)
                    value = self.manifest.hrefs[path].id
-                    if hash and frag:
+                    if frag:
                        value = '#'.join((value, frag))
                value = prefix + value
            elif attr in ('id', 'name'):
@ -420,7 +421,8 @@ class LitWriter(object):
            items.sort()
            data.write(pack('<I', len(items)))
            for item in items:
-                id, href, media_type = item.id, item.href, item.media_type
+                id, media_type = item.id, item.media_type
+                href = urlunquote(item.href)
                item.offset = offset \
                    if state in ('linear', 'nonlinear') else 0
                data.write(pack('<I', item.offset))