Fix #1715 (LIT: Filename with a semi-colon ";" breaks style sheets.)

2025-07-08 10:44:09 -04:00 · 2009-02-02 16:16:16 -08:00 · 2009-02-02 16:16:16 -08:00 · 2e704247be
commit 2e704247be
parent 1fc09e8267 d4eed478b1
2 changed files with 21 additions and 14 deletions
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -9,7 +9,7 @@ directory or zip file. All the action starts in :function:`create_dir`.
 '''

 import sys, re, os, shutil, logging, tempfile, cStringIO, operator, functools
-from urlparse import urlparse
+from urlparse import urlparse, urlunparse
 from urllib import unquote

 from lxml import etree
@@ -98,7 +98,8 @@ class Link(object):
    
    @classmethod
    def url_to_local_path(cls, url, base):
-        path = url.path
+        path = urlunparse(('', '', url.path, url.params, url.query, ''))
+        path = unquote(path)
        if os.path.isabs(path):
            return path
        return os.path.abspath(os.path.join(base, path))
@@ -111,11 +112,11 @@ class Link(object):
        '''
        assert isinstance(url, unicode) and isinstance(base, unicode)
        self.url         = url
-        self.parsed_url  = urlparse(unquote(self.url))
+        self.parsed_url  = urlparse(self.url)
        self.is_local    = self.parsed_url.scheme in ('', 'file')
        self.is_internal = self.is_local and not bool(self.parsed_url.path)
        self.path        = None
-        self.fragment    = self.parsed_url.fragment 
+        self.fragment    = unquote(self.parsed_url.fragment)
        if self.is_local and not self.is_internal:
            self.path = self.url_to_local_path(self.parsed_url, base)

--- a/src/calibre/ebooks/oeb/base.py
+++ b/src/calibre/ebooks/oeb/base.py
@@ -154,6 +154,9 @@ def urlquote(href):

 def urlnormalize(href):
    parts = urlparse(href)
+    if not parts.scheme:
+        path, frag = urldefrag(href)
+        parts = ('', '', path, '', '', frag)
    parts = (part.replace('\\', '/') for part in parts)
    parts = (urlunquote(part) for part in parts)
    parts = (urlquote(part) for part in parts)
@@ -900,9 +903,9 @@ class TOC(object):
    
    def to_ncx(self, parent, depth=1):
        for node in self.nodes:
-            id = self.id or unicode(uuid.uuid4())
+            id = node.id or unicode(uuid.uuid4())
            attrib = {'id': id, 'playOrder': '0'}
-            if self.klass:
+            if node.klass:
                attrib['class'] = node.klass
            point = element(parent, NCX('navPoint'), attrib=attrib)
            label = etree.SubElement(point, NCX('navLabel'))
@@ -1009,13 +1012,16 @@ class OEBBook(object):
        return nroot
    
    def _read_opf(self, opfpath):
-        opf = self.container.read(opfpath)
+        data = self.container.read(opfpath)
+        data = self.decode(data)
+        data = XMLDECL_RE.sub('', data)
+        data = data.replace('\r\n', '\n').replace('\r', '\n')
        try:
-            opf = etree.fromstring(opf)
+            opf = etree.fromstring(data)
        except etree.XMLSyntaxError:
            repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
-            opf = ENTITY_RE.sub(repl, opf)
-            opf = etree.fromstring(opf)
+            data = ENTITY_RE.sub(repl, data)
+            opf = etree.fromstring(data)
            self.logger.warn('OPF contains invalid HTML named entities')
        ns = namespace(opf.tag)
        if ns not in ('', OPF1_NS, OPF2_NS):
@@ -1045,7 +1051,7 @@ class OEBBook(object):
                haveuuid = True
            if 'id' in ident.attrib:
                haveid = True
-        if not haveuuid and haveid:
+        if not (haveuuid and haveid):
            bookid = "urn:uuid:%s" % str(uuid.uuid4())
            metadata.add('identifier', bookid, id='calibre-uuid')
        if uid is None:
@@ -1232,13 +1238,13 @@ class OEBBook(object):
            if not item.linear: continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
-            title = COLLAPSE_RE(' ', title.strip())
+            title = COLLAPSE_RE.sub(' ', title.strip())
            if title:
                titles.append(title)
            headers.append('(unlabled)')
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
-                header = ''.join(xpath(html % tag, expr))
+                header = ''.join(xpath(html, expr % tag))
                header = COLLAPSE_RE.sub(' ', header.strip())
                if header:
                    headers[-1] = header
@@ -1320,7 +1326,7 @@ class OEBBook(object):
        with TemporaryDirectory('_html_cover') as tdir:
            writer = DirWriter()
            writer.dump(self, tdir)
-            path = os.path.join(tdir, hcover.href)
+            path = os.path.join(tdir, urlunquote(hcover.href))
            renderer = CoverRenderer(path)
            data = renderer.image_data
        id, href = self.manifest.generate('cover', 'cover.jpeg')