diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py index de863cca75..0229fd6124 100644 --- a/src/calibre/ebooks/html.py +++ b/src/calibre/ebooks/html.py @@ -9,7 +9,7 @@ directory or zip file. All the action starts in :function:`create_dir`. ''' import sys, re, os, shutil, logging, tempfile, cStringIO, operator, functools -from urlparse import urlparse +from urlparse import urlparse, urlunparse from urllib import unquote from lxml import etree @@ -98,7 +98,8 @@ class Link(object): @classmethod def url_to_local_path(cls, url, base): - path = url.path + path = urlunparse(('', '', url.path, url.params, url.query, '')) + path = unquote(path) if os.path.isabs(path): return path return os.path.abspath(os.path.join(base, path)) @@ -111,11 +112,11 @@ class Link(object): ''' assert isinstance(url, unicode) and isinstance(base, unicode) self.url = url - self.parsed_url = urlparse(unquote(self.url)) + self.parsed_url = urlparse(self.url) self.is_local = self.parsed_url.scheme in ('', 'file') self.is_internal = self.is_local and not bool(self.parsed_url.path) self.path = None - self.fragment = self.parsed_url.fragment + self.fragment = unquote(self.parsed_url.fragment) if self.is_local and not self.is_internal: self.path = self.url_to_local_path(self.parsed_url, base) diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 9ba5d95899..94402ae882 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -154,6 +154,9 @@ def urlquote(href): def urlnormalize(href): parts = urlparse(href) + if not parts.scheme: + path, frag = urldefrag(href) + parts = ('', '', path, '', '', frag) parts = (part.replace('\\', '/') for part in parts) parts = (urlunquote(part) for part in parts) parts = (urlquote(part) for part in parts) @@ -900,9 +903,9 @@ class TOC(object): def to_ncx(self, parent, depth=1): for node in self.nodes: - id = self.id or unicode(uuid.uuid4()) + id = node.id or unicode(uuid.uuid4()) attrib = {'id': id, 'playOrder': '0'} - if self.klass: + if node.klass: attrib['class'] = node.klass point = element(parent, NCX('navPoint'), attrib=attrib) label = etree.SubElement(point, NCX('navLabel')) @@ -1009,13 +1012,16 @@ class OEBBook(object): return nroot def _read_opf(self, opfpath): - opf = self.container.read(opfpath) + data = self.container.read(opfpath) + data = self.decode(data) + data = XMLDECL_RE.sub('', data) + data = data.replace('\r\n', '\n').replace('\r', '\n') try: - opf = etree.fromstring(opf) + opf = etree.fromstring(data) except etree.XMLSyntaxError: repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) - opf = ENTITY_RE.sub(repl, opf) - opf = etree.fromstring(opf) + data = ENTITY_RE.sub(repl, data) + opf = etree.fromstring(data) self.logger.warn('OPF contains invalid HTML named entities') ns = namespace(opf.tag) if ns not in ('', OPF1_NS, OPF2_NS): @@ -1045,7 +1051,7 @@ class OEBBook(object): haveuuid = True if 'id' in ident.attrib: haveid = True - if not haveuuid and haveid: + if not (haveuuid and haveid): bookid = "urn:uuid:%s" % str(uuid.uuid4()) metadata.add('identifier', bookid, id='calibre-uuid') if uid is None: @@ -1232,13 +1238,13 @@ class OEBBook(object): if not item.linear: continue html = item.data title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) - title = COLLAPSE_RE(' ', title.strip()) + title = COLLAPSE_RE.sub(' ', title.strip()) if title: titles.append(title) headers.append('(unlabled)') for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): expr = '/h:html/h:body//h:%s[position()=1]/text()' - header = ''.join(xpath(html % tag, expr)) + header = ''.join(xpath(html, expr % tag)) header = COLLAPSE_RE.sub(' ', header.strip()) if header: headers[-1] = header @@ -1320,7 +1326,7 @@ class OEBBook(object): with TemporaryDirectory('_html_cover') as tdir: writer = DirWriter() writer.dump(self, tdir) - path = os.path.join(tdir, hcover.href) + path = os.path.join(tdir, urlunquote(hcover.href)) renderer = CoverRenderer(path) data = renderer.image_data id, href = self.manifest.generate('cover', 'cover.jpeg')