Fix #1715 (LIT: Filename with a semi-colon ";" breaks style sheets.)

This commit is contained in:
Kovid Goyal 2009-02-02 16:16:16 -08:00
commit 2e704247be
2 changed files with 21 additions and 14 deletions

View File

@@ -9,7 +9,7 @@ directory or zip file. All the action starts in :function:`create_dir`.
''' '''
import sys, re, os, shutil, logging, tempfile, cStringIO, operator, functools import sys, re, os, shutil, logging, tempfile, cStringIO, operator, functools
from urlparse import urlparse from urlparse import urlparse, urlunparse
from urllib import unquote from urllib import unquote
from lxml import etree from lxml import etree
@@ -98,7 +98,8 @@ class Link(object):
@classmethod
def url_to_local_path(cls, url, base):
    '''
    Turn a parsed URL into an absolute path on the local filesystem.

    :param url: Parsed URL object exposing ``path``, ``params`` and
                ``query`` attributes (the result of :func:`urlparse`).
    :param base: Directory that a relative path is resolved against.
    :return: The unquoted path itself when it is already absolute,
             otherwise the absolute form of ``base`` joined with it.
    '''
    # Glue path, params and query back together so that a ';' or '?'
    # occurring in a file name stays part of the path; the scheme,
    # netloc and fragment slots are deliberately left empty.
    pieces = ('', '', url.path, url.params, url.query, '')
    local = unquote(urlunparse(pieces))
    if not os.path.isabs(local):
        local = os.path.abspath(os.path.join(base, local))
    return local
@@ -111,11 +112,11 @@ class Link(object):
''' '''
assert isinstance(url, unicode) and isinstance(base, unicode) assert isinstance(url, unicode) and isinstance(base, unicode)
self.url = url self.url = url
self.parsed_url = urlparse(unquote(self.url)) self.parsed_url = urlparse(self.url)
self.is_local = self.parsed_url.scheme in ('', 'file') self.is_local = self.parsed_url.scheme in ('', 'file')
self.is_internal = self.is_local and not bool(self.parsed_url.path) self.is_internal = self.is_local and not bool(self.parsed_url.path)
self.path = None self.path = None
self.fragment = self.parsed_url.fragment self.fragment = unquote(self.parsed_url.fragment)
if self.is_local and not self.is_internal: if self.is_local and not self.is_internal:
self.path = self.url_to_local_path(self.parsed_url, base) self.path = self.url_to_local_path(self.parsed_url, base)

View File

@@ -154,6 +154,9 @@ def urlquote(href):
def urlnormalize(href): def urlnormalize(href):
parts = urlparse(href) parts = urlparse(href)
if not parts.scheme:
path, frag = urldefrag(href)
parts = ('', '', path, '', '', frag)
parts = (part.replace('\\', '/') for part in parts) parts = (part.replace('\\', '/') for part in parts)
parts = (urlunquote(part) for part in parts) parts = (urlunquote(part) for part in parts)
parts = (urlquote(part) for part in parts) parts = (urlquote(part) for part in parts)
@@ -900,9 +903,9 @@ class TOC(object):
def to_ncx(self, parent, depth=1): def to_ncx(self, parent, depth=1):
for node in self.nodes: for node in self.nodes:
id = self.id or unicode(uuid.uuid4()) id = node.id or unicode(uuid.uuid4())
attrib = {'id': id, 'playOrder': '0'} attrib = {'id': id, 'playOrder': '0'}
if self.klass: if node.klass:
attrib['class'] = node.klass attrib['class'] = node.klass
point = element(parent, NCX('navPoint'), attrib=attrib) point = element(parent, NCX('navPoint'), attrib=attrib)
label = etree.SubElement(point, NCX('navLabel')) label = etree.SubElement(point, NCX('navLabel'))
@@ -1009,13 +1012,16 @@ class OEBBook(object):
return nroot return nroot
def _read_opf(self, opfpath): def _read_opf(self, opfpath):
opf = self.container.read(opfpath) data = self.container.read(opfpath)
data = self.decode(data)
data = XMLDECL_RE.sub('', data)
data = data.replace('\r\n', '\n').replace('\r', '\n')
try: try:
opf = etree.fromstring(opf) opf = etree.fromstring(data)
except etree.XMLSyntaxError: except etree.XMLSyntaxError:
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0)) repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
opf = ENTITY_RE.sub(repl, opf) data = ENTITY_RE.sub(repl, data)
opf = etree.fromstring(opf) opf = etree.fromstring(data)
self.logger.warn('OPF contains invalid HTML named entities') self.logger.warn('OPF contains invalid HTML named entities')
ns = namespace(opf.tag) ns = namespace(opf.tag)
if ns not in ('', OPF1_NS, OPF2_NS): if ns not in ('', OPF1_NS, OPF2_NS):
@@ -1045,7 +1051,7 @@ class OEBBook(object):
haveuuid = True haveuuid = True
if 'id' in ident.attrib: if 'id' in ident.attrib:
haveid = True haveid = True
if not haveuuid and haveid: if not (haveuuid and haveid):
bookid = "urn:uuid:%s" % str(uuid.uuid4()) bookid = "urn:uuid:%s" % str(uuid.uuid4())
metadata.add('identifier', bookid, id='calibre-uuid') metadata.add('identifier', bookid, id='calibre-uuid')
if uid is None: if uid is None:
@@ -1232,13 +1238,13 @@ class OEBBook(object):
if not item.linear: continue if not item.linear: continue
html = item.data html = item.data
title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
title = COLLAPSE_RE(' ', title.strip()) title = COLLAPSE_RE.sub(' ', title.strip())
if title: if title:
titles.append(title) titles.append(title)
headers.append('(unlabled)') headers.append('(unlabled)')
for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
expr = '/h:html/h:body//h:%s[position()=1]/text()' expr = '/h:html/h:body//h:%s[position()=1]/text()'
header = ''.join(xpath(html % tag, expr)) header = ''.join(xpath(html, expr % tag))
header = COLLAPSE_RE.sub(' ', header.strip()) header = COLLAPSE_RE.sub(' ', header.strip())
if header: if header:
headers[-1] = header headers[-1] = header
@@ -1320,7 +1326,7 @@ class OEBBook(object):
with TemporaryDirectory('_html_cover') as tdir: with TemporaryDirectory('_html_cover') as tdir:
writer = DirWriter() writer = DirWriter()
writer.dump(self, tdir) writer.dump(self, tdir)
path = os.path.join(tdir, hcover.href) path = os.path.join(tdir, urlunquote(hcover.href))
renderer = CoverRenderer(path) renderer = CoverRenderer(path)
data = renderer.image_data data = renderer.image_data
id, href = self.manifest.generate('cover', 'cover.jpeg') id, href = self.manifest.generate('cover', 'cover.jpeg')