From 9200e202ce96d60a344ca2d6a3daec7d69e0ba3c Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Mon, 11 Aug 2008 10:01:21 -0400 Subject: [PATCH 1/3] Fix #939 (lit2oeb: generates incorrect image tags in some cases) --- src/calibre/ebooks/lit/reader.py | 48 +++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index d9924a2454..995b698de4 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -107,11 +107,12 @@ class UnBinary(object): AMPERSAND_RE = re.compile( r'&(?!(?:#[0-9]+|#x[0-9a-fA-F]+|[a-zA-Z_:][a-zA-Z0-9.-_:]+);)') - def __init__(self, bin, manifest, map=OPF_MAP): + def __init__(self, bin, path, manifest, map=OPF_MAP): self.manifest = manifest self.tag_map, self.attr_map, self.tag_to_attr_map = map self.opf = map is OPF_MAP self.bin = bin + self.dir = os.path.dirname(path) self.buf = cStringIO.StringIO() self.binary_to_text() self.raw = self.buf.getvalue().lstrip().decode('utf-8') @@ -122,9 +123,19 @@ class UnBinary(object): def item_path(self, internal_id): try: - return self.manifest[internal_id].path + target = self.manifest[internal_id].path except KeyError: return internal_id + if not self.dir: + return target + target = target.split('/') + base = self.dir.split('/') + for index in xrange(min(len(base), len(target))): + if base[index] != target[index]: break + else: + index += 1 + relpath = (['..'] * (len(base) - index)) + target[index:] + return '/'.join(relpath) def __unicode__(self): return self.raw @@ -597,15 +608,16 @@ class LitReader(object): item.path = os.path.basename(item.path) def _read_meta(self): + path = 'content.opf' raw = self.get_file('/meta') try: - xml = OPF_DECL + unicode(UnBinary(raw, self.manifest, OPF_MAP)) + xml = OPF_DECL + unicode(UnBinary(raw, path, self.manifest, OPF_MAP)) except LitError: if 'PENGUIN group' not in raw: raise print "WARNING: attempting PENGUIN malformed OPF fix" raw = raw.replace( 'PENGUIN group', '\x00\x01\x18\x00PENGUIN group', 1) - xml = OPF_DECL + unicode(UnBinary(raw, self.manifest, OPF_MAP)) + xml = OPF_DECL + unicode(UnBinary(raw, path, self.manifest, OPF_MAP)) self.meta = xml def _read_drm(self): @@ -645,13 +657,6 @@ class LitReader(object): key[i % 8] ^= ord(digest[i]) return ''.join(chr(x) for x in key) - def get_markup_file(self, name): - raw = self.get_file(name) - decl, map = (OPF_DECL, OPF_MAP) \ - if name == '/meta' else (HTML_DECL, HTML_MAP) - xml = decl + unicode(UnBinary(raw, self.manifest, map)) - return xml - def get_file(self, name): entry = self.entries[name] if entry.section == 0: @@ -748,6 +753,20 @@ class LitReader(object): raise LitError("Failed to completely decompress section") return ''.join(result) + def get_entry_content(self, entry): + if 'spine' in entry.state: + name = '/'.join(('/data', entry.internal, 'content')) + path = entry.path + raw = self.get_file(name) + decl, map = (OPF_DECL, OPF_MAP) \ + if name == '/meta' else (HTML_DECL, HTML_MAP) + content = decl + unicode(UnBinary(raw, path, self.manifest, map)) + content = content.encode('utf-8') + else: + name = '/'.join(('/data', entry.internal)) + content = self.get_file(name) + return content + def extract_content(self, output_dir=os.getcwdu()): output_dir = os.path.abspath(output_dir) try: @@ -763,12 +782,7 @@ class LitReader(object): path = os.path.join(output_dir, entry.path) self._ensure_dir(path) with open(path, 'wb') as f: - if 'spine' in entry.state: - name = '/'.join(('/data', entry.internal, 'content')) - f.write(self.get_markup_file(name).encode('utf-8')) - else: - name = '/'.join(('/data', entry.internal)) - f.write(self.get_file(name)) + f.write(self.get_entry_content(entry)) def _ensure_dir(self, path): dir = os.path.dirname(path) From 3f6d486fb897710c88a4e58af6f90316d9ba8515 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Mon, 11 Aug 2008 12:11:10 -0400 Subject: [PATCH 2/3] Fix #941 (Raw UTF-8 misinterpreted in HTML with explicitly specified encoding) --- src/calibre/ebooks/lit/reader.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 995b698de4..1376363d45 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -158,7 +158,7 @@ class UnBinary(object): continue elif c == '\v': c = '\n' - self.buf.write(c.encode('utf-8')) + self.buf.write(c.encode('ascii', 'xmlcharrefreplace')) elif state == 'get flags': if oc == 0: @@ -258,7 +258,8 @@ class UnBinary(object): state = 'get attr' elif count > 0: if not in_censorship: - self.buf.write(unicode(c).encode('utf-8')) + self.buf.write(c.encode( + 'ascii', 'xmlcharrefreplace')) count -= 1 if count == 0: if not in_censorship: @@ -310,7 +311,8 @@ class UnBinary(object): path = self.item_path(doc) if m and frag: path += m + frag - self.buf.write((u'"%s"' % path).encode('utf-8')) + self.buf.write((u'"%s"' % path).encode( + 'ascii', 'xmlcharrefreplace')) state = 'get attr' return index From 673ef45d66a28e0ad46e9a67f1fcca6b3fdf0709 Mon Sep 17 00:00:00 2001 From: "Marshall T. Vandegrift" Date: Mon, 11 Aug 2008 12:32:02 -0400 Subject: [PATCH 3/3] Fix #938 (lit2oeb: Crash converting some LIT files) --- src/calibre/ebooks/lit/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/ebooks/lit/reader.py b/src/calibre/ebooks/lit/reader.py index 1376363d45..b2effb60b9 100644 --- a/src/calibre/ebooks/lit/reader.py +++ b/src/calibre/ebooks/lit/reader.py @@ -217,7 +217,7 @@ class UnBinary(object): state = 'get attr length' continue attr = None - if oc in current_map and current_map[oc]: + if current_map and oc in current_map and current_map[oc]: attr = current_map[oc] elif oc in self.attr_map: attr = self.attr_map[oc]