From db3203176e09965dc48ff41f0b7f05d03422abd1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 29 Apr 2012 13:51:09 +0530 Subject: [PATCH] FB2 Input: More robust base64 decoding to handle embedded images that are incorrectly encoded. Fixes #990929 (Private bug) --- .../ebooks/conversion/plugins/fb2_input.py | 19 +++++-- src/calibre/ebooks/fb2/__init__.py | 52 +++++++++++++++++++ src/calibre/ebooks/metadata/fb2.py | 5 +- 3 files changed, 70 insertions(+), 6 deletions(-) diff --git a/src/calibre/ebooks/conversion/plugins/fb2_input.py b/src/calibre/ebooks/conversion/plugins/fb2_input.py index b0d6a8b0ae..e1e619600d 100644 --- a/src/calibre/ebooks/conversion/plugins/fb2_input.py +++ b/src/calibre/ebooks/conversion/plugins/fb2_input.py @@ -5,13 +5,13 @@ __copyright__ = '2008, Anatoly Shipitsin ' Convert .fb2 files to .lrf """ import os, re -from base64 import b64decode from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre import guess_type FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0' + class FB2Input(InputFormatPlugin): name = 'FB2 Input' @@ -41,6 +41,7 @@ class FB2Input(InputFormatPlugin): from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER from calibre.ebooks.chardet import xml_to_unicode NAMESPACES = {'f':FB2NS, 'l':XLINK_NS} + self.log = log log.debug('Parsing XML...') raw = stream.read().replace('\0', '') raw = xml_to_unicode(raw, strip_encoding_pats=True, @@ -123,6 +124,7 @@ class FB2Input(InputFormatPlugin): return os.path.join(os.getcwdu(), u'metadata.opf') def extract_embedded_content(self, doc): + from calibre.ebooks.fb2 import base64_decode self.binary_map = {} for elem in doc.xpath('./*'): if elem.text and 'binary' in elem.tag and elem.attrib.has_key('id'): @@ -130,8 +132,17 @@ class FB2Input(InputFormatPlugin): fname = elem.attrib['id'] ext = ct.rpartition('/')[-1].lower() if ext in ('png', 'jpeg', 'jpg'): - fname += '.' + ext + if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg', + 'png'}: + fname += '.' + ext self.binary_map[elem.get('id')] = fname - data = b64decode(elem.text.strip()) - open(fname, 'wb').write(data) + raw = elem.text.strip() + try: + data = base64_decode(raw) + except TypeError: + self.log.exception('Binary data with id=%s is corrupted, ignoring'%( + elem.get('id'))) + else: + with open(fname, 'wb') as f: + f.write(data) diff --git a/src/calibre/ebooks/fb2/__init__.py b/src/calibre/ebooks/fb2/__init__.py index e69de29bb2..944bfdd054 100644 --- a/src/calibre/ebooks/fb2/__init__.py +++ b/src/calibre/ebooks/fb2/__init__.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2012, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +def base64_decode(raw): + from io import BytesIO + from base64 import b64decode + + # First try the python implementation as it is faster + try: + return b64decode(raw) + except TypeError: + pass + + # Try a more robust version (adapted from FBReader sources) + A, Z, a, z, zero, nine, plus, slash, equal = bytearray(b'AZaz09+/=') + raw = bytearray(raw) + out = BytesIO() + pos = 0 + while pos < len(raw): + tot = 0 + i = 0 + while i < 4 and pos < len(raw): + byt = raw[pos] + pos += 1 + num = 0 + if A <= byt <= Z: + num = byt - A + elif a <= byt <= z: + num = byt - a + 26 + elif zero <= byt <= nine: + num = byt - zero + 52 + else: + num = {plus:62, slash:63, equal:64}.get(byt, None) + if num is None: + # Ignore this byte + continue + tot += num << (6 * (3 - i)) + i += 1 + triple = bytearray(3) + for j in (2, 1, 0): + triple[j] = tot & 0xff + tot >>= 8 + out.write(bytes(triple)) + return out.getvalue() + + diff --git a/src/calibre/ebooks/metadata/fb2.py b/src/calibre/ebooks/metadata/fb2.py index f5ba06e81f..2bcbe931b8 100644 --- a/src/calibre/ebooks/metadata/fb2.py +++ b/src/calibre/ebooks/metadata/fb2.py @@ -8,7 +8,6 @@ __copyright__ = '2011, Roman Mukhin , '\ import os import datetime from functools import partial -from base64 import b64decode from lxml import etree from calibre.utils.date import parse_date from calibre import guess_type, guess_all_extensions, prints, force_unicode @@ -143,6 +142,7 @@ def _parse_cover(root, mi): pass def _parse_cover_data(root, imgid, mi): + from calibre.ebooks.fb2 import base64_decode elm_binary = XPath('//fb2:binary[@id="%s"]'%imgid)(root) if elm_binary: mimetype = elm_binary[0].get('content-type', 'image/jpeg') @@ -156,7 +156,8 @@ def _parse_cover_data(root, imgid, mi): if mime_extensions: pic_data = elm_binary[0].text if pic_data: - mi.cover_data = (mime_extensions[0][1:], b64decode(pic_data)) + mi.cover_data = (mime_extensions[0][1:], + base64_decode(pic_data.strip())) else: prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid) )