FB2 Input: More robust base64 decoding to handle embedded images that are incorrectly encoded. Fixes #990929 (Private bug)

2025-07-09 03:04:10 -04:00 · 2012-04-29 13:51:09 +05:30 · 2012-04-29 13:51:09 +05:30 · db3203176e
commit db3203176e
parent 0d506a3ee7
3 changed files with 70 additions and 6 deletions
--- a/src/calibre/ebooks/conversion/plugins/fb2_input.py
+++ b/src/calibre/ebooks/conversion/plugins/fb2_input.py
@ -5,13 +5,13 @@ __copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
 Convert .fb2 files to .lrf
 """
 import os, re
 from base64 import b64decode
 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre import guess_type
 FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'
 class FB2Input(InputFormatPlugin):
    name        = 'FB2 Input'
@ -41,6 +41,7 @@ class FB2Input(InputFormatPlugin):
        from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER
        from calibre.ebooks.chardet import xml_to_unicode
        NAMESPACES = {'f':FB2NS, 'l':XLINK_NS}
        self.log = log
        log.debug('Parsing XML...')
        raw = stream.read().replace('\0', '')
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
@ -123,6 +124,7 @@ class FB2Input(InputFormatPlugin):
        return os.path.join(os.getcwdu(), u'metadata.opf')
    def extract_embedded_content(self, doc):
        from calibre.ebooks.fb2 import base64_decode
        self.binary_map = {}
        for elem in doc.xpath('./*'):
            if elem.text and 'binary' in elem.tag and elem.attrib.has_key('id'):
@ -130,8 +132,17 @@ class FB2Input(InputFormatPlugin):
                fname = elem.attrib['id']
                ext = ct.rpartition('/')[-1].lower()
                if ext in ('png', 'jpeg', 'jpg'):
                    if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg',
                            'png'}:
                        fname += '.' + ext
                    self.binary_map[elem.get('id')] = fname
-                data = b64decode(elem.text.strip())
+                raw = elem.text.strip()
-                open(fname, 'wb').write(data)
+                try:
                    data = base64_decode(raw)
                except TypeError:
                    self.log.exception('Binary data with id=%s is corrupted, ignoring'%(
                        elem.get('id')))
                else:
                    with open(fname, 'wb') as f:
                        f.write(data)
--- a/src/calibre/ebooks/fb2/__init__.py
+++ b/src/calibre/ebooks/fb2/__init__.py
@ -0,0 +1,52 @@
 #!/usr/bin/env python
 # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 def base64_decode(raw):
    """Decode base64 data, tolerating the malformed input seen in FB2 files.

    The standard library decoder is tried first. If it rejects the input,
    a lenient decoder (adapted from FBReader sources) is used: bytes outside
    the base64 alphabet are skipped and missing trailing padding is
    tolerated.

    :param raw: The base64 text, as a byte string or a text string.
        Non-ASCII characters in a text string are skipped like any other
        invalid byte.
    :return: The decoded data as a byte string. Only bytes that are fully
        determined by the input are returned; padding never contributes
        output bytes.
    :raises TypeError: If ``raw`` is neither bytes-like nor a string.
    """
    from base64 import b64decode

    # First try the python implementation as it is faster
    try:
        return b64decode(raw)
    except (TypeError, ValueError):
        # Python 2 reports malformed input as TypeError, Python 3 as
        # binascii.Error, which is a ValueError subclass.
        pass

    # Try a more robust version (adapted from FBReader sources)
    try:
        data = bytearray(raw)
    except TypeError:
        # Text strings cannot be turned into a bytearray without an
        # encoding; anything that is not a string is a genuine type error.
        if not hasattr(raw, 'encode'):
            raise
        data = bytearray(raw.encode('ascii', 'ignore'))

    # Appending three padding bytes guarantees that a trailing incomplete
    # group is completed and flushed by the loop below, while an empty
    # trailing group never reaches four symbols and so emits nothing.
    data += b'==='

    A, Z, a, z, zero, nine, plus, slash, equal = bytearray(b'AZaz09+/=')
    out = bytearray()
    tot = count = pad = 0
    for byt in data:
        if A <= byt <= Z:
            num = byt - A
        elif a <= byt <= z:
            num = byt - a + 26
        elif zero <= byt <= nine:
            num = byt - zero + 52
        elif byt == plus:
            num = 62
        elif byt == slash:
            num = 63
        elif byt == equal:
            # Padding occupies a slot in the group but carries no bits
            num = 0
            pad += 1
        else:
            # Ignore this byte
            continue
        tot = (tot << 6) | num
        count += 1
        if count == 4:
            # Each padding symbol removes one byte from the 3-byte group
            keep = 3 - pad
            if keep > 0:
                triple = bytearray((
                    (tot >> 16) & 0xff, (tot >> 8) & 0xff, tot & 0xff))
                out.extend(triple[:keep])
            tot = count = pad = 0
    return bytes(out)
--- a/src/calibre/ebooks/metadata/fb2.py
+++ b/src/calibre/ebooks/metadata/fb2.py
@ -8,7 +8,6 @@ __copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>, '\
 import os
 import datetime
 from functools import partial
 from base64 import b64decode
 from lxml import etree
 from calibre.utils.date import parse_date
 from calibre import guess_type, guess_all_extensions, prints, force_unicode
@ -143,6 +142,7 @@ def _parse_cover(root, mi):
            pass
 def _parse_cover_data(root, imgid, mi):
    from calibre.ebooks.fb2 import base64_decode
    elm_binary = XPath('//fb2:binary[@id="%s"]'%imgid)(root)
    if elm_binary:
        mimetype = elm_binary[0].get('content-type', 'image/jpeg')
@ -156,7 +156,8 @@ def _parse_cover_data(root, imgid, mi):
        if mime_extensions:
            pic_data = elm_binary[0].text
            if pic_data:
-                mi.cover_data = (mime_extensions[0][1:], b64decode(pic_data))
+                mi.cover_data = (mime_extensions[0][1:],
                        base64_decode(pic_data.strip()))
        else:
            prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid) )