FB2 Input: More robust base64 decoding to handle embedded images that are incorrectly encoded. Fixes #990929 (Private bug)

2025-07-09 03:04:10 -04:00 · 2012-04-29 13:51:09 +05:30 · 2012-04-29 13:51:09 +05:30 · db3203176e
commit db3203176e
parent 0d506a3ee7
3 changed files with 70 additions and 6 deletions
--- a/src/calibre/ebooks/conversion/plugins/fb2_input.py
+++ b/src/calibre/ebooks/conversion/plugins/fb2_input.py
@@ -5,13 +5,13 @@ __copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
 Convert .fb2 files to .lrf
 """
 import os, re
-from base64 import b64decode

 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre import guess_type

 FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'

+
 class FB2Input(InputFormatPlugin):

    name        = 'FB2 Input'
@@ -41,6 +41,7 @@ class FB2Input(InputFormatPlugin):
        from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER
        from calibre.ebooks.chardet import xml_to_unicode
        NAMESPACES = {'f':FB2NS, 'l':XLINK_NS}
+        self.log = log
        log.debug('Parsing XML...')
        raw = stream.read().replace('\0', '')
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
@@ -123,6 +124,7 @@ class FB2Input(InputFormatPlugin):
        return os.path.join(os.getcwdu(), u'metadata.opf')

    def extract_embedded_content(self, doc):
+        from calibre.ebooks.fb2 import base64_decode
        self.binary_map = {}
        for elem in doc.xpath('./*'):
            if elem.text and 'binary' in elem.tag and elem.attrib.has_key('id'):
@@ -130,8 +132,17 @@ class FB2Input(InputFormatPlugin):
                fname = elem.attrib['id']
                ext = ct.rpartition('/')[-1].lower()
                if ext in ('png', 'jpeg', 'jpg'):
-                    fname += '.' + ext
+                    if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg',
+                            'png'}:
+                        fname += '.' + ext
                    self.binary_map[elem.get('id')] = fname
-                data = b64decode(elem.text.strip())
-                open(fname, 'wb').write(data)
+                raw = elem.text.strip()
+                try:
+                    data = base64_decode(raw)
+                except TypeError:
+                    self.log.exception('Binary data with id=%s is corrupted, ignoring'%(
+                        elem.get('id')))
+                else:
+                    with open(fname, 'wb') as f:
+                        f.write(data)

--- a/src/calibre/ebooks/fb2/__init__.py
+++ b/src/calibre/ebooks/fb2/__init__.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+def base64_decode(raw):  # Decode base64 text `raw`; returns the decoded bytes, using a lenient fallback decoder when b64decode() raises TypeError
+    from io import BytesIO
+    from base64 import b64decode
+
+    # Fast path: try the standard library decoder first, as it is faster
+    try:
+        return b64decode(raw)
+    except TypeError:  # NOTE(review): TypeError is how Python 2's b64decode reports bad input; Python 3 raises binascii.Error instead - revisit if this is ever ported
+        pass
+
+    # Slow path: a more tolerant hand-written decoder (adapted from FBReader sources)
+    A, Z, a, z, zero, nine, plus, slash, equal = bytearray(b'AZaz09+/=')  # integer byte values of the alphabet boundaries and special characters
+    raw = bytearray(raw)  # so that indexing yields integer byte values
+    out = BytesIO()
+    pos = 0
+    while pos < len(raw):  # each pass gathers one group of up to four sextets and emits three bytes
+        tot = 0  # accumulator holding the group's sextets packed into 24 bits
+        i = 0  # number of sextets accepted so far in this group
+        while i < 4 and pos < len(raw):
+            byt = raw[pos]
+            pos += 1
+            num = 0
+            if A <= byt <= Z:  # 'A'-'Z' -> 0..25
+                num = byt - A
+            elif a <= byt <= z:  # 'a'-'z' -> 26..51
+                num = byt - a + 26
+            elif zero <= byt <= nine:  # '0'-'9' -> 52..61
+                num = byt - zero + 52
+            else:
+                num = {plus:62, slash:63, equal:64}.get(byt, None)  # NOTE(review): '=' maps to 64, which needs 7 bits, so the += below carries into the neighbouring sextet - confirm this is intended
+                if num is None:
+                    # Not one of the characters handled above: skip it without advancing i
+                    continue
+            tot += num << (6 * (3 - i))  # the first sextet lands in the top 6 of the 24 bits
+            i += 1
+        triple = bytearray(3)
+        for j in (2, 1, 0):  # unpack the 24 bits into three bytes, least significant byte first
+            triple[j] = tot & 0xff
+            tot >>= 8
+        out.write(bytes(triple))  # NOTE(review): always writes three bytes, even when the group was short or contained '=' - output may carry extra trailing bytes
+    return out.getvalue()
+
+
--- a/src/calibre/ebooks/metadata/fb2.py
+++ b/src/calibre/ebooks/metadata/fb2.py
@@ -8,7 +8,6 @@ __copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>, '\
 import os
 import datetime
 from functools import partial
-from base64 import b64decode
 from lxml import etree
 from calibre.utils.date import parse_date
 from calibre import guess_type, guess_all_extensions, prints, force_unicode
@@ -143,6 +142,7 @@ def _parse_cover(root, mi):
            pass

 def _parse_cover_data(root, imgid, mi):
+    from calibre.ebooks.fb2 import base64_decode
    elm_binary = XPath('//fb2:binary[@id="%s"]'%imgid)(root)
    if elm_binary:
        mimetype = elm_binary[0].get('content-type', 'image/jpeg')
@@ -156,7 +156,8 @@ def _parse_cover_data(root, imgid, mi):
        if mime_extensions:
            pic_data = elm_binary[0].text
            if pic_data:
-                mi.cover_data = (mime_extensions[0][1:], b64decode(pic_data))
+                mi.cover_data = (mime_extensions[0][1:],
+                        base64_decode(pic_data.strip()))
        else:
            prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid) )