From db3203176e09965dc48ff41f0b7f05d03422abd1 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sun, 29 Apr 2012 13:51:09 +0530
Subject: [PATCH] FB2 Input: More robust base64 decoding to handle embedded
 images that are incorrectly encoded. Fixes #990929 (Private bug)

---
 .../ebooks/conversion/plugins/fb2_input.py    | 19 +++++--
 src/calibre/ebooks/fb2/__init__.py            | 52 +++++++++++++++++++
 src/calibre/ebooks/metadata/fb2.py            |  5 +-
 3 files changed, 70 insertions(+), 6 deletions(-)
diff --git a/src/calibre/ebooks/conversion/plugins/fb2_input.py b/src/calibre/ebooks/conversion/plugins/fb2_input.py
index b0d6a8b0ae..e1e619600d 100644
--- a/src/calibre/ebooks/conversion/plugins/fb2_input.py
+++ b/src/calibre/ebooks/conversion/plugins/fb2_input.py
@@ -5,13 +5,13 @@ __copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
 Convert .fb2 files to .lrf
 """
 import os, re
-from base64 import b64decode
 
 from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
 from calibre import guess_type
 
 FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'
 
+
 class FB2Input(InputFormatPlugin):
 
     name        = 'FB2 Input'
@@ -41,6 +41,7 @@ class FB2Input(InputFormatPlugin):
         from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER
         from calibre.ebooks.chardet import xml_to_unicode
         NAMESPACES = {'f':FB2NS, 'l':XLINK_NS}
+        self.log = log
         log.debug('Parsing XML...')
         raw = stream.read().replace('\0', '')
         raw = xml_to_unicode(raw, strip_encoding_pats=True,
@@ -123,6 +124,7 @@ class FB2Input(InputFormatPlugin):
         return os.path.join(os.getcwdu(), u'metadata.opf')
 
     def extract_embedded_content(self, doc):
+        from calibre.ebooks.fb2 import base64_decode
         self.binary_map = {}
         for elem in doc.xpath('./*'):
             if elem.text and 'binary' in elem.tag and elem.attrib.has_key('id'):
@@ -130,8 +132,17 @@ class FB2Input(InputFormatPlugin):
                 fname = elem.attrib['id']
                 ext = ct.rpartition('/')[-1].lower()
                 if ext in ('png', 'jpeg', 'jpg'):
-                    fname += '.' + ext
+                    if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg',
+                            'png'}:
+                        fname += '.' + ext
                     self.binary_map[elem.get('id')] = fname
-                data = b64decode(elem.text.strip())
-                open(fname, 'wb').write(data)
+                raw = elem.text.strip()
+                try:
+                    data = base64_decode(raw)
+                except TypeError:
+                    self.log.exception('Binary data with id=%s is corrupted, ignoring'%(
+                        elem.get('id')))
+                else:
+                    with open(fname, 'wb') as f:
+                        f.write(data)
 
diff --git a/src/calibre/ebooks/fb2/__init__.py b/src/calibre/ebooks/fb2/__init__.py
index e69de29bb2..944bfdd054 100644
--- a/src/calibre/ebooks/fb2/__init__.py
+++ b/src/calibre/ebooks/fb2/__init__.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+                        print_function)
+
+__license__   = 'GPL v3'
+__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+def base64_decode(raw):
+    '''
+    Decode base64 data, tolerating the malformed data found in some FB2 files.
+
+    The stdlib decoder is tried first as it is faster. If it rejects the data
+    a lenient decoder (adapted from FBReader sources) is used instead: bytes
+    outside the base64 alphabet are ignored, ``=`` ends the current group and
+    the leftover bits of an incomplete trailing group are dropped instead of
+    being emitted as garbage bytes.
+
+    :param raw: The base64 data, as a bytestring or (unicode) text
+    :return: The decoded bytestring. Corrupt data never raises an error.
+    '''
+    from base64 import b64decode
+    try:
+        return b64decode(raw)
+    except (TypeError, ValueError):
+        # python 2 raises TypeError, python 3 binascii.Error (a ValueError)
+        pass
+
+    if not isinstance(raw, (bytes, bytearray)):
+        # Non ASCII characters are not in the alphabet, so they get skipped
+        raw = raw.encode('utf-8')
+    alphabet = (b'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+                b'0123456789+/')
+    values = {byt:num for num, byt in enumerate(bytearray(alphabet))}
+    pad = ord(b'=')
+    out, acc, nbits = bytearray(), 0, 0
+    for byt in bytearray(raw):
+        if byt == pad:  # Padding: the pending bits are filler, not data
+            acc = nbits = 0
+            continue
+        num = values.get(byt)
+        if num is None:
+            continue  # Ignore this byte
+        acc, nbits = (acc << 6) | num, nbits + 6
+        if nbits >= 8:
+            nbits -= 8
+            out.append((acc >> nbits) & 0xff)
+            acc &= (1 << nbits) - 1
+    return bytes(out)
+
+
diff --git a/src/calibre/ebooks/metadata/fb2.py b/src/calibre/ebooks/metadata/fb2.py
index f5ba06e81f..2bcbe931b8 100644
--- a/src/calibre/ebooks/metadata/fb2.py
+++ b/src/calibre/ebooks/metadata/fb2.py
@@ -8,7 +8,6 @@ __copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>, '\
 import os
 import datetime
 from functools import partial
-from base64 import b64decode
 from lxml import etree
 from calibre.utils.date import parse_date
 from calibre import guess_type, guess_all_extensions, prints, force_unicode
@@ -143,6 +142,7 @@ def _parse_cover(root, mi):
             pass
 
 def _parse_cover_data(root, imgid, mi):
+    from calibre.ebooks.fb2 import base64_decode
     elm_binary = XPath('//fb2:binary[@id="%s"]'%imgid)(root)
     if elm_binary:
         mimetype = elm_binary[0].get('content-type', 'image/jpeg')
@@ -156,7 +156,8 @@ def _parse_cover_data(root, imgid, mi):
         if mime_extensions:
             pic_data = elm_binary[0].text
             if pic_data:
-                mi.cover_data = (mime_extensions[0][1:], b64decode(pic_data))
+                mi.cover_data = (mime_extensions[0][1:],
+                        base64_decode(pic_data.strip()))
         else:
             prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid) )