FB2 Input: More robust base64 decoding to handle embedded images that are incorrectly encoded. Fixes #990929 (Private bug)

This commit is contained in:
Kovid Goyal 2012-04-29 13:51:09 +05:30
parent 0d506a3ee7
commit db3203176e
3 changed files with 70 additions and 6 deletions

View File

@ -5,13 +5,13 @@ __copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
Convert .fb2 files to .lrf
"""
import os, re
from base64 import b64decode
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre import guess_type
FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'
class FB2Input(InputFormatPlugin):
name = 'FB2 Input'
@ -41,6 +41,7 @@ class FB2Input(InputFormatPlugin):
from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER
from calibre.ebooks.chardet import xml_to_unicode
NAMESPACES = {'f':FB2NS, 'l':XLINK_NS}
self.log = log
log.debug('Parsing XML...')
raw = stream.read().replace('\0', '')
raw = xml_to_unicode(raw, strip_encoding_pats=True,
@ -123,6 +124,7 @@ class FB2Input(InputFormatPlugin):
return os.path.join(os.getcwdu(), u'metadata.opf')
def extract_embedded_content(self, doc):
from calibre.ebooks.fb2 import base64_decode
self.binary_map = {}
for elem in doc.xpath('./*'):
if elem.text and 'binary' in elem.tag and elem.attrib.has_key('id'):
@ -130,8 +132,17 @@ class FB2Input(InputFormatPlugin):
fname = elem.attrib['id']
ext = ct.rpartition('/')[-1].lower()
if ext in ('png', 'jpeg', 'jpg'):
fname += '.' + ext
if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg',
'png'}:
fname += '.' + ext
self.binary_map[elem.get('id')] = fname
data = b64decode(elem.text.strip())
open(fname, 'wb').write(data)
raw = elem.text.strip()
try:
data = base64_decode(raw)
except TypeError:
self.log.exception('Binary data with id=%s is corrupted, ignoring'%(
elem.get('id')))
else:
with open(fname, 'wb') as f:
f.write(data)

View File

@ -0,0 +1,52 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
def base64_decode(raw):
    """Decode base64 data, tolerating input that is incorrectly encoded.

    The standard library decoder is tried first because it is faster. If it
    rejects the input, a lenient decoder (adapted from the FBReader sources)
    is used instead. The lenient decoder:

    * skips every byte that is not part of the base64 alphabet,
    * treats ``=`` as the end of the current four character group, and
    * decodes a truncated final group as far as its bits allow.

    Only bytes that are fully described by the input are emitted, so padded
    or truncated groups never add spurious trailing bytes.

    :param raw: The base64 data, as a byte string, bytearray or text.
    :return: The decoded data as a byte string.
    """
    from base64 import b64decode

    # First try the python implementation as it is faster. Python 2 signals
    # malformed input with TypeError, Python 3 with binascii.Error, which is
    # a subclass of ValueError.
    try:
        return b64decode(raw)
    except (TypeError, ValueError):
        pass

    # bytearray() cannot be built from text without an encoding. Anything
    # outside ASCII cannot be part of the base64 alphabet, so drop it here.
    if not isinstance(raw, (bytes, bytearray)):
        raw = raw.encode('ascii', 'ignore')

    alphabet = bytearray(
        b'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/')
    sextet_of = dict((byt, idx) for idx, byt in enumerate(alphabet))
    pad = bytearray(b'=')[0]
    out = bytearray()

    def emit(bits, count):
        # Left align the sextets collected so far inside a 24 bit group and
        # keep only the bytes they completely describe (2 sextets -> 1 byte,
        # 3 -> 2, 4 -> 3). A lone sextet does not make up a whole byte.
        bits <<= 6 * (4 - count)
        triple = bytearray(
            ((bits >> 16) & 0xff, (bits >> 8) & 0xff, bits & 0xff))
        out.extend(triple[:count * 6 // 8])

    bits = count = 0
    for byt in bytearray(raw):
        if byt == pad:
            # Padding closes the current group. It carries no data, so it
            # must not be added to the accumulator.
            if count:
                emit(bits, count)
                bits = count = 0
            continue
        num = sextet_of.get(byt)
        if num is None:
            # Ignore this byte
            continue
        bits = (bits << 6) | num
        count += 1
        if count == 4:
            emit(bits, count)
            bits = count = 0
    if count:
        # The input ended in the middle of a group
        emit(bits, count)
    return bytes(out)

View File

@ -8,7 +8,6 @@ __copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>, '\
import os
import datetime
from functools import partial
from base64 import b64decode
from lxml import etree
from calibre.utils.date import parse_date
from calibre import guess_type, guess_all_extensions, prints, force_unicode
@ -143,6 +142,7 @@ def _parse_cover(root, mi):
pass
def _parse_cover_data(root, imgid, mi):
from calibre.ebooks.fb2 import base64_decode
elm_binary = XPath('//fb2:binary[@id="%s"]'%imgid)(root)
if elm_binary:
mimetype = elm_binary[0].get('content-type', 'image/jpeg')
@ -156,7 +156,8 @@ def _parse_cover_data(root, imgid, mi):
if mime_extensions:
pic_data = elm_binary[0].text
if pic_data:
mi.cover_data = (mime_extensions[0][1:], b64decode(pic_data))
mi.cover_data = (mime_extensions[0][1:],
base64_decode(pic_data.strip()))
else:
prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid) )