FB2 Input: More robust base64 decoding to handle embedded images that are incorrectly encoded. Fixes #990929 (Private bug)

This commit is contained in:
Kovid Goyal 2012-04-29 13:51:09 +05:30
parent 0d506a3ee7
commit db3203176e
3 changed files with 70 additions and 6 deletions

View File

@ -5,13 +5,13 @@ __copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
Convert .fb2 files to .lrf Convert .fb2 files to .lrf
""" """
import os, re import os, re
from base64 import b64decode
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre import guess_type from calibre import guess_type
FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0' FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'
class FB2Input(InputFormatPlugin): class FB2Input(InputFormatPlugin):
name = 'FB2 Input' name = 'FB2 Input'
@ -41,6 +41,7 @@ class FB2Input(InputFormatPlugin):
from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
NAMESPACES = {'f':FB2NS, 'l':XLINK_NS} NAMESPACES = {'f':FB2NS, 'l':XLINK_NS}
self.log = log
log.debug('Parsing XML...') log.debug('Parsing XML...')
raw = stream.read().replace('\0', '') raw = stream.read().replace('\0', '')
raw = xml_to_unicode(raw, strip_encoding_pats=True, raw = xml_to_unicode(raw, strip_encoding_pats=True,
@ -123,6 +124,7 @@ class FB2Input(InputFormatPlugin):
return os.path.join(os.getcwdu(), u'metadata.opf') return os.path.join(os.getcwdu(), u'metadata.opf')
def extract_embedded_content(self, doc): def extract_embedded_content(self, doc):
from calibre.ebooks.fb2 import base64_decode
self.binary_map = {} self.binary_map = {}
for elem in doc.xpath('./*'): for elem in doc.xpath('./*'):
if elem.text and 'binary' in elem.tag and elem.attrib.has_key('id'): if elem.text and 'binary' in elem.tag and elem.attrib.has_key('id'):
@ -130,8 +132,17 @@ class FB2Input(InputFormatPlugin):
fname = elem.attrib['id'] fname = elem.attrib['id']
ext = ct.rpartition('/')[-1].lower() ext = ct.rpartition('/')[-1].lower()
if ext in ('png', 'jpeg', 'jpg'): if ext in ('png', 'jpeg', 'jpg'):
if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg',
'png'}:
fname += '.' + ext fname += '.' + ext
self.binary_map[elem.get('id')] = fname self.binary_map[elem.get('id')] = fname
data = b64decode(elem.text.strip()) raw = elem.text.strip()
open(fname, 'wb').write(data) try:
data = base64_decode(raw)
except TypeError:
self.log.exception('Binary data with id=%s is corrupted, ignoring'%(
elem.get('id')))
else:
with open(fname, 'wb') as f:
f.write(data)

View File

@ -0,0 +1,52 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
def base64_decode(raw):
    """Decode base64 data, tolerating malformed input.

    The standard library decoder is tried first because it is faster. If
    it rejects the input (for example because of missing or incorrect
    padding), a lenient decoder (adapted from FBReader sources) is used
    instead. The lenient decoder:

    * skips every byte that is not part of the base64 alphabet,
    * treats ``=`` as padding that occupies a slot in the current group
      of four but contributes no data bits,
    * emits only the bytes that are fully covered by the data characters
      actually seen, so truncated or padded trailing groups do not add
      spurious bytes to the output.

    :param raw: The base64 encoded data as a bytestring or a text string.
    :return: The decoded data as a bytestring.
    """
    from io import BytesIO
    from base64 import b64decode

    # First try the python implementation as it is faster.
    # Python 2 signals bad input with TypeError, Python 3 with
    # binascii.Error (a ValueError subclass); non-ASCII text input raises
    # a UnicodeError, which is also a ValueError.
    try:
        return b64decode(raw)
    except (TypeError, ValueError):
        pass

    # Try a more robust version (adapted from FBReader sources)
    if isinstance(raw, type(u'')):
        # bytearray() cannot consume text directly. Non-ASCII characters
        # are never part of the base64 alphabet, so dropping them here is
        # equivalent to skipping them in the loop below.
        raw = raw.encode('ascii', 'ignore')

    alphabet = bytearray(
        b'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/')
    values = {byt: num for num, byt in enumerate(alphabet)}
    pad = ord(b'=')

    raw = bytearray(raw)
    out = BytesIO()
    pos, end = 0, len(raw)
    while pos < end:
        tot = 0      # 24-bit accumulator for the current group
        slots = 0    # positions consumed in this group (data + padding)
        sextets = 0  # positions that carried real data
        while slots < 4 and pos < end:
            byt = raw[pos]
            pos += 1
            if byt == pad:
                # Padding keeps the group aligned but carries no bits
                slots += 1
                continue
            num = values.get(byt)
            if num is None:
                # Ignore this byte
                continue
            tot |= num << (6 * (3 - slots))
            slots += 1
            sextets += 1
        triple = bytearray(
            ((tot >> 16) & 0xff, (tot >> 8) & 0xff, tot & 0xff))
        # n data characters fully determine (n * 6) // 8 output bytes
        out.write(bytes(triple[:sextets * 6 // 8]))
    return out.getvalue()

View File

@ -8,7 +8,6 @@ __copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>, '\
import os import os
import datetime import datetime
from functools import partial from functools import partial
from base64 import b64decode
from lxml import etree from lxml import etree
from calibre.utils.date import parse_date from calibre.utils.date import parse_date
from calibre import guess_type, guess_all_extensions, prints, force_unicode from calibre import guess_type, guess_all_extensions, prints, force_unicode
@ -143,6 +142,7 @@ def _parse_cover(root, mi):
pass pass
def _parse_cover_data(root, imgid, mi): def _parse_cover_data(root, imgid, mi):
from calibre.ebooks.fb2 import base64_decode
elm_binary = XPath('//fb2:binary[@id="%s"]'%imgid)(root) elm_binary = XPath('//fb2:binary[@id="%s"]'%imgid)(root)
if elm_binary: if elm_binary:
mimetype = elm_binary[0].get('content-type', 'image/jpeg') mimetype = elm_binary[0].get('content-type', 'image/jpeg')
@ -156,7 +156,8 @@ def _parse_cover_data(root, imgid, mi):
if mime_extensions: if mime_extensions:
pic_data = elm_binary[0].text pic_data = elm_binary[0].text
if pic_data: if pic_data:
mi.cover_data = (mime_extensions[0][1:], b64decode(pic_data)) mi.cover_data = (mime_extensions[0][1:],
base64_decode(pic_data.strip()))
else: else:
prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid) ) prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid) )