mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
FB2 Input: More robust base64 decoding to handle embedded images that are incorrectly encoded. Fixes #990929 (Private bug)
This commit is contained in:
parent
0d506a3ee7
commit
db3203176e
@ -5,13 +5,13 @@ __copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
|
||||
Convert .fb2 files to .lrf
|
||||
"""
|
||||
import os, re
|
||||
from base64 import b64decode
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre import guess_type
|
||||
|
||||
FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'
|
||||
|
||||
|
||||
class FB2Input(InputFormatPlugin):
|
||||
|
||||
name = 'FB2 Input'
|
||||
@ -41,6 +41,7 @@ class FB2Input(InputFormatPlugin):
|
||||
from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS, RECOVER_PARSER
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
NAMESPACES = {'f':FB2NS, 'l':XLINK_NS}
|
||||
self.log = log
|
||||
log.debug('Parsing XML...')
|
||||
raw = stream.read().replace('\0', '')
|
||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||
@ -123,6 +124,7 @@ class FB2Input(InputFormatPlugin):
|
||||
return os.path.join(os.getcwdu(), u'metadata.opf')
|
||||
|
||||
def extract_embedded_content(self, doc):
|
||||
from calibre.ebooks.fb2 import base64_decode
|
||||
self.binary_map = {}
|
||||
for elem in doc.xpath('./*'):
|
||||
if elem.text and 'binary' in elem.tag and elem.attrib.has_key('id'):
|
||||
@ -130,8 +132,17 @@ class FB2Input(InputFormatPlugin):
|
||||
fname = elem.attrib['id']
|
||||
ext = ct.rpartition('/')[-1].lower()
|
||||
if ext in ('png', 'jpeg', 'jpg'):
|
||||
fname += '.' + ext
|
||||
if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg',
|
||||
'png'}:
|
||||
fname += '.' + ext
|
||||
self.binary_map[elem.get('id')] = fname
|
||||
data = b64decode(elem.text.strip())
|
||||
open(fname, 'wb').write(data)
|
||||
raw = elem.text.strip()
|
||||
try:
|
||||
data = base64_decode(raw)
|
||||
except TypeError:
|
||||
self.log.exception('Binary data with id=%s is corrupted, ignoring'%(
|
||||
elem.get('id')))
|
||||
else:
|
||||
with open(fname, 'wb') as f:
|
||||
f.write(data)
|
||||
|
||||
|
@ -0,0 +1,52 @@
|
||||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
def base64_decode(raw):
    """Decode base64 data, tolerating incorrectly encoded input.

    The standard library decoder is tried first as it is faster. If it
    rejects the input, a lenient decoder (adapted from FBReader sources)
    is used instead. The lenient decoder:

      * ignores every byte that is not part of the base64 alphabet,
      * treats ``=`` as the end of the current group, discarding any
        leftover bits instead of decoding the padding as data,
      * emits only complete bytes, so missing or wrong padding never
        produces spurious trailing bytes.

    :param raw: The base64 encoded data, as a bytestring or as text.
    :return: The decoded data as a bytestring.
    """
    from base64 import b64decode

    # First try the python implementation as it is faster
    try:
        return b64decode(raw)
    except (TypeError, ValueError):
        # Malformed input is reported as TypeError on Python 2 and as
        # binascii.Error (a ValueError subclass) on Python 3
        pass

    # Try a more robust version (adapted from FBReader sources)
    if not isinstance(raw, (bytes, bytearray)):
        # Text input: characters outside ASCII can never be valid base64,
        # so dropping them is the same as ignoring them below
        raw = raw.encode('ascii', 'ignore')

    alphabet = bytearray(
        b'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/')
    sextets = {byt: num for num, byt in enumerate(alphabet)}
    equal = bytearray(b'=')[0]

    out = bytearray()
    acc = 0  # Bits collected so far that have not yet been emitted
    nbits = 0  # Number of valid bits currently held in acc
    for byt in bytearray(raw):
        if byt == equal:
            # Padding terminates the current group, any partial byte is
            # dropped and decoding restarts on a clean boundary
            acc = nbits = 0
            continue
        num = sextets.get(byt)
        if num is None:
            # Ignore this byte
            continue
        acc = (acc << 6) | num
        nbits += 6
        if nbits >= 8:
            nbits -= 8
            out.append((acc >> nbits) & 0xff)
            acc &= (1 << nbits) - 1
    return bytes(out)
|
||||
|
||||
|
@ -8,7 +8,6 @@ __copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>, '\
|
||||
import os
|
||||
import datetime
|
||||
from functools import partial
|
||||
from base64 import b64decode
|
||||
from lxml import etree
|
||||
from calibre.utils.date import parse_date
|
||||
from calibre import guess_type, guess_all_extensions, prints, force_unicode
|
||||
@ -143,6 +142,7 @@ def _parse_cover(root, mi):
|
||||
pass
|
||||
|
||||
def _parse_cover_data(root, imgid, mi):
|
||||
from calibre.ebooks.fb2 import base64_decode
|
||||
elm_binary = XPath('//fb2:binary[@id="%s"]'%imgid)(root)
|
||||
if elm_binary:
|
||||
mimetype = elm_binary[0].get('content-type', 'image/jpeg')
|
||||
@ -156,7 +156,8 @@ def _parse_cover_data(root, imgid, mi):
|
||||
if mime_extensions:
|
||||
pic_data = elm_binary[0].text
|
||||
if pic_data:
|
||||
mi.cover_data = (mime_extensions[0][1:], b64decode(pic_data))
|
||||
mi.cover_data = (mime_extensions[0][1:],
|
||||
base64_decode(pic_data.strip()))
|
||||
else:
|
||||
prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid) )
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user