Fix bug in imghdr that caused some JPEG files to not be identified

This commit is contained in:
Kovid Goyal 2013-03-20 11:49:27 +05:30
parent 9d90ba326d
commit 79c8ede0a8
9 changed files with 179 additions and 18 deletions

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, tempfile, os, imghdr
import re, tempfile, os
from functools import partial
from itertools import izip
from urllib import quote
@ -17,6 +17,7 @@ from calibre.customize.conversion import (InputFormatPlugin,
OptionRecommendation)
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
from calibre.utils.imghdr import what
class HTMLInput(InputFormatPlugin):
@ -250,7 +251,7 @@ class HTMLInput(InputFormatPlugin):
if media_type == self.BINARY_MIME:
# Check for the common case, images
try:
img = imghdr.what(link)
img = what(link)
except EnvironmentError:
pass
else:

View File

@ -105,7 +105,7 @@ class RTFInput(InputFormatPlugin):
return f.read()
def extract_images(self, picts):
import imghdr
from calibre.utils.imghdr import what
self.log('Extracting images...')
with open(picts, 'rb') as f:
@ -120,7 +120,7 @@ class RTFInput(InputFormatPlugin):
if len(enc) % 2 == 1:
enc = enc[:-1]
data = enc.decode('hex')
fmt = imghdr.what(None, data)
fmt = what(None, data)
if fmt is None:
fmt = 'wmf'
count += 1

View File

@ -9,7 +9,7 @@ __copyright__ = '2009, Kovid Goyal kovid@kovidgoyal.net and ' \
'Marshall T. Vandegrift <llasram@gmail.com>'
__docformat__ = 'restructuredtext en'
import os, cStringIO, imghdr
import os, cStringIO
from struct import pack, unpack
from cStringIO import StringIO
@ -18,12 +18,13 @@ from calibre.ebooks.mobi import MobiError, MAX_THUMB_DIMEN
from calibre.ebooks.mobi.utils import rescale_image
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.utils.date import now as nowf
from calibre.utils.imghdr import what
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
def is_image(ss):
if ss is None:
return False
return imghdr.what(None, ss[:200]) is not None
return what(None, ss[:200]) is not None
class StreamSlicer(object):

View File

@ -8,7 +8,7 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, os, imghdr, struct, textwrap
import sys, os, struct, textwrap
from itertools import izip
from calibre import CurrentDir
@ -18,6 +18,7 @@ from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex, NCXIndex,
from calibre.ebooks.mobi.utils import read_font_record, decode_tbs, RECORD_SIZE
from calibre.ebooks.mobi.debug import format_bytes
from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.utils.imghdr import what
class FDST(object):
@ -173,7 +174,7 @@ class MOBIFile(object):
font['raw_data'])
prefix, ext = 'fonts', font['ext']
elif sig not in known_types:
q = imghdr.what(None, rec.raw)
q = what(None, rec.raw)
if q:
prefix, ext = 'images', q

View File

@ -7,7 +7,7 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, re, os, imghdr
import struct, re, os
from collections import namedtuple
from itertools import repeat, izip
from urlparse import urldefrag
@ -23,6 +23,7 @@ from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.utils import read_font_record
from calibre.ebooks.oeb.parse_utils import parse_html
from calibre.ebooks.oeb.base import XPath, XHTML, xml2text
from calibre.utils.imghdr import what
Part = namedtuple('Part',
'num type filename start end aid')
@ -403,7 +404,7 @@ class Mobi8Reader(object):
if font['encrypted']:
self.encrypted_fonts.append(href)
else:
imgtype = imghdr.what(None, data)
imgtype = what(None, data)
if imgtype is None:
imgtype = 'unknown'
href = 'images/%05d.%s'%(fname_idx, imgtype)

View File

@ -7,11 +7,12 @@ __license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import struct, string, imghdr, zlib, os
import struct, string, zlib, os
from collections import OrderedDict
from io import BytesIO
from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
from calibre.utils.imghdr import what
from calibre.ebooks import normalize
IMAGE_MAX_SIZE = 10 * 1024 * 1024
@ -384,9 +385,9 @@ def to_base(num, base=32, min_num_digits=None):
def mobify_image(data):
'Convert PNG images to GIF as the idiotic Kindle cannot display some PNG'
what = imghdr.what(None, data)
fmt = what(None, data)
if what == 'png':
if fmt == 'png':
im = Image()
im.load(data)
data = im.export('gif')

View File

@ -7,13 +7,12 @@ __license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import imghdr
from calibre.ebooks.mobi import MAX_THUMB_DIMEN, MAX_THUMB_SIZE
from calibre.ebooks.mobi.utils import (rescale_image, mobify_image,
write_font_record)
from calibre.ebooks import generate_masthead
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
from calibre.utils.imghdr import what
PLACEHOLDER_GIF = b'GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff!\xf9\x04\x01\x00\x00\x00\x00,\x00\x00\x00\x00\x01\x00\x01\x00@\x02\x01D\x00;'
@ -84,7 +83,7 @@ class Resources(object):
self.image_indices.add(len(self.records))
self.records.append(data)
self.item_map[item.href] = index
self.mime_map[item.href] = 'image/%s'%imghdr.what(None, data)
self.mime_map[item.href] = 'image/%s'%what(None, data)
index += 1
if cover_href and item.href == cover_href:

156
src/calibre/utils/imghdr.py Normal file
View File

@ -0,0 +1,156 @@
"""Recognize image file formats based on their first few bytes."""
__all__ = ["what"]
#-------------------------#
# Recognize image headers #
#-------------------------#
def what(file, h=None):
if h is None:
if isinstance(file, basestring):
f = open(file, 'rb')
h = f.read(32)
else:
location = file.tell()
h = file.read(32)
file.seek(location)
f = None
else:
f = None
try:
for tf in tests:
res = tf(h, f)
if res:
return res
finally:
if f: f.close()
return None
#---------------------------------#
# Subroutines per image file type #
#---------------------------------#
tests = []
def test_jpeg(h, f):
"""JPEG data in JFIF format (Changed by Kovid to mimic the file utility,
the original code was failing with some jpegs that included ICC_PROFILE
data, for example: http://nationalpostnews.files.wordpress.com/2013/03/budget.jpeg?w=300&h=1571)"""
if (h[6:10] in (b'JFIF', b'Exif')) or (h[:2] == b'\xff\xd8' and b'JFIF' in h[:32]):
return 'jpeg'
tests.append(test_jpeg)
def test_png(h, f):
if h[:8] == "\211PNG\r\n\032\n":
return 'png'
tests.append(test_png)
def test_gif(h, f):
"""GIF ('87 and '89 variants)"""
if h[:6] in ('GIF87a', 'GIF89a'):
return 'gif'
tests.append(test_gif)
def test_tiff(h, f):
"""TIFF (can be in Motorola or Intel byte order)"""
if h[:2] in ('MM', 'II'):
return 'tiff'
tests.append(test_tiff)
def test_rgb(h, f):
"""SGI image library"""
if h[:2] == '\001\332':
return 'rgb'
tests.append(test_rgb)
def test_pbm(h, f):
"""PBM (portable bitmap)"""
if len(h) >= 3 and \
h[0] == 'P' and h[1] in '14' and h[2] in ' \t\n\r':
return 'pbm'
tests.append(test_pbm)
def test_pgm(h, f):
"""PGM (portable graymap)"""
if len(h) >= 3 and \
h[0] == 'P' and h[1] in '25' and h[2] in ' \t\n\r':
return 'pgm'
tests.append(test_pgm)
def test_ppm(h, f):
"""PPM (portable pixmap)"""
if len(h) >= 3 and \
h[0] == 'P' and h[1] in '36' and h[2] in ' \t\n\r':
return 'ppm'
tests.append(test_ppm)
def test_rast(h, f):
"""Sun raster file"""
if h[:4] == '\x59\xA6\x6A\x95':
return 'rast'
tests.append(test_rast)
def test_xbm(h, f):
"""X bitmap (X10 or X11)"""
s = '#define '
if h[:len(s)] == s:
return 'xbm'
tests.append(test_xbm)
def test_bmp(h, f):
if h[:2] == 'BM':
return 'bmp'
tests.append(test_bmp)
#--------------------#
# Small test program #
#--------------------#
def test():
import sys
recursive = 0
if sys.argv[1:] and sys.argv[1] == '-r':
del sys.argv[1:2]
recursive = 1
try:
if sys.argv[1:]:
testall(sys.argv[1:], recursive, 1)
else:
testall(['.'], recursive, 1)
except KeyboardInterrupt:
sys.stderr.write('\n[Interrupted]\n')
sys.exit(1)
def testall(list, recursive, toplevel):
import sys
import os
for filename in list:
if os.path.isdir(filename):
print filename + '/:',
if recursive or toplevel:
print 'recursing down:'
import glob
names = glob.glob(os.path.join(filename, '*'))
testall(names, recursive, 0)
else:
print '*** directory (use -r) ***'
else:
print filename + ':',
sys.stdout.flush()
try:
print what(filename)
except IOError:
print '*** not found ***'

View File

@ -7,7 +7,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
Fetch a webpage and its links recursively. The webpages are saved to disk in
UTF-8 encoding with any charset declarations removed.
'''
import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback, imghdr
import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
from urllib import url2pathname, quote
from httplib import responses
from base64 import b64decode
@ -21,6 +21,7 @@ from calibre.utils.config import OptionParser
from calibre.utils.logging import Log
from calibre.utils.magick import Image
from calibre.utils.magick.draw import identify_data, thumbnail
from calibre.utils.imghdr import what
class FetchError(Exception):
pass
@ -413,7 +414,7 @@ class RecursiveFetcher(object):
fname = ascii_filename('img'+str(c))
if isinstance(fname, unicode):
fname = fname.encode('ascii', 'replace')
itype = imghdr.what(None, data)
itype = what(None, data)
if itype is None and b'<svg' in data[:1024]:
# SVG image
imgpath = os.path.join(diskpath, fname+'.svg')