Fix bug in imghdr that caused some JPEG files to not be identified

2025-08-30 23:00:21 -04:00 · 2013-03-20 11:49:27 +05:30 · 2013-03-20 11:49:27 +05:30 · 79c8ede0a8
commit 79c8ede0a8
parent 9d90ba326d
9 changed files with 179 additions and 18 deletions
--- a/src/calibre/ebooks/conversion/plugins/html_input.py
+++ b/src/calibre/ebooks/conversion/plugins/html_input.py
@@ -7,7 +7,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import re, tempfile, os, imghdr
+import re, tempfile, os
 from functools import partial
 from itertools import izip
 from urllib import quote
@@ -17,6 +17,7 @@ from calibre.customize.conversion import (InputFormatPlugin,
        OptionRecommendation)
 from calibre.utils.localization import get_lang
 from calibre.utils.filenames import ascii_filename
+from calibre.utils.imghdr import what


 class HTMLInput(InputFormatPlugin):
@@ -250,7 +251,7 @@ class HTMLInput(InputFormatPlugin):
            if media_type == self.BINARY_MIME:
                # Check for the common case, images
                try:
-                    img = imghdr.what(link)
+                    img = what(link)
                except EnvironmentError:
                    pass
                else:
--- a/src/calibre/ebooks/conversion/plugins/rtf_input.py
+++ b/src/calibre/ebooks/conversion/plugins/rtf_input.py
@@ -105,7 +105,7 @@ class RTFInput(InputFormatPlugin):
            return f.read()

    def extract_images(self, picts):
-        import imghdr
+        from calibre.utils.imghdr import what
        self.log('Extracting images...')

        with open(picts, 'rb') as f:
@@ -120,7 +120,7 @@ class RTFInput(InputFormatPlugin):
            if len(enc) % 2 == 1:
                enc = enc[:-1]
            data = enc.decode('hex')
-            fmt = imghdr.what(None, data)
+            fmt = what(None, data)
            if fmt is None:
                fmt = 'wmf'
            count += 1
--- a/src/calibre/ebooks/metadata/mobi.py
+++ b/src/calibre/ebooks/metadata/mobi.py
@@ -9,7 +9,7 @@ __copyright__ = '2009, Kovid Goyal kovid@kovidgoyal.net and ' \
    'Marshall T. Vandegrift <llasram@gmail.com>'
 __docformat__ = 'restructuredtext en'

-import os, cStringIO, imghdr
+import os, cStringIO
 from struct import pack, unpack
 from cStringIO import StringIO

@@ -18,12 +18,13 @@ from calibre.ebooks.mobi import MobiError, MAX_THUMB_DIMEN
 from calibre.ebooks.mobi.utils import rescale_image
 from calibre.ebooks.mobi.langcodes import iana2mobi
 from calibre.utils.date import now as nowf
+from calibre.utils.imghdr import what
 from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1

 def is_image(ss):
    if ss is None:
        return False
-    return imghdr.what(None, ss[:200]) is not None
+    return what(None, ss[:200]) is not None

 class StreamSlicer(object):

--- a/src/calibre/ebooks/mobi/debug/mobi8.py
+++ b/src/calibre/ebooks/mobi/debug/mobi8.py
@@ -8,7 +8,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import sys, os, imghdr, struct, textwrap
+import sys, os, struct, textwrap
 from itertools import izip

 from calibre import CurrentDir
@@ -18,6 +18,7 @@ from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex, NCXIndex,
 from calibre.ebooks.mobi.utils import read_font_record, decode_tbs, RECORD_SIZE
 from calibre.ebooks.mobi.debug import format_bytes
 from calibre.ebooks.mobi.reader.headers import NULL_INDEX
+from calibre.utils.imghdr import what

 class FDST(object):

@@ -173,7 +174,7 @@ class MOBIFile(object):
                        font['raw_data'])
                prefix, ext = 'fonts', font['ext']
            elif sig not in known_types:
-                q = imghdr.what(None, rec.raw)
+                q = what(None, rec.raw)
                if q:
                    prefix, ext = 'images', q

--- a/src/calibre/ebooks/mobi/reader/mobi8.py
+++ b/src/calibre/ebooks/mobi/reader/mobi8.py
@@ -7,7 +7,7 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import struct, re, os, imghdr
+import struct, re, os
 from collections import namedtuple
 from itertools import repeat, izip
 from urlparse import urldefrag
@@ -23,6 +23,7 @@ from calibre.ebooks.metadata.toc import TOC
 from calibre.ebooks.mobi.utils import read_font_record
 from calibre.ebooks.oeb.parse_utils import parse_html
 from calibre.ebooks.oeb.base import XPath, XHTML, xml2text
+from calibre.utils.imghdr import what

 Part = namedtuple('Part',
    'num type filename start end aid')
@@ -403,7 +404,7 @@ class Mobi8Reader(object):
                if font['encrypted']:
                    self.encrypted_fonts.append(href)
            else:
-                imgtype = imghdr.what(None, data)
+                imgtype = what(None, data)
                if imgtype is None:
                    imgtype = 'unknown'
                href = 'images/%05d.%s'%(fname_idx, imgtype)
--- a/src/calibre/ebooks/mobi/utils.py
+++ b/src/calibre/ebooks/mobi/utils.py
@@ -7,11 +7,12 @@ __license__   = 'GPL v3'
 __copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import struct, string, imghdr, zlib, os
+import struct, string, zlib, os
 from collections import OrderedDict
 from io import BytesIO

 from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail
+from calibre.utils.imghdr import what
 from calibre.ebooks import normalize

 IMAGE_MAX_SIZE = 10 * 1024 * 1024
@@ -384,9 +385,9 @@ def to_base(num, base=32, min_num_digits=None):

 def mobify_image(data):
    'Convert PNG images to GIF as the idiotic Kindle cannot display some PNG'
-    what = imghdr.what(None, data)
+    fmt = what(None, data)

-    if what == 'png':
+    if fmt == 'png':
        im = Image()
        im.load(data)
        data = im.export('gif')
--- a/src/calibre/ebooks/mobi/writer2/resources.py
+++ b/src/calibre/ebooks/mobi/writer2/resources.py
@@ -7,13 +7,12 @@ __license__   = 'GPL v3'
 __copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'

-import imghdr
-
 from calibre.ebooks.mobi import MAX_THUMB_DIMEN, MAX_THUMB_SIZE
 from calibre.ebooks.mobi.utils import (rescale_image, mobify_image,
        write_font_record)
 from calibre.ebooks import generate_masthead
 from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
+from calibre.utils.imghdr import what

 PLACEHOLDER_GIF = b'GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff!\xf9\x04\x01\x00\x00\x00\x00,\x00\x00\x00\x00\x01\x00\x01\x00@\x02\x01D\x00;'

@@ -84,7 +83,7 @@ class Resources(object):
                self.image_indices.add(len(self.records))
                self.records.append(data)
                self.item_map[item.href] = index
-                self.mime_map[item.href] = 'image/%s'%imghdr.what(None, data)
+                self.mime_map[item.href] = 'image/%s'%what(None, data)
                index += 1

                if cover_href and item.href == cover_href:
--- a/src/calibre/utils/imghdr.py
+++ b/src/calibre/utils/imghdr.py
@@ -0,0 +1,156 @@
+"""Recognize image file formats based on their first few bytes."""
+
+__all__ = ["what"]
+
+#-------------------------#
+# Recognize image headers #
+#-------------------------#
+
+def what(file, h=None):
+    if h is None:
+        if isinstance(file, basestring):
+            f = open(file, 'rb')
+            h = f.read(32)
+        else:
+            location = file.tell()
+            h = file.read(32)
+            file.seek(location)
+            f = None
+    else:
+        f = None
+    try:
+        for tf in tests:
+            res = tf(h, f)
+            if res:
+                return res
+    finally:
+        if f: f.close()
+    return None
+
+
+#---------------------------------#
+# Subroutines per image file type #
+#---------------------------------#
+
+tests = []
+
+def test_jpeg(h, f):
+    """JPEG data in JFIF format (Changed by Kovid to mimic the file utility,
+    the original code was failing with some jpegs that included ICC_PROFILE
+    data, for example: http://nationalpostnews.files.wordpress.com/2013/03/budget.jpeg?w=300&h=1571)"""
+    if (h[6:10] in (b'JFIF', b'Exif')) or (h[:2] == b'\xff\xd8' and b'JFIF' in h[:32]):
+        return 'jpeg'
+
+tests.append(test_jpeg)
+
+def test_png(h, f):
+    if h[:8] == "\211PNG\r\n\032\n":
+        return 'png'
+
+tests.append(test_png)
+
+def test_gif(h, f):
+    """GIF ('87 and '89 variants)"""
+    if h[:6] in ('GIF87a', 'GIF89a'):
+        return 'gif'
+
+tests.append(test_gif)
+
+def test_tiff(h, f):
+    """TIFF (can be in Motorola or Intel byte order)"""
+    if h[:2] in ('MM', 'II'):
+        return 'tiff'
+
+tests.append(test_tiff)
+
+def test_rgb(h, f):
+    """SGI image library"""
+    if h[:2] == '\001\332':
+        return 'rgb'
+
+tests.append(test_rgb)
+
+def test_pbm(h, f):
+    """PBM (portable bitmap)"""
+    if len(h) >= 3 and \
+        h[0] == 'P' and h[1] in '14' and h[2] in ' \t\n\r':
+        return 'pbm'
+
+tests.append(test_pbm)
+
+def test_pgm(h, f):
+    """PGM (portable graymap)"""
+    if len(h) >= 3 and \
+        h[0] == 'P' and h[1] in '25' and h[2] in ' \t\n\r':
+        return 'pgm'
+
+tests.append(test_pgm)
+
+def test_ppm(h, f):
+    """PPM (portable pixmap)"""
+    if len(h) >= 3 and \
+        h[0] == 'P' and h[1] in '36' and h[2] in ' \t\n\r':
+        return 'ppm'
+
+tests.append(test_ppm)
+
+def test_rast(h, f):
+    """Sun raster file"""
+    if h[:4] == '\x59\xA6\x6A\x95':
+        return 'rast'
+
+tests.append(test_rast)
+
+def test_xbm(h, f):
+    """X bitmap (X10 or X11)"""
+    s = '#define '
+    if h[:len(s)] == s:
+        return 'xbm'
+
+tests.append(test_xbm)
+
+def test_bmp(h, f):
+    if h[:2] == 'BM':
+        return 'bmp'
+
+tests.append(test_bmp)
+
+#--------------------#
+# Small test program #
+#--------------------#
+
+def test():
+    import sys
+    recursive = 0
+    if sys.argv[1:] and sys.argv[1] == '-r':
+        del sys.argv[1:2]
+        recursive = 1
+    try:
+        if sys.argv[1:]:
+            testall(sys.argv[1:], recursive, 1)
+        else:
+            testall(['.'], recursive, 1)
+    except KeyboardInterrupt:
+        sys.stderr.write('\n[Interrupted]\n')
+        sys.exit(1)
+
+def testall(list, recursive, toplevel):
+    import sys
+    import os
+    for filename in list:
+        if os.path.isdir(filename):
+            print filename + '/:',
+            if recursive or toplevel:
+                print 'recursing down:'
+                import glob
+                names = glob.glob(os.path.join(filename, '*'))
+                testall(names, recursive, 0)
+            else:
+                print '*** directory (use -r) ***'
+        else:
+            print filename + ':',
+            sys.stdout.flush()
+            try:
+                print what(filename)
+            except IOError:
+                print '*** not found ***'
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -7,7 +7,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 Fetch a webpage and its links recursively. The webpages are saved to disk in
 UTF-8 encoding with any charset declarations removed.
 '''
-import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback, imghdr
+import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback
 from urllib import url2pathname, quote
 from httplib import responses
 from base64 import b64decode
@@ -21,6 +21,7 @@ from calibre.utils.config import OptionParser
 from calibre.utils.logging import Log
 from calibre.utils.magick import Image
 from calibre.utils.magick.draw import identify_data, thumbnail
+from calibre.utils.imghdr import what

 class FetchError(Exception):
    pass
@@ -413,7 +414,7 @@ class RecursiveFetcher(object):
            fname = ascii_filename('img'+str(c))
            if isinstance(fname, unicode):
                fname = fname.encode('ascii', 'replace')
-            itype = imghdr.what(None, data)
+            itype = what(None, data)
            if itype is None and b'<svg' in data[:1024]:
                # SVG image
                imgpath = os.path.join(diskpath, fname+'.svg')