diff --git a/src/calibre/ebooks/conversion/plugins/html_input.py b/src/calibre/ebooks/conversion/plugins/html_input.py index f00ccb9d9b..558b4636b4 100644 --- a/src/calibre/ebooks/conversion/plugins/html_input.py +++ b/src/calibre/ebooks/conversion/plugins/html_input.py @@ -7,7 +7,7 @@ __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import re, tempfile, os, imghdr +import re, tempfile, os from functools import partial from itertools import izip from urllib import quote @@ -17,6 +17,7 @@ from calibre.customize.conversion import (InputFormatPlugin, OptionRecommendation) from calibre.utils.localization import get_lang from calibre.utils.filenames import ascii_filename +from calibre.utils.imghdr import what class HTMLInput(InputFormatPlugin): @@ -250,7 +251,7 @@ class HTMLInput(InputFormatPlugin): if media_type == self.BINARY_MIME: # Check for the common case, images try: - img = imghdr.what(link) + img = what(link) except EnvironmentError: pass else: diff --git a/src/calibre/ebooks/conversion/plugins/rtf_input.py b/src/calibre/ebooks/conversion/plugins/rtf_input.py index 9249ea8d48..45d7f16608 100644 --- a/src/calibre/ebooks/conversion/plugins/rtf_input.py +++ b/src/calibre/ebooks/conversion/plugins/rtf_input.py @@ -105,7 +105,7 @@ class RTFInput(InputFormatPlugin): return f.read() def extract_images(self, picts): - import imghdr + from calibre.utils.imghdr import what self.log('Extracting images...') with open(picts, 'rb') as f: @@ -120,7 +120,7 @@ class RTFInput(InputFormatPlugin): if len(enc) % 2 == 1: enc = enc[:-1] data = enc.decode('hex') - fmt = imghdr.what(None, data) + fmt = what(None, data) if fmt is None: fmt = 'wmf' count += 1 diff --git a/src/calibre/ebooks/metadata/mobi.py b/src/calibre/ebooks/metadata/mobi.py index ab475f33a8..7ad9a01962 100644 --- a/src/calibre/ebooks/metadata/mobi.py +++ b/src/calibre/ebooks/metadata/mobi.py @@ -9,7 +9,7 @@ __copyright__ = '2009, Kovid Goyal kovid@kovidgoyal.net and ' \ 'Marshall T. Vandegrift ' __docformat__ = 'restructuredtext en' -import os, cStringIO, imghdr +import os, cStringIO from struct import pack, unpack from cStringIO import StringIO @@ -18,12 +18,13 @@ from calibre.ebooks.mobi import MobiError, MAX_THUMB_DIMEN from calibre.ebooks.mobi.utils import rescale_image from calibre.ebooks.mobi.langcodes import iana2mobi from calibre.utils.date import now as nowf +from calibre.utils.imghdr import what from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1 def is_image(ss): if ss is None: return False - return imghdr.what(None, ss[:200]) is not None + return what(None, ss[:200]) is not None class StreamSlicer(object): diff --git a/src/calibre/ebooks/mobi/debug/mobi8.py b/src/calibre/ebooks/mobi/debug/mobi8.py index 213e15cf85..e1c8ffba44 100644 --- a/src/calibre/ebooks/mobi/debug/mobi8.py +++ b/src/calibre/ebooks/mobi/debug/mobi8.py @@ -8,7 +8,7 @@ __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import sys, os, imghdr, struct, textwrap +import sys, os, struct, textwrap from itertools import izip from calibre import CurrentDir @@ -18,6 +18,7 @@ from calibre.ebooks.mobi.debug.index import (SKELIndex, SECTIndex, NCXIndex, from calibre.ebooks.mobi.utils import read_font_record, decode_tbs, RECORD_SIZE from calibre.ebooks.mobi.debug import format_bytes from calibre.ebooks.mobi.reader.headers import NULL_INDEX +from calibre.utils.imghdr import what class FDST(object): @@ -173,7 +174,7 @@ class MOBIFile(object): font['raw_data']) prefix, ext = 'fonts', font['ext'] elif sig not in known_types: - q = imghdr.what(None, rec.raw) + q = what(None, rec.raw) if q: prefix, ext = 'images', q diff --git a/src/calibre/ebooks/mobi/reader/mobi8.py b/src/calibre/ebooks/mobi/reader/mobi8.py index 8938b103d3..a55f6bd7e3 100644 --- a/src/calibre/ebooks/mobi/reader/mobi8.py +++ b/src/calibre/ebooks/mobi/reader/mobi8.py @@ -7,7 +7,7 @@ __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import struct, re, os, imghdr +import struct, re, os from collections import namedtuple from itertools import repeat, izip from urlparse import urldefrag @@ -23,6 +23,7 @@ from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.mobi.utils import read_font_record from calibre.ebooks.oeb.parse_utils import parse_html from calibre.ebooks.oeb.base import XPath, XHTML, xml2text +from calibre.utils.imghdr import what Part = namedtuple('Part', 'num type filename start end aid') @@ -403,7 +404,7 @@ class Mobi8Reader(object): if font['encrypted']: self.encrypted_fonts.append(href) else: - imgtype = imghdr.what(None, data) + imgtype = what(None, data) if imgtype is None: imgtype = 'unknown' href = 'images/%05d.%s'%(fname_idx, imgtype) diff --git a/src/calibre/ebooks/mobi/utils.py b/src/calibre/ebooks/mobi/utils.py index 09e3055a6e..e9bc4f669f 100644 --- a/src/calibre/ebooks/mobi/utils.py +++ b/src/calibre/ebooks/mobi/utils.py @@ -7,11 +7,12 @@ __license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import struct, string, imghdr, zlib, os +import struct, string, zlib, os from collections import OrderedDict from io import BytesIO from calibre.utils.magick.draw import Image, save_cover_data_to, thumbnail +from calibre.utils.imghdr import what from calibre.ebooks import normalize IMAGE_MAX_SIZE = 10 * 1024 * 1024 @@ -384,9 +385,9 @@ def to_base(num, base=32, min_num_digits=None): def mobify_image(data): 'Convert PNG images to GIF as the idiotic Kindle cannot display some PNG' - what = imghdr.what(None, data) + fmt = what(None, data) - if what == 'png': + if fmt == 'png': im = Image() im.load(data) data = im.export('gif') diff --git a/src/calibre/ebooks/mobi/writer2/resources.py b/src/calibre/ebooks/mobi/writer2/resources.py index bdf20a6f2c..01ce6a0135 100644 --- a/src/calibre/ebooks/mobi/writer2/resources.py +++ b/src/calibre/ebooks/mobi/writer2/resources.py @@ -7,13 +7,12 @@ __license__ = 'GPL v3' __copyright__ = '2012, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import imghdr - from calibre.ebooks.mobi import MAX_THUMB_DIMEN, MAX_THUMB_SIZE from calibre.ebooks.mobi.utils import (rescale_image, mobify_image, write_font_record) from calibre.ebooks import generate_masthead from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES +from calibre.utils.imghdr import what PLACEHOLDER_GIF = b'GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff!\xf9\x04\x01\x00\x00\x00\x00,\x00\x00\x00\x00\x01\x00\x01\x00@\x02\x01D\x00;' @@ -84,7 +83,7 @@ class Resources(object): self.image_indices.add(len(self.records)) self.records.append(data) self.item_map[item.href] = index - self.mime_map[item.href] = 'image/%s'%imghdr.what(None, data) + self.mime_map[item.href] = 'image/%s'%what(None, data) index += 1 if cover_href and item.href == cover_href: diff --git a/src/calibre/utils/imghdr.py b/src/calibre/utils/imghdr.py new file mode 100644 index 0000000000..3bd515bac5 --- /dev/null +++ b/src/calibre/utils/imghdr.py @@ -0,0 +1,156 @@ +"""Recognize image file formats based on their first few bytes.""" + +__all__ = ["what"] + +#-------------------------# +# Recognize image headers # +#-------------------------# + +def what(file, h=None): + if h is None: + if isinstance(file, basestring): + f = open(file, 'rb') + h = f.read(32) + else: + location = file.tell() + h = file.read(32) + file.seek(location) + f = None + else: + f = None + try: + for tf in tests: + res = tf(h, f) + if res: + return res + finally: + if f: f.close() + return None + + +#---------------------------------# +# Subroutines per image file type # +#---------------------------------# + +tests = [] + +def test_jpeg(h, f): + """JPEG data in JFIF format (Changed by Kovid to mimic the file utility, + the original code was failing with some jpegs that included ICC_PROFILE + data, for example: http://nationalpostnews.files.wordpress.com/2013/03/budget.jpeg?w=300&h=1571)""" + if (h[6:10] in (b'JFIF', b'Exif')) or (h[:2] == b'\xff\xd8' and b'JFIF' in h[:32]): + return 'jpeg' + +tests.append(test_jpeg) + +def test_png(h, f): + if h[:8] == "\211PNG\r\n\032\n": + return 'png' + +tests.append(test_png) + +def test_gif(h, f): + """GIF ('87 and '89 variants)""" + if h[:6] in ('GIF87a', 'GIF89a'): + return 'gif' + +tests.append(test_gif) + +def test_tiff(h, f): + """TIFF (can be in Motorola or Intel byte order)""" + if h[:2] in ('MM', 'II'): + return 'tiff' + +tests.append(test_tiff) + +def test_rgb(h, f): + """SGI image library""" + if h[:2] == '\001\332': + return 'rgb' + +tests.append(test_rgb) + +def test_pbm(h, f): + """PBM (portable bitmap)""" + if len(h) >= 3 and \ + h[0] == 'P' and h[1] in '14' and h[2] in ' \t\n\r': + return 'pbm' + +tests.append(test_pbm) + +def test_pgm(h, f): + """PGM (portable graymap)""" + if len(h) >= 3 and \ + h[0] == 'P' and h[1] in '25' and h[2] in ' \t\n\r': + return 'pgm' + +tests.append(test_pgm) + +def test_ppm(h, f): + """PPM (portable pixmap)""" + if len(h) >= 3 and \ + h[0] == 'P' and h[1] in '36' and h[2] in ' \t\n\r': + return 'ppm' + +tests.append(test_ppm) + +def test_rast(h, f): + """Sun raster file""" + if h[:4] == '\x59\xA6\x6A\x95': + return 'rast' + +tests.append(test_rast) + +def test_xbm(h, f): + """X bitmap (X10 or X11)""" + s = '#define ' + if h[:len(s)] == s: + return 'xbm' + +tests.append(test_xbm) + +def test_bmp(h, f): + if h[:2] == 'BM': + return 'bmp' + +tests.append(test_bmp) + +#--------------------# +# Small test program # +#--------------------# + +def test(): + import sys + recursive = 0 + if sys.argv[1:] and sys.argv[1] == '-r': + del sys.argv[1:2] + recursive = 1 + try: + if sys.argv[1:]: + testall(sys.argv[1:], recursive, 1) + else: + testall(['.'], recursive, 1) + except KeyboardInterrupt: + sys.stderr.write('\n[Interrupted]\n') + sys.exit(1) + +def testall(list, recursive, toplevel): + import sys + import os + for filename in list: + if os.path.isdir(filename): + print filename + '/:', + if recursive or toplevel: + print 'recursing down:' + import glob + names = glob.glob(os.path.join(filename, '*')) + testall(names, recursive, 0) + else: + print '*** directory (use -r) ***' + else: + print filename + ':', + sys.stdout.flush() + try: + print what(filename) + except IOError: + print '*** not found ***' diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py index 7cc8bd9309..95b8cf0253 100644 --- a/src/calibre/web/fetch/simple.py +++ b/src/calibre/web/fetch/simple.py @@ -7,7 +7,7 @@ __copyright__ = '2008, Kovid Goyal ' Fetch a webpage and its links recursively. The webpages are saved to disk in UTF-8 encoding with any charset declarations removed. ''' -import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback, imghdr +import sys, socket, os, urlparse, re, time, copy, urllib2, threading, traceback from urllib import url2pathname, quote from httplib import responses from base64 import b64decode @@ -21,6 +21,7 @@ from calibre.utils.config import OptionParser from calibre.utils.logging import Log from calibre.utils.magick import Image from calibre.utils.magick.draw import identify_data, thumbnail +from calibre.utils.imghdr import what class FetchError(Exception): pass @@ -413,7 +414,7 @@ class RecursiveFetcher(object): fname = ascii_filename('img'+str(c)) if isinstance(fname, unicode): fname = fname.encode('ascii', 'replace') - itype = imghdr.what(None, data) + itype = what(None, data) if itype is None and b'