From 63d9d6bca5d073b275cd4f171f65affdb88a14af Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 6 Aug 2015 17:37:49 +0530 Subject: [PATCH] Add a decompressor for LZMA1 --- src/lzma/errors.py | 25 ++++++++++++ src/lzma/lzma1.py | 38 ++++++++++++++++++ src/lzma/lzma_binding.c | 89 ++++++++++++++++++++++++++++++++++++++--- src/lzma/xz.py | 15 +------ 4 files changed, 148 insertions(+), 19 deletions(-) create mode 100644 src/lzma/errors.py create mode 100644 src/lzma/lzma1.py diff --git a/src/lzma/errors.py b/src/lzma/errors.py new file mode 100644 index 0000000000..5bb3415ac4 --- /dev/null +++ b/src/lzma/errors.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2015, Kovid Goyal ' + +from calibre.constants import plugins + +lzma = plugins['lzma_binding'][0] +if not lzma: + raise RuntimeError('Failed to load lzma_binding module with error: %s' % plugins['lzma_binding'][1]) + +LzmaError = lzma.error + +class NotXZ(LzmaError): + pass + +class InvalidXZ(LzmaError): + pass + +class NotLzma(LzmaError): + pass + diff --git a/src/lzma/lzma1.py b/src/lzma/lzma1.py new file mode 100644 index 0000000000..d469613733 --- /dev/null +++ b/src/lzma/lzma1.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +from __future__ import (unicode_literals, division, absolute_import, + print_function) + +__license__ = 'GPL v3' +__copyright__ = '2015, Kovid Goyal ' + +from io import BytesIO +from struct import unpack + +from calibre.ptempfile import SpooledTemporaryFile +from lzma.errors import NotLzma, lzma + +def read_header(f): + raw = f.read(13) + try: + props, dict_size, uncompressed_size = unpack(b' (4 * 5 + 4) * 9 + 8: + raise NotLzma('Not a LZMA file') + return uncompressed_size, raw + +def decompress(raw, outfile=None, bufsize=10*1024*1024): + if isinstance(raw, bytes): + raw = BytesIO(raw) + uncompressed_size, header = read_header(raw) + outfile = outfile or SpooledTemporaryFile(50 * 1024 * 1024, '_lzma_decompress') + lzma.decompress(raw.read, raw.seek, outfile.write, uncompressed_size, header, bufsize) + if uncompressed_size < outfile.tell(): + outfile.seek(uncompressed_size) + outfile.truncate() + return outfile + +if __name__ == '__main__': + import sys + decompress(open(sys.argv[-1], 'rb')) diff --git a/src/lzma/lzma_binding.c b/src/lzma/lzma_binding.c index e771ede742..86efadf9cd 100644 --- a/src/lzma/lzma_binding.c +++ b/src/lzma/lzma_binding.c @@ -87,11 +87,11 @@ delta_decode(PyObject *self, PyObject *args) { static PyObject * decompress2(PyObject *self, PyObject *args) { PyObject *read = NULL, *seek = NULL, *write = NULL, *rres = NULL; - unsigned long bufsize = 0, bytes_written = 0, bytes_read = 0, inbuf_pos = 0, inbuf_len = 0, leftover = 0; + SizeT bufsize = 0, bytes_written = 0, bytes_read = 0, inbuf_pos = 0, inbuf_len = 0, leftover = 0; unsigned char props = 0; char *inbuf = NULL, *outbuf = NULL; CLzma2Dec state; - SRes res = 0; + SRes res = SZ_OK; ELzmaStatus status = LZMA_STATUS_NOT_FINISHED; if (!PyArg_ParseTuple(args, "OOOBk", &read, &seek, &write, &props, &bufsize)) return NULL; @@ -108,13 +108,18 @@ decompress2(PyObject *self, PyObject *args) { while (status != LZMA_STATUS_FINISHED_WITH_MARK) { bytes_written = bufsize; bytes_read = inbuf_len - inbuf_pos; - Py_BEGIN_ALLOW_THREADS; - res = Lzma2Dec_DecodeToBuf(&state, (Byte*)outbuf, &bytes_written, (Byte*)(inbuf) + inbuf_pos, &bytes_read, LZMA_FINISH_ANY, &status); - Py_END_ALLOW_THREADS; + if (bytes_read) { + Py_BEGIN_ALLOW_THREADS; + res = Lzma2Dec_DecodeToBuf(&state, (Byte*)outbuf, &bytes_written, (Byte*)(inbuf) + inbuf_pos, &bytes_read, LZMA_FINISH_ANY, &status); + Py_END_ALLOW_THREADS; + } else { res = SZ_OK; bytes_written = 0; status = LZMA_STATUS_NEEDS_MORE_INPUT; } if (res != SZ_OK) { SET_ERROR(res); goto exit; } if (bytes_written > 0) { if(!PyObject_CallFunction(write, "s#", outbuf, bytes_written)) goto exit; } + if (inbuf_len > inbuf_pos && !bytes_read && !bytes_written && status != LZMA_STATUS_NEEDS_MORE_INPUT && status != LZMA_STATUS_FINISHED_WITH_MARK) { + SET_ERROR(SZ_ERROR_DATA); goto exit; + } if (bytes_read > 0) inbuf_pos += bytes_read; if (status == LZMA_STATUS_NEEDS_MORE_INPUT) { leftover = inbuf_len - inbuf_pos; @@ -141,11 +146,85 @@ exit: Py_RETURN_NONE; } +static PyObject* +decompress(PyObject *self, PyObject *args) { + PyObject *read = NULL, *seek = NULL, *write = NULL, *rres = NULL; + UInt64 decompressed_size = 0; + int size_known = 0; + Py_ssize_t header_size = 0; + unsigned char *header = NULL, *inbuf = NULL, *outbuf = NULL; + CLzmaDec state; + SRes res = 0; + SizeT bufsize = 0, bytes_written = 0, bytes_read = 0, inbuf_pos = 0, inbuf_len = 0, leftover = 0, total_written = 0; + ELzmaStatus status = LZMA_STATUS_NOT_FINISHED; + ELzmaFinishMode finish_mode = LZMA_FINISH_ANY; + + if(!PyArg_ParseTuple(args, "OOOKs#k", &read, &seek, &write, &decompressed_size, &header, &header_size, &bufsize)) return NULL; + size_known = (decompressed_size != (UInt64)(Int64)-1); + if (header_size != 13) { PyErr_SetString(LZMAError, "Header must be exactly 13 bytes long"); return NULL; } + if (!decompressed_size) { PyErr_SetString(LZMAError, "Cannot decompress empty file"); return NULL; } + + LzmaDec_Construct(&state); + res = LzmaDec_Allocate(&state, header, LZMA_PROPS_SIZE, &allocator); + if (res == SZ_ERROR_MEM) { PyErr_NoMemory(); return NULL; } + if (res != SZ_OK) { PyErr_SetString(PyExc_TypeError, "Incorrect stream properties"); goto exit; } + inbuf = (unsigned char*)PyMem_Malloc(bufsize); + outbuf = (unsigned char*)PyMem_Malloc(bufsize); + if (!inbuf || !outbuf) {PyErr_NoMemory(); goto exit;} + + LzmaDec_Init(&state); + + while (status != LZMA_STATUS_FINISHED_WITH_MARK) { + bytes_written = bufsize; bytes_read = inbuf_len - inbuf_pos; + if (bytes_read) { + Py_BEGIN_ALLOW_THREADS; + finish_mode = LZMA_FINISH_ANY; + if (size_known && total_written + bufsize > decompressed_size) finish_mode = LZMA_FINISH_END; + res = LzmaDec_DecodeToBuf(&state, (Byte*)outbuf, &bytes_written, (Byte*)(inbuf) + inbuf_pos, &bytes_read, finish_mode, &status); + Py_END_ALLOW_THREADS; + } else { res = SZ_OK; bytes_written = 0; status = LZMA_STATUS_NEEDS_MORE_INPUT; } + if (res != SZ_OK) { SET_ERROR(res); goto exit; } + if (bytes_written > 0) { + if(!PyObject_CallFunction(write, "s#", outbuf, bytes_written)) goto exit; + total_written += bytes_written; + } + if (inbuf_len > inbuf_pos && !bytes_read && !bytes_written && status != LZMA_STATUS_NEEDS_MORE_INPUT && status != LZMA_STATUS_FINISHED_WITH_MARK) { + SET_ERROR(SZ_ERROR_DATA); goto exit; + } + if (bytes_read > 0) inbuf_pos += bytes_read; + if (status == LZMA_STATUS_NEEDS_MORE_INPUT) { + leftover = inbuf_len - inbuf_pos; + inbuf_pos = 0; + if (!PyObject_CallFunction(seek, "ii", -leftover, SEEK_CUR)) goto exit; + rres = PyObject_CallFunction(read, "n", bufsize); + if (rres == NULL) goto exit; + inbuf_len = PyBytes_GET_SIZE(rres); + if (inbuf_len == 0) { PyErr_SetString(PyExc_ValueError, "LZMA block was truncated"); goto exit; } + memcpy(inbuf, PyBytes_AS_STRING(rres), inbuf_len); + Py_DECREF(rres); rres = NULL; + } + } + leftover = inbuf_len - inbuf_pos; + if (leftover > 0) { + if (!PyObject_CallFunction(seek, "ii", -leftover, SEEK_CUR)) goto exit; + } + +exit: + LzmaDec_Free(&state, &allocator); + PyMem_Free(inbuf); PyMem_Free(outbuf); + if (PyErr_Occurred()) return NULL; + Py_RETURN_NONE; +} + static PyMethodDef lzma_binding_methods[] = { {"decompress2", decompress2, METH_VARARGS, "Decompress an LZMA2 encoded block, of unknown compressed size (reads till LZMA2 EOS marker)" }, + {"decompress", decompress, METH_VARARGS, + "Decompress an LZMA encoded block, of (un)known size (reads till LZMA EOS marker when size unknown)" + }, + {"crc64", crc64, METH_VARARGS, "crc64(bytes) -> CRC 64 for the provided python bytes object" }, diff --git a/src/lzma/xz.py b/src/lzma/xz.py index cf2c9ba773..0f24abfaf6 100644 --- a/src/lzma/xz.py +++ b/src/lzma/xz.py @@ -12,12 +12,8 @@ from hashlib import sha256 from struct import unpack, error as struct_error from binascii import crc32 as _crc32 -from calibre.constants import plugins from calibre.ptempfile import SpooledTemporaryFile - -lzma = plugins['lzma_binding'][0] -if not lzma: - raise RuntimeError('Failed to load lzma_binding module with error: %s' % plugins['lzma_binding'][1]) +from lzma.errors import NotXZ, InvalidXZ, lzma HEADER_MAGIC = b'\xfd7zXZ\0' DELTA_FILTER_ID = 0x03 @@ -26,15 +22,6 @@ LZMA2_FILTER_ID = 0x21 def crc32(raw, start=0): return 0xFFFFFFFF & _crc32(raw, start) -class XZError(ValueError): - pass - -class NotXZ(XZError): - pass - -class InvalidXZ(XZError): - pass - def decode_var_int(f): ans, i, ch = 0, -1, 0x80 while ch >= 0x80: