Add a decompressor for LZMA1

This commit is contained in:
Kovid Goyal 2015-08-06 17:37:49 +05:30
parent a85e8b31ba
commit 63d9d6bca5
4 changed files with 148 additions and 19 deletions

25
src/lzma/errors.py Normal file
View File

@ -0,0 +1,25 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.constants import plugins
# The plugin registry entry is indexed as a pair: element 0 is the loaded
# binding module (falsy when loading failed) and element 1 is the text used
# below to report why loading failed.
lzma = plugins['lzma_binding'][0]
if not lzma:
    raise RuntimeError(
        'Failed to load lzma_binding module with error: %s' % plugins['lzma_binding'][1]
    )

# Base class of every error in this module: the exception type exported by
# the compiled binding itself.
LzmaError = lzma.error
class NotXZ(LzmaError):
    """LzmaError subclass for input that is not in the XZ format.

    NOTE(review): meaning inferred from the name; the code that raises this
    exception is not part of this module.
    """
class InvalidXZ(LzmaError):
    """LzmaError subclass for XZ input that is malformed.

    NOTE(review): meaning inferred from the name; the code that raises this
    exception is not part of this module.
    """
class NotLzma(LzmaError):
    """LzmaError subclass raised when a stream's header is not valid LZMA.

    Raised with the message 'Not a LZMA file' by the LZMA1 header parser.
    """

38
src/lzma/lzma1.py Normal file
View File

@ -0,0 +1,38 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'
from io import BytesIO
from struct import unpack
from calibre.ptempfile import SpooledTemporaryFile
from lzma.errors import NotLzma, lzma
def read_header(f):
    """Read and validate the 13 byte LZMA header at the current position of f.

    The header is unpacked little-endian as a one byte properties field, a
    four byte dictionary size and an eight byte uncompressed size.

    :param f: Object with a ``read(n)`` method returning bytes.
    :return: Tuple ``(uncompressed_size, header_bytes)`` where ``header_bytes``
        are the 13 raw bytes that were read.
    :raises NotLzma: If the header cannot be unpacked (for example because
        fewer than 13 bytes were available) or the properties byte is larger
        than the biggest value the bound below allows.
    """
    header = f.read(13)
    # Upper bound for the properties byte: (4 * 5 + 4) * 9 + 8 == 224
    largest_props = (4 * 5 + 4) * 9 + 8
    try:
        fields = unpack(b'<BIQ', header)
    except Exception:
        raise NotLzma('Not a LZMA file')
    props, uncompressed_size = fields[0], fields[2]
    if props > largest_props:
        raise NotLzma('Not a LZMA file')
    return uncompressed_size, header
def decompress(raw, outfile=None, bufsize=10*1024*1024):
    """Decompress an LZMA1 stream and return the file object holding the output.

    :param raw: Either a bytes object containing the whole stream, or a file
        like object providing ``read()`` and ``seek()``, positioned at the
        start of the 13 byte LZMA header.
    :param outfile: File like object to write the decompressed data to. Only
        when this is ``None`` is a new ``SpooledTemporaryFile`` created; any
        object that is passed in is used as is, even if it evaluates as
        false (e.g. an empty container-like buffer).
    :param bufsize: Buffer size handed to the binding's ``decompress()``.
    :return: ``outfile`` (or the newly created temporary file).
    :raises NotLzma: If the header is not a valid LZMA header.
    """
    if isinstance(raw, bytes):
        raw = BytesIO(raw)
    uncompressed_size, header = read_header(raw)
    if outfile is None:
        outfile = SpooledTemporaryFile(50 * 1024 * 1024, '_lzma_decompress')
    lzma.decompress(raw.read, raw.seek, outfile.write, uncompressed_size, header, bufsize)
    # Discard anything written beyond the size declared in the header. When
    # the header's size field is all ones (the largest 64-bit value) this
    # comparison can never be true, so nothing is trimmed.
    if uncompressed_size < outfile.tell():
        outfile.seek(uncompressed_size)
        outfile.truncate()
    return outfile
if __name__ == '__main__':
    # Command line smoke test: decompress the file named by the last argument.
    import sys
    # Use a context manager so the input file is closed even if
    # decompression raises.
    with open(sys.argv[-1], 'rb') as src:
        decompress(src)

View File

@ -87,11 +87,11 @@ delta_decode(PyObject *self, PyObject *args) {
static PyObject *
decompress2(PyObject *self, PyObject *args) {
PyObject *read = NULL, *seek = NULL, *write = NULL, *rres = NULL;
unsigned long bufsize = 0, bytes_written = 0, bytes_read = 0, inbuf_pos = 0, inbuf_len = 0, leftover = 0;
SizeT bufsize = 0, bytes_written = 0, bytes_read = 0, inbuf_pos = 0, inbuf_len = 0, leftover = 0;
unsigned char props = 0;
char *inbuf = NULL, *outbuf = NULL;
CLzma2Dec state;
SRes res = 0;
SRes res = SZ_OK;
ELzmaStatus status = LZMA_STATUS_NOT_FINISHED;
if (!PyArg_ParseTuple(args, "OOOBk", &read, &seek, &write, &props, &bufsize)) return NULL;
@ -108,13 +108,18 @@ decompress2(PyObject *self, PyObject *args) {
while (status != LZMA_STATUS_FINISHED_WITH_MARK) {
bytes_written = bufsize; bytes_read = inbuf_len - inbuf_pos;
if (bytes_read) {
Py_BEGIN_ALLOW_THREADS;
res = Lzma2Dec_DecodeToBuf(&state, (Byte*)outbuf, &bytes_written, (Byte*)(inbuf) + inbuf_pos, &bytes_read, LZMA_FINISH_ANY, &status);
Py_END_ALLOW_THREADS;
} else { res = SZ_OK; bytes_written = 0; status = LZMA_STATUS_NEEDS_MORE_INPUT; }
if (res != SZ_OK) { SET_ERROR(res); goto exit; }
if (bytes_written > 0) {
if(!PyObject_CallFunction(write, "s#", outbuf, bytes_written)) goto exit;
}
if (inbuf_len > inbuf_pos && !bytes_read && !bytes_written && status != LZMA_STATUS_NEEDS_MORE_INPUT && status != LZMA_STATUS_FINISHED_WITH_MARK) {
SET_ERROR(SZ_ERROR_DATA); goto exit;
}
if (bytes_read > 0) inbuf_pos += bytes_read;
if (status == LZMA_STATUS_NEEDS_MORE_INPUT) {
leftover = inbuf_len - inbuf_pos;
@ -141,11 +146,85 @@ exit:
Py_RETURN_NONE;
}
/*
 * decompress(read, seek, write, decompressed_size, header, bufsize) -> None
 *
 * Decode a raw LZMA (LZMA1) stream.
 *
 *   read, seek, write  - Python callables. read(n) must return a bytes object
 *                        of at most n bytes, seek(offset, whence) repositions
 *                        the input and write(data) receives decoded output.
 *   decompressed_size  - size declared in the stream header; the all-ones
 *                        64-bit value means the size is unknown and the
 *                        stream is decoded up to its end marker.
 *   header             - the 13 byte stream header; its first LZMA_PROPS_SIZE
 *                        bytes are used to set up the decoder.
 *   bufsize            - size in bytes of both the input and output buffers.
 *
 * Returns None on success. On failure a Python exception is set and NULL is
 * returned. Any input that was read but not consumed by the decoder is given
 * back by seeking the input backwards.
 */
static PyObject*
decompress(PyObject *self, PyObject *args) {
    PyObject *read = NULL, *seek = NULL, *write = NULL, *rres = NULL, *chunk = NULL;
    UInt64 decompressed_size = 0, total_written = 0;
    int size_known = 0;
    Py_ssize_t header_size = 0;
    unsigned long bufsize_arg = 0;
    unsigned char *header = NULL, *inbuf = NULL, *outbuf = NULL;
    CLzmaDec state;
    SRes res = SZ_OK;
    SizeT bufsize = 0, bytes_written = 0, bytes_read = 0, inbuf_pos = 0, inbuf_len = 0, leftover = 0;
    ELzmaStatus status = LZMA_STATUS_NOT_FINISHED;
    ELzmaFinishMode finish_mode = LZMA_FINISH_ANY;

    /* The "k" format stores an unsigned long, which need not have the same
     * width as SizeT, so parse into a real unsigned long and convert. */
    if (!PyArg_ParseTuple(args, "OOOKs#k", &read, &seek, &write, &decompressed_size, &header, &header_size, &bufsize_arg)) return NULL;
    bufsize = (SizeT)bufsize_arg;
    size_known = (decompressed_size != (UInt64)(Int64)-1);
    if (header_size != 13) { PyErr_SetString(LZMAError, "Header must be exactly 13 bytes long"); return NULL; }
    if (!decompressed_size) { PyErr_SetString(LZMAError, "Cannot decompress empty file"); return NULL; }

    LzmaDec_Construct(&state);
    res = LzmaDec_Allocate(&state, header, LZMA_PROPS_SIZE, &allocator);
    if (res == SZ_ERROR_MEM) { PyErr_NoMemory(); return NULL; }
    if (res != SZ_OK) { PyErr_SetString(PyExc_TypeError, "Incorrect stream properties"); goto exit; }
    inbuf = (unsigned char*)PyMem_Malloc(bufsize);
    outbuf = (unsigned char*)PyMem_Malloc(bufsize);
    if (!inbuf || !outbuf) { PyErr_NoMemory(); goto exit; }

    LzmaDec_Init(&state);

    while (status != LZMA_STATUS_FINISHED_WITH_MARK) {
        bytes_written = bufsize; bytes_read = inbuf_len - inbuf_pos;
        if (bytes_read) {
            /* Only ask the decoder to finish once the declared size would be
             * reached within this output buffer. */
            finish_mode = LZMA_FINISH_ANY;
            if (size_known && total_written + (UInt64)bufsize > decompressed_size) finish_mode = LZMA_FINISH_END;
            Py_BEGIN_ALLOW_THREADS;
            res = LzmaDec_DecodeToBuf(&state, (Byte*)outbuf, &bytes_written, (Byte*)(inbuf) + inbuf_pos, &bytes_read, finish_mode, &status);
            Py_END_ALLOW_THREADS;
        } else { res = SZ_OK; bytes_written = 0; status = LZMA_STATUS_NEEDS_MORE_INPUT; }
        if (res != SZ_OK) { SET_ERROR(res); goto exit; }

        if (bytes_written > 0) {
            /* Build the bytes object explicitly so the length is passed with
             * a well defined type, and release the result of write() instead
             * of leaking one reference per chunk. */
            chunk = PyBytes_FromStringAndSize((const char*)outbuf, (Py_ssize_t)bytes_written);
            if (chunk == NULL) goto exit;
            rres = PyObject_CallFunctionObjArgs(write, chunk, NULL);
            Py_DECREF(chunk); chunk = NULL;
            if (rres == NULL) goto exit;
            Py_DECREF(rres); rres = NULL;
            total_written += bytes_written;
        }

        /* Input is available but the decoder made no progress at all. */
        if (inbuf_len > inbuf_pos && !bytes_read && !bytes_written && status != LZMA_STATUS_NEEDS_MORE_INPUT && status != LZMA_STATUS_FINISHED_WITH_MARK) {
            SET_ERROR(SZ_ERROR_DATA); goto exit;
        }
        if (bytes_read > 0) inbuf_pos += bytes_read;

        /* A stream with a declared size need not carry an end marker, so the
         * status never becomes FINISHED_WITH_MARK for it. Stop as soon as the
         * declared amount has been produced; whatever follows in the input
         * (an optional end marker or unrelated data) is left unread. */
        if (size_known && total_written >= decompressed_size) break;

        if (status == LZMA_STATUS_NEEDS_MORE_INPUT) {
            /* Give unconsumed bytes back to the input and refill the buffer. */
            leftover = inbuf_len - inbuf_pos;
            inbuf_pos = 0;
            inbuf_len = 0;
            /* leftover is unsigned: negate it as a signed value and pass it
             * with the matching "n" (Py_ssize_t) format. */
            rres = PyObject_CallFunction(seek, "ni", -(Py_ssize_t)leftover, SEEK_CUR);
            if (rres == NULL) goto exit;
            Py_DECREF(rres); rres = NULL;

            rres = PyObject_CallFunction(read, "n", (Py_ssize_t)bufsize);
            if (rres == NULL) goto exit;
            if (!PyBytes_Check(rres)) { PyErr_SetString(PyExc_TypeError, "read() must return a bytes object"); goto exit; }
            if (PyBytes_GET_SIZE(rres) == 0) { PyErr_SetString(PyExc_ValueError, "LZMA block was truncated"); goto exit; }
            /* Never copy more than the input buffer can hold. */
            if ((SizeT)PyBytes_GET_SIZE(rres) > bufsize) { PyErr_SetString(PyExc_ValueError, "read() returned more data than was requested"); goto exit; }
            inbuf_len = (SizeT)PyBytes_GET_SIZE(rres);
            memcpy(inbuf, PyBytes_AS_STRING(rres), inbuf_len);
            Py_DECREF(rres); rres = NULL;
        }
    }

    /* Return any bytes that were read but not consumed to the input. */
    leftover = inbuf_len - inbuf_pos;
    if (leftover > 0) {
        rres = PyObject_CallFunction(seek, "ni", -(Py_ssize_t)leftover, SEEK_CUR);
        if (rres == NULL) goto exit;
        Py_DECREF(rres); rres = NULL;
    }

exit:
    /* rres/chunk are non-NULL here only when an error path jumped out while
     * still holding a reference. */
    Py_XDECREF(rres);
    Py_XDECREF(chunk);
    LzmaDec_Free(&state, &allocator);
    PyMem_Free(inbuf); PyMem_Free(outbuf);
    if (PyErr_Occurred()) return NULL;
    Py_RETURN_NONE;
}
static PyMethodDef lzma_binding_methods[] = {
{"decompress2", decompress2, METH_VARARGS,
"Decompress an LZMA2 encoded block, of unknown compressed size (reads till LZMA2 EOS marker)"
},
{"decompress", decompress, METH_VARARGS,
"Decompress an LZMA encoded block, of (un)known size (reads till LZMA EOS marker when size unknown)"
},
{"crc64", crc64, METH_VARARGS,
"crc64(bytes) -> CRC 64 for the provided python bytes object"
},

View File

@ -12,12 +12,8 @@ from hashlib import sha256
from struct import unpack, error as struct_error
from binascii import crc32 as _crc32
from calibre.constants import plugins
from calibre.ptempfile import SpooledTemporaryFile
lzma = plugins['lzma_binding'][0]
if not lzma:
raise RuntimeError('Failed to load lzma_binding module with error: %s' % plugins['lzma_binding'][1])
from lzma.errors import NotXZ, InvalidXZ, lzma
HEADER_MAGIC = b'\xfd7zXZ\0'
DELTA_FILTER_ID = 0x03
@ -26,15 +22,6 @@ LZMA2_FILTER_ID = 0x21
def crc32(raw, start=0):
    """Return the CRC-32 of raw as an unsigned 32-bit integer.

    :param raw: Bytes to checksum.
    :param start: Running checksum to continue from (defaults to 0).
    """
    checksum = _crc32(raw, start)
    # Mask so the result is never negative, whatever the sign of the value
    # returned by the underlying implementation.
    return checksum & 0xFFFFFFFF
class XZError(ValueError):
    """Base class for the XZ errors defined in this module (a ValueError)."""
class NotXZ(XZError):
    """XZError subclass for input that is not in the XZ format.

    NOTE(review): meaning inferred from the name; the raise sites are not
    visible here.
    """
class InvalidXZ(XZError):
    """XZError subclass for XZ input that is malformed.

    NOTE(review): meaning inferred from the name; the raise sites are not
    visible here.
    """
def decode_var_int(f):
ans, i, ch = 0, -1, 0x80
while ch >= 0x80: