From 1e13a8924cd1161e6c37b9238204bdd1a39b5dac Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 28 Sep 2022 11:34:03 +0200 Subject: [PATCH 01/30] Add code from python-isal project --- Modules/zlibmodule.c | 374 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 374 insertions(+) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index 2fc39a3bd56201..32f61b7e021600 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -185,6 +185,7 @@ OutputBuffer_WindowOnError(_BlocksOutputBuffer *buffer, _Uint32Window *window) /* Initial buffer size. */ #define DEF_BUF_SIZE (16*1024) +#define DEF_MAX_INITIAL_BUF_SIZE (16 * 1024 * 1024) static PyModuleDef zlibmodule; @@ -1380,6 +1381,379 @@ static PyMemberDef Decomp_members[] = { {NULL}, }; + +typedef struct { + PyObject_HEAD + z_stream zst; + PyObject *zdict; + PyThread_type_lock lock; + PyObject *unused_data; + PyObject *zdict; + uint8_t *input_buffer; + Py_ssize_t input_buffer_size; + /* zst>avail_in is only 32 bit, so we store the true length + separately. Conversion and looping is encapsulated in + decompress_buf() */ + Py_ssize_t avail_in_real; + int is_initialised; + char eof; /* T_BOOL expects a char */ + char needs_input; +} ZlibDecompressor; + +static void +ZlibDecompressor_dealloc(ZlibDecompressor *self) +{ + PyMem_Free(self->input_buffer); + Py_CLEAR(self->unused_data); + Py_CLEAR(self->zdict); + Py_TYPE(self)->tp_free((PyObject *)self); +} + +static inline void +arrange_input_buffer(uint32_t *avail_in, Py_ssize_t *remains) +{ + *avail_in = (uint32_t)Py_MIN((size_t)*remains, UINT32_MAX); + *remains -= *avail_in; +} + +static Py_ssize_t +arrange_output_buffer_with_maximum(uint32_t *avail_out, + uint8_t **next_out, + PyObject **buffer, + Py_ssize_t length, + Py_ssize_t max_length) +{ + Py_ssize_t occupied; + + if (*buffer == NULL) { + if (!(*buffer = PyBytes_FromStringAndSize(NULL, length))) + return -1; + occupied = 0; + } + else { + occupied = *next_out - (uint8_t *)PyBytes_AS_STRING(*buffer); + + if (length == occupied) { + Py_ssize_t new_length; + assert(length <= max_length); + /* can not scale the buffer over max_length */ + if (length == max_length) + return -2; + if (length <= (max_length >> 1)) + new_length = length << 1; + else + new_length = max_length; + if (_PyBytes_Resize(buffer, new_length) < 0) + return -1; + length = new_length; + } + } + + *avail_out = (uint32_t)Py_MIN((size_t)(length - occupied), UINT32_MAX); + *next_out = (uint8_t *)PyBytes_AS_STRING(*buffer) + occupied; + + return length; +} + +static inline Py_ssize_t +arrange_output_buffer(uint32_t *avail_out, + uint8_t **next_out, + PyObject **buffer, + Py_ssize_t length) +{ + Py_ssize_t ret; + + ret = arrange_output_buffer_with_maximum(avail_out, next_out, buffer, + length, + PY_SSIZE_T_MAX); + if (ret == -2) + PyErr_NoMemory(); + return ret; +} + +/* Decompress data of length d->bzs_avail_in_real in d->state.next_in. The output + buffer is allocated dynamically and returned. At most max_length bytes are + returned, so some of the input may not be consumed. d->state.next_in and + d->bzs_avail_in_real are updated to reflect the consumed input. */ +static PyObject* +decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) +{ + /* data_size is strictly positive, but because we repeatedly have to + compare against max_length and PyBytes_GET_SIZE we declare it as + signed */ + PyObject *RetVal = NULL; + Py_ssize_t hard_limit; + Py_ssize_t obuflen; + + int err; + + /* In Python 3.10 sometimes sys.maxsize is passed by default. In those cases + we do want to use DEF_BUF_SIZE as start buffer. */ + if ((max_length < 0) || max_length == PY_SSIZE_T_MAX) { + hard_limit = PY_SSIZE_T_MAX; + obuflen = DEF_BUF_SIZE; + } else { + /* Assume that decompressor is used in file decompression with a fixed + block size of max_length. In that case we will reach max_length almost + always (except at the end of the file). So it makes sense to allocate + max_length. */ + hard_limit = max_length; + obuflen = max_length; + if (obuflen > DEF_MAX_INITIAL_BUF_SIZE){ + // Safeguard against memory overflow. + obuflen = DEF_MAX_INITIAL_BUF_SIZE; + } + } + ENTER_ZLIB(self); + + do { + arrange_input_buffer(&(self->zst.avail_in), &(self->avail_in_real)); + + do { + obuflen = arrange_output_buffer_with_maximum(&(self->zst.avail_out), + &(self->zst.next_out), + &RetVal, + obuflen, + hard_limit); + if (obuflen == -1){ + PyErr_SetString(PyExc_MemoryError, + "Unsufficient memory for buffer allocation"); + goto error; + } + else if (obuflen == -2) + break; + + Py_BEGIN_ALLOW_THREADS + err = inflate(&self->zst, Z_SYNC_FLUSH); + Py_END_ALLOW_THREADS + + switch (err) { + case Z_OK: /* fall through */ + case Z_BUF_ERROR: /* fall through */ + case Z_STREAM_END: + break; + default: + if (err == Z_NEED_DICT) { + goto error; + } + else { + break; + } + } + } while (self->zst.avail_out == 0); + } while(err != Z_STREAM_END && self->avail_in_real != 0); + + if (err == Z_STREAM_END) { + self->eof = 1; + } else if (err != Z_OK && err != Z_BUF_ERROR) { + zlib_error(state, self->zst, err, "while decompressing data"); + } + + self->avail_in_real += self->zst.avail_in; + + if (_PyBytes_Resize(&RetVal, self->zst.next_out - + (uint8_t *)PyBytes_AS_STRING(RetVal)) != 0) + goto error; + + goto success; +error: + RetVal = NULL; +success: + LEAVE_ZLIB(self); + Py_CLEAR(RetVal); + return RetVal; +} + + +static PyObject * +decompress(ZlibDecompressor *self, PyTypeObject *cls, uint8_t *data, + size_t len, Py_ssize_t max_length) +{ + char input_buffer_in_use; + PyObject *result; + + /* Prepend unconsumed input if necessary */ + if (self->zst.next_in != NULL) { + size_t avail_now, avail_total; + + /* Number of bytes we can append to input buffer */ + avail_now = (self->input_buffer + self->input_buffer_size) + - (self->zst.next_in + self->avail_in_real); + + /* Number of bytes we can append if we move existing + contents to beginning of buffer (overwriting + consumed input) */ + avail_total = self->input_buffer_size - self->avail_in_real; + + if (avail_total < len) { + size_t offset = self->zst.next_in - self->input_buffer; + uint8_t *tmp; + size_t new_size = self->input_buffer_size + len - avail_now; + + /* Assign to temporary variable first, so we don't + lose address of allocated buffer if realloc fails */ + tmp = PyMem_Realloc(self->input_buffer, new_size); + if (tmp == NULL) { + PyErr_SetNone(PyExc_MemoryError); + return NULL; + } + self->input_buffer = tmp; + self->input_buffer_size = new_size; + + self->zst.next_in = self->input_buffer + offset; + } + else if (avail_now < len) { + memmove(self->input_buffer, self->zst.next_in, + self->avail_in_real); + self->zst.next_in = self->input_buffer; + } + memcpy((void*)(self->zst.next_in + self->avail_in_real), data, len); + self->avail_in_real += len; + input_buffer_in_use = 1; + } + else { + self->zst.next_in = data; + self->avail_in_real = len; + input_buffer_in_use = 0; + } + + result = decompress_buf(self, max_length); + if(result == NULL) { + self->zst.next_in = NULL; + return NULL; + } + + if (self->eof) { + self->needs_input = 0; + Py_ssize_t bytes_in_bitbuffer = bitbuffer_size(&(self->zst)); + if (self->avail_in_real + bytes_in_bitbuffer > 0) { + PyObject * new_data = PyBytes_FromStringAndSize( + NULL, self->avail_in_real + bytes_in_bitbuffer); + if (new_data == NULL) + goto error; + char * new_data_ptr = PyBytes_AS_STRING(new_data); + bitbuffer_copy(&(self->zst), new_data_ptr, bytes_in_bitbuffer); + memcpy(new_data_ptr + bytes_in_bitbuffer, self->zst.next_in, + self->avail_in_real); + Py_XSETREF(self->unused_data, new_data); + } + } + else if (self->avail_in_real == 0) { + self->zst.next_in = NULL; + self->needs_input = 1; + } + else { + self->needs_input = 0; + + /* If we did not use the input buffer, we now have + to copy the tail from the caller's buffer into the + input buffer */ + if (!input_buffer_in_use) { + + /* Discard buffer if it's too small + (resizing it may needlessly copy the current contents) */ + if (self->input_buffer != NULL && + self->input_buffer_size < self->avail_in_real) { + PyMem_Free(self->input_buffer); + self->input_buffer = NULL; + } + + /* Allocate if necessary */ + if (self->input_buffer == NULL) { + self->input_buffer = PyMem_Malloc(self->avail_in_real); + if (self->input_buffer == NULL) { + PyErr_SetNone(PyExc_MemoryError); + goto error; + } + self->input_buffer_size = self->avail_in_real; + } + + /* Copy tail */ + memcpy(self->input_buffer, self->zst.next_in, self->avail_in_real); + self->zst.next_in = self->input_buffer; + } + } + return result; + +error: + Py_XDECREF(result); + return NULL; +} + +PyDoc_STRVAR(igzip_lib_IgzipDecompressor___init____doc__, +"IgzipDecompressor(flag=0, hist_bits=15, zdict=b\'\')\n" +"--\n" +"\n" +"Create a decompressor object for decompressing data incrementally.\n" +"\n" +" wbits\n" +" The lookback distance is 2 ^ hist_bits.\n" +" zdict\n" +" Dictionary used for decompressing the data\n" +"\n" +"For one-shot decompression, use the decompress() function instead."); + +static PyObject * +igzip_lib_IgzipDecompressor__new__(PyTypeObject *type, + PyObject *args, + PyObject *kwargs) +{ + static char *keywords[] = {"flag", "hist_bits", "zdict", NULL}; + static char *format = "|iiO:IgzipDecompressor"; + int flag = ISAL_DEFLATE; + int hist_bits = ISAL_DEF_MAX_HIST_BITS; + PyObject *zdict = NULL; + + if (!PyArg_ParseTupleAndKeywords( + args, kwargs, format, keywords, &flag, &hist_bits, &zdict)) { + return NULL; + } + IgzipDecompressor *self = PyObject_New(IgzipDecompressor, type); + int err; + self->eof = 0; + self->needs_input = 1; + self->avail_in_real = 0; + self->input_buffer = NULL; + self->input_buffer_size = 0; + self->zdict = zdict; + self->unused_data = PyBytes_FromStringAndSize(NULL, 0); + if (self->unused_data == NULL) { + Py_CLEAR(self); + return NULL; + } + isal_inflate_init(&(self->state)); + self->state.hist_bits = hist_bits; + self->state.crc_flag = flag; + if (self->zdict != NULL){ + Py_buffer zdict_buf; + if (PyObject_GetBuffer(self->zdict, &zdict_buf, PyBUF_SIMPLE) == -1) { + Py_CLEAR(self); + return NULL; + } + if ((size_t)zdict_buf.len > UINT32_MAX) { + PyErr_SetString(PyExc_OverflowError, + "zdict length does not fit in an unsigned 32-bits int"); + PyBuffer_Release(&zdict_buf); + Py_CLEAR(self); + return NULL; + } + err = isal_inflate_set_dict(&(self->state), zdict_buf.buf, + (uint32_t)zdict_buf.len); + PyBuffer_Release(&zdict_buf); + if (err != ISAL_DECOMP_OK) { + isal_inflate_error(err); + Py_CLEAR(self); + return NULL; + } + } + return (PyObject *)self; +} + +static PyMethodDef IgzipDecompressor_methods[] = { + IGZIP_LIB_IGZIPDECOMPRESSOR_DECOMPRESS_METHODDEF, + {NULL} +}; + /*[clinic input] zlib.adler32 From 6a5cdfd0c36ec5969fd6e60f5bfabf59bf0638a6 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 28 Sep 2022 12:36:07 +0200 Subject: [PATCH 02/30] Reorder code --- Modules/zlibmodule.c | 148 +++++++++++++++++++++++-------------------- 1 file changed, 79 insertions(+), 69 deletions(-) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index 32f61b7e021600..1155745c9b29f7 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -1351,36 +1351,6 @@ zlib_Decompress_flush_impl(compobject *self, PyTypeObject *cls, return RetVal; } -#include "clinic/zlibmodule.c.h" - -static PyMethodDef comp_methods[] = -{ - ZLIB_COMPRESS_COMPRESS_METHODDEF - ZLIB_COMPRESS_FLUSH_METHODDEF - ZLIB_COMPRESS_COPY_METHODDEF - ZLIB_COMPRESS___COPY___METHODDEF - ZLIB_COMPRESS___DEEPCOPY___METHODDEF - {NULL, NULL} -}; - -static PyMethodDef Decomp_methods[] = -{ - ZLIB_DECOMPRESS_DECOMPRESS_METHODDEF - ZLIB_DECOMPRESS_FLUSH_METHODDEF - ZLIB_DECOMPRESS_COPY_METHODDEF - ZLIB_DECOMPRESS___COPY___METHODDEF - ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF - {NULL, NULL} -}; - -#define COMP_OFF(x) offsetof(compobject, x) -static PyMemberDef Decomp_members[] = { - {"unused_data", T_OBJECT, COMP_OFF(unused_data), READONLY}, - {"unconsumed_tail", T_OBJECT, COMP_OFF(unconsumed_tail), READONLY}, - {"eof", T_BOOL, COMP_OFF(eof), READONLY}, - {NULL}, -}; - typedef struct { PyObject_HEAD @@ -1484,6 +1454,7 @@ decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) PyObject *RetVal = NULL; Py_ssize_t hard_limit; Py_ssize_t obuflen; + zlibstate *state = PyType_GetModuleState(Py_TYPE(self)); int err; @@ -1680,35 +1651,34 @@ decompress(ZlibDecompressor *self, PyTypeObject *cls, uint8_t *data, return NULL; } -PyDoc_STRVAR(igzip_lib_IgzipDecompressor___init____doc__, -"IgzipDecompressor(flag=0, hist_bits=15, zdict=b\'\')\n" +PyDoc_STRVAR(ZlibDecompressor__new____doc__, +"_ZlibDecompressor(wbits=15, zdict=b\'\')\n" "--\n" "\n" "Create a decompressor object for decompressing data incrementally.\n" "\n" -" wbits\n" -" The lookback distance is 2 ^ hist_bits.\n" +" wbits = 15\n" " zdict\n" -" Dictionary used for decompressing the data\n" -"\n" -"For one-shot decompression, use the decompress() function instead."); +" The predefined compression dictionary. This must be the same\n" +" dictionary as used by the compressor that produced the input data.\n" +"\n"); static PyObject * -igzip_lib_IgzipDecompressor__new__(PyTypeObject *type, - PyObject *args, - PyObject *kwargs) +ZlibDecompressor__new__(PyTypeObject *cls, + PyObject *args, + PyObject *kwargs) { - static char *keywords[] = {"flag", "hist_bits", "zdict", NULL}; - static char *format = "|iiO:IgzipDecompressor"; - int flag = ISAL_DEFLATE; - int hist_bits = ISAL_DEF_MAX_HIST_BITS; + static char *keywords[] = {"wbits", "zdict", NULL}; + static char *format = "|iO:IgzipDecompressor"; + int wbits = MAX_WBITS; PyObject *zdict = NULL; + zlibstate *state = PyType_GetModuleState(cls); if (!PyArg_ParseTupleAndKeywords( - args, kwargs, format, keywords, &flag, &hist_bits, &zdict)) { + args, kwargs, format, keywords, &wbits, &zdict)) { return NULL; } - IgzipDecompressor *self = PyObject_New(IgzipDecompressor, type); + ZlibDecompressor *self = PyObject_New(ZlibDecompressor, cls); int err; self->eof = 0; self->needs_input = 1; @@ -1716,41 +1686,81 @@ igzip_lib_IgzipDecompressor__new__(PyTypeObject *type, self->input_buffer = NULL; self->input_buffer_size = 0; self->zdict = zdict; + self->zst.opaque = NULL; + self->zst.zalloc = PyZlib_Malloc; + self->zst.zfree = PyZlib_Free; self->unused_data = PyBytes_FromStringAndSize(NULL, 0); if (self->unused_data == NULL) { Py_CLEAR(self); return NULL; } - isal_inflate_init(&(self->state)); - self->state.hist_bits = hist_bits; - self->state.crc_flag = flag; - if (self->zdict != NULL){ - Py_buffer zdict_buf; - if (PyObject_GetBuffer(self->zdict, &zdict_buf, PyBUF_SIMPLE) == -1) { - Py_CLEAR(self); + int err = inflateInit2(&(self->zst), wbits); + switch (err) { + case Z_OK: + self->is_initialised = 1; + if (self->zdict != NULL && wbits < 0) { +#ifdef AT_LEAST_ZLIB_1_2_2_1 + if (set_inflate_zdict(state, self) < 0) { + Py_DECREF(self); return NULL; - } - if ((size_t)zdict_buf.len > UINT32_MAX) { - PyErr_SetString(PyExc_OverflowError, - "zdict length does not fit in an unsigned 32-bits int"); - PyBuffer_Release(&zdict_buf); - Py_CLEAR(self); + } +#else + PyErr_Format(state->ZlibError, + "zlib version %s does not allow raw inflate with dictionary", + ZLIB_VERSION); + Py_DECREF(self); return NULL; +#endif } - err = isal_inflate_set_dict(&(self->state), zdict_buf.buf, - (uint32_t)zdict_buf.len); - PyBuffer_Release(&zdict_buf); - if (err != ISAL_DECOMP_OK) { - isal_inflate_error(err); - Py_CLEAR(self); - return NULL; - } + return (PyObject *)self; + case Z_STREAM_ERROR: + Py_DECREF(self); + PyErr_SetString(PyExc_ValueError, "Invalid initialization option"); + return NULL; + case Z_MEM_ERROR: + Py_DECREF(self); + PyErr_SetString(PyExc_MemoryError, + "Can't allocate memory for decompression object"); + return NULL; + default: + zlib_error(state, self->zst, err, "while creating decompression object"); + Py_DECREF(self); + return NULL; } - return (PyObject *)self; } +#include "clinic/zlibmodule.c.h" + +static PyMethodDef comp_methods[] = +{ + ZLIB_COMPRESS_COMPRESS_METHODDEF + ZLIB_COMPRESS_FLUSH_METHODDEF + ZLIB_COMPRESS_COPY_METHODDEF + ZLIB_COMPRESS___COPY___METHODDEF + ZLIB_COMPRESS___DEEPCOPY___METHODDEF + {NULL, NULL} +}; + +static PyMethodDef Decomp_methods[] = +{ + ZLIB_DECOMPRESS_DECOMPRESS_METHODDEF + ZLIB_DECOMPRESS_FLUSH_METHODDEF + ZLIB_DECOMPRESS_COPY_METHODDEF + ZLIB_DECOMPRESS___COPY___METHODDEF + ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF + {NULL, NULL} +}; + +#define COMP_OFF(x) offsetof(compobject, x) +static PyMemberDef Decomp_members[] = { + {"unused_data", T_OBJECT, COMP_OFF(unused_data), READONLY}, + {"unconsumed_tail", T_OBJECT, COMP_OFF(unconsumed_tail), READONLY}, + {"eof", T_BOOL, COMP_OFF(eof), READONLY}, + {NULL}, +}; + static PyMethodDef IgzipDecompressor_methods[] = { - IGZIP_LIB_IGZIPDECOMPRESSOR_DECOMPRESS_METHODDEF, + ZLIBCOMPRESSOR_DECOMPRESS_METHODDEF, {NULL} }; From 809ad5f6978eb0447edef2fa9aa23cbfa68bf0a6 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 28 Sep 2022 13:53:35 +0200 Subject: [PATCH 03/30] Add ZlibDecompressor --- Modules/clinic/zlibmodule.c.h | 101 ++++++++++++++++++++++++++- Modules/zlibmodule.c | 128 +++++++++++++++++++++++++++++----- 2 files changed, 212 insertions(+), 17 deletions(-) diff --git a/Modules/clinic/zlibmodule.c.h b/Modules/clinic/zlibmodule.c.h index a04b954a57f353..3a47d3bbe19f3b 100644 --- a/Modules/clinic/zlibmodule.c.h +++ b/Modules/clinic/zlibmodule.c.h @@ -897,6 +897,105 @@ zlib_Decompress_flush(compobject *self, PyTypeObject *cls, PyObject *const *args return return_value; } +PyDoc_STRVAR(zlib_ZlibDecompressor_decompress__doc__, +"decompress($self, /, data, max_length=-1)\n" +"--\n" +"\n" +"Decompress *data*, returning uncompressed data as bytes.\n" +"\n" +"If *max_length* is nonnegative, returns at most *max_length* bytes of\n" +"decompressed data. If this limit is reached and further output can be\n" +"produced, *self.needs_input* will be set to ``False``. In this case, the next\n" +"call to *decompress()* may provide *data* as b\'\' to obtain more of the output.\n" +"\n" +"If all of the input data was decompressed and returned (either because this\n" +"was less than *max_length* bytes, or because *max_length* was negative),\n" +"*self.needs_input* will be set to True.\n" +"\n" +"Attempting to decompress data after the end of stream is reached raises an\n" +"EOFError. Any data found after the end of the stream is ignored and saved in\n" +"the unused_data attribute."); + +#define ZLIB_ZLIBDECOMPRESSOR_DECOMPRESS_METHODDEF \ + {"decompress", _PyCFunction_CAST(zlib_ZlibDecompressor_decompress), METH_METHOD|METH_FASTCALL|METH_KEYWORDS, zlib_ZlibDecompressor_decompress__doc__}, + +static PyObject * +zlib_ZlibDecompressor_decompress_impl(ZlibDecompressor *self, + PyTypeObject *cls, Py_buffer *data, + Py_ssize_t max_length); + +static PyObject * +zlib_ZlibDecompressor_decompress(ZlibDecompressor *self, PyTypeObject *cls, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 3 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_item = { &_Py_ID(data), &_Py_ID(max_length), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"data", "max_length", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "decompress", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1; + Py_buffer data = {NULL, NULL}; + Py_ssize_t max_length = -1; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 2, 0, argsbuf); + if (!args) { + goto exit; + } + if (PyObject_GetBuffer(args[0], &data, PyBUF_SIMPLE) != 0) { + goto exit; + } + if (!PyBuffer_IsContiguous(&data, 'C')) { + _PyArg_BadArgument("decompress", "argument 'data'", "contiguous buffer", args[0]); + goto exit; + } + if (!noptargs) { + goto skip_optional_pos; + } + { + Py_ssize_t ival = -1; + PyObject *iobj = _PyNumber_Index(args[1]); + if (iobj != NULL) { + ival = PyLong_AsSsize_t(iobj); + Py_DECREF(iobj); + } + if (ival == -1 && PyErr_Occurred()) { + goto exit; + } + max_length = ival; + } +skip_optional_pos: + return_value = zlib_ZlibDecompressor_decompress_impl(self, cls, &data, max_length); + +exit: + /* Cleanup for data */ + if (data.obj) { + PyBuffer_Release(&data); + } + + return return_value; +} + PyDoc_STRVAR(zlib_adler32__doc__, "adler32($module, data, value=1, /)\n" "--\n" @@ -1031,4 +1130,4 @@ zlib_crc32(PyObject *module, PyObject *const *args, Py_ssize_t nargs) #ifndef ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF #define ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF #endif /* !defined(ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF) */ -/*[clinic end generated code: output=9e5f9911d0c273e1 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=3dd7ac714bdbde08 input=a9049054013a1b77]*/ diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index 1155745c9b29f7..3bbb49ac4e28e6 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -1370,13 +1370,27 @@ typedef struct { char needs_input; } ZlibDecompressor; +/*[clinic input] +module zlib +class zlib.Compress "compobject *" "&Comptype" +class zlib.ZlibDecompressor "ZlibDecompressor *" "&ZlibDecompressorType" +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=fc826e280aec6432]*/ + static void ZlibDecompressor_dealloc(ZlibDecompressor *self) { + PyObject *type = (PyObject *)Py_TYPE(self); + PyThread_free_lock(self->lock); + if (self->is_initialised) { + inflateEnd(&self->zst); + } + Dealloc(self); PyMem_Free(self->input_buffer); Py_CLEAR(self->unused_data); Py_CLEAR(self->zdict); - Py_TYPE(self)->tp_free((PyObject *)self); + PyObject_Free(self); + Py_DECREF(type); } static inline void @@ -1475,7 +1489,6 @@ decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) obuflen = DEF_MAX_INITIAL_BUF_SIZE; } } - ENTER_ZLIB(self); do { arrange_input_buffer(&(self->zst.avail_in), &(self->avail_in_real)); @@ -1516,6 +1529,14 @@ decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) if (err == Z_STREAM_END) { self->eof = 1; + self->is_initialised = 0; + /* Unlike the Decompress object we end here, since there is no + backwards-compatibility issues */ + err = inflateEnd(&self->zst); + if (err != Z_OK) { + zlib_error(state, self->zst, err, "while finishing decompression"); + goto error; + } } else if (err != Z_OK && err != Z_BUF_ERROR) { zlib_error(state, self->zst, err, "while decompressing data"); } @@ -1530,7 +1551,6 @@ decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) error: RetVal = NULL; success: - LEAVE_ZLIB(self); Py_CLEAR(RetVal); return RetVal; } @@ -1596,17 +1616,14 @@ decompress(ZlibDecompressor *self, PyTypeObject *cls, uint8_t *data, if (self->eof) { self->needs_input = 0; - Py_ssize_t bytes_in_bitbuffer = bitbuffer_size(&(self->zst)); - if (self->avail_in_real + bytes_in_bitbuffer > 0) { - PyObject * new_data = PyBytes_FromStringAndSize( - NULL, self->avail_in_real + bytes_in_bitbuffer); - if (new_data == NULL) + + if (self->avail_in_real > 0) { + PyObject *unused_data = PyBytes_FromStringAndSize( + self->zst.next_in, self->avail_in_real); + if (unused_data == NULL) { goto error; - char * new_data_ptr = PyBytes_AS_STRING(new_data); - bitbuffer_copy(&(self->zst), new_data_ptr, bytes_in_bitbuffer); - memcpy(new_data_ptr + bytes_in_bitbuffer, self->zst.next_in, - self->avail_in_real); - Py_XSETREF(self->unused_data, new_data); + } + Py_XSETREF(self->unused_data, unused_data); } } else if (self->avail_in_real == 0) { @@ -1651,6 +1668,47 @@ decompress(ZlibDecompressor *self, PyTypeObject *cls, uint8_t *data, return NULL; } +/*[clinic input] +zlib.ZlibDecompressor.decompress + + cls: defining_class + data: Py_buffer + max_length: Py_ssize_t=-1 + +Decompress *data*, returning uncompressed data as bytes. + +If *max_length* is nonnegative, returns at most *max_length* bytes of +decompressed data. If this limit is reached and further output can be +produced, *self.needs_input* will be set to ``False``. In this case, the next +call to *decompress()* may provide *data* as b'' to obtain more of the output. + +If all of the input data was decompressed and returned (either because this +was less than *max_length* bytes, or because *max_length* was negative), +*self.needs_input* will be set to True. + +Attempting to decompress data after the end of stream is reached raises an +EOFError. Any data found after the end of the stream is ignored and saved in +the unused_data attribute. +[clinic start generated code]*/ + +static PyObject * +zlib_ZlibDecompressor_decompress_impl(ZlibDecompressor *self, + PyTypeObject *cls, Py_buffer *data, + Py_ssize_t max_length) +/*[clinic end generated code: output=62a74845fde185c1 input=e16759033492a273]*/ + +{ + PyObject *result = NULL; + + ENTER_ZLIB(self); + if (self->eof) + PyErr_SetString(PyExc_EOFError, "End of stream already reached"); + else + result = decompress(self, cls, data->buf, data->len, max_length); + LEAVE_ZLIB(self); + return result; +} + PyDoc_STRVAR(ZlibDecompressor__new____doc__, "_ZlibDecompressor(wbits=15, zdict=b\'\')\n" "--\n" @@ -1751,6 +1809,11 @@ static PyMethodDef Decomp_methods[] = {NULL, NULL} }; +static PyMethodDef ZlibDecompressor_methods[] = { + ZLIB_ZLIBDECOMPRESSOR_DECOMPRESS_METHODDEF + {NULL} +}; + #define COMP_OFF(x) offsetof(compobject, x) static PyMemberDef Decomp_members[] = { {"unused_data", T_OBJECT, COMP_OFF(unused_data), READONLY}, @@ -1759,11 +1822,25 @@ static PyMemberDef Decomp_members[] = { {NULL}, }; -static PyMethodDef IgzipDecompressor_methods[] = { - ZLIBCOMPRESSOR_DECOMPRESS_METHODDEF, - {NULL} +PyDoc_STRVAR(ZlibDecompressor_eof__doc__, +"True if the end-of-stream marker has been reached."); + +PyDoc_STRVAR(ZlibDecompressor_unused_data__doc__, +"Data found after the end of the compressed stream."); + +PyDoc_STRVAR(ZlibDecompressor_needs_input_doc, +"True if more input is needed before more decompressed data can be produced."); + +static PyMemberDef ZlibDecompressor_members[] = { + {"eof", T_BOOL, offsetof(ZlibDecompressor, eof), + READONLY, ZlibDecompressor_eof__doc__}, + {"unused_data", T_OBJECT_EX, offsetof(ZlibDecompressor, unused_data), + READONLY, ZlibDecompressor_unused_data__doc__}, + {"needs_input", T_BOOL, offsetof(ZlibDecompressor, needs_input), READONLY, + ZlibDecompressor_needs_input_doc}, }; + /*[clinic input] zlib.adler32 @@ -1881,6 +1958,25 @@ static PyType_Spec Decomptype_spec = { .slots = Decomptype_slots, }; +static PyType_Slot ZlibDecompressor_type_slots[] = { + {Py_tp_dealloc, ZlibDecompressor_dealloc}, + {Py_tp_members, ZlibDecompressor_members}, + {Py_tp_new, ZlibDecompressor__new__}, + {Py_tp_doc, ZlibDecompressor__new____doc__}, + {Py_tp_methods, ZlibDecompressor_methods}, + {0, 0}, +}; + +static PyType_Spec ZlibDecompressor_type_spec = { + .name = "zlib.ZlibDecompressor", + .basicsize = sizeof(ZlibDecompressor), + // Calling PyType_GetModuleState() on a subclass is not safe. + // ZlibDecompressor_type_spec does not have Py_TPFLAGS_BASETYPE flag + // which prevents to create a subclass. + // So calling PyType_GetModuleState() in this file is always safe. + .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE), + .slots = ZlibDecompressor_type_slots, +}; PyDoc_STRVAR(zlib_module_documentation, "The functions in this module allow compression and decompression using the\n" "zlib library, which is based on GNU zip.\n" From 03254b88271adf26e9441e188a9a549f7dcac69c Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 28 Sep 2022 13:59:39 +0200 Subject: [PATCH 04/30] Add zlibdecompressor object --- Modules/zlibmodule.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index 3bbb49ac4e28e6..3d2115367f4f86 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -192,6 +192,7 @@ static PyModuleDef zlibmodule; typedef struct { PyTypeObject *Comptype; PyTypeObject *Decomptype; + PyTypeObject *ZlibDecompressorType; PyObject *ZlibError; } zlibstate; @@ -1998,6 +1999,7 @@ zlib_clear(PyObject *mod) zlibstate *state = get_zlib_state(mod); Py_CLEAR(state->Comptype); Py_CLEAR(state->Decomptype); + Py_CLEAR(state->ZlibDecompressorType); Py_CLEAR(state->ZlibError); return 0; } @@ -2008,6 +2010,7 @@ zlib_traverse(PyObject *mod, visitproc visit, void *arg) zlibstate *state = get_zlib_state(mod); Py_VISIT(state->Comptype); Py_VISIT(state->Decomptype); + Py_VISIT(state->ZlibDecompressorType); Py_VISIT(state->ZlibError); return 0; } @@ -2035,6 +2038,12 @@ zlib_exec(PyObject *mod) return -1; } + state->ZlibDecompressorType = (PyTypeObject *)PyType_FromModuleAndSpec( + mod, &ZlibDecompressor_type_spec, NULL); + if (state->ZlibDecompressorType == NULL) { + return -1; + } + state->ZlibError = PyErr_NewException("zlib.error", NULL, NULL); if (state->ZlibError == NULL) { return -1; @@ -2045,6 +2054,12 @@ zlib_exec(PyObject *mod) Py_DECREF(state->ZlibError); return -1; } + Py_INCREF(state->ZlibDecompressorType); + if (PyModule_AddObject(mod, "_ZlibDecompressor", + state->ZlibDecompressorType) < 0) { + Py_DECREF(state->ZlibDecompressorType); + return -1; + } #define ZLIB_ADD_INT_MACRO(c) \ do { \ From 669848a6b0fe6fe52254f2001c45dd88f2b84a0f Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 28 Sep 2022 14:26:47 +0200 Subject: [PATCH 05/30] Fix compile warnings --- Modules/zlibmodule.c | 46 ++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index 3d2115367f4f86..b0f5d23b428f15 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -1359,7 +1359,6 @@ typedef struct { PyObject *zdict; PyThread_type_lock lock; PyObject *unused_data; - PyObject *zdict; uint8_t *input_buffer; Py_ssize_t input_buffer_size; /* zst>avail_in is only 32 bit, so we store the true length @@ -1372,11 +1371,9 @@ typedef struct { } ZlibDecompressor; /*[clinic input] -module zlib -class zlib.Compress "compobject *" "&Comptype" class zlib.ZlibDecompressor "ZlibDecompressor *" "&ZlibDecompressorType" [clinic start generated code]*/ -/*[clinic end generated code: output=da39a3ee5e6b4b0d input=fc826e280aec6432]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=0658178ab94645df]*/ static void ZlibDecompressor_dealloc(ZlibDecompressor *self) @@ -1386,7 +1383,6 @@ ZlibDecompressor_dealloc(ZlibDecompressor *self) if (self->is_initialised) { inflateEnd(&self->zst); } - Dealloc(self); PyMem_Free(self->input_buffer); Py_CLEAR(self->unused_data); Py_CLEAR(self->zdict); @@ -1394,11 +1390,28 @@ ZlibDecompressor_dealloc(ZlibDecompressor *self) Py_DECREF(type); } -static inline void -arrange_input_buffer(uint32_t *avail_in, Py_ssize_t *remains) +static int +set_inflate_zdict_ZlibDecompressor(zlibstate *state, ZlibDecompressor *self) { - *avail_in = (uint32_t)Py_MIN((size_t)*remains, UINT32_MAX); - *remains -= *avail_in; + Py_buffer zdict_buf; + if (PyObject_GetBuffer(self->zdict, &zdict_buf, PyBUF_SIMPLE) == -1) { + return -1; + } + if ((size_t)zdict_buf.len > UINT_MAX) { + PyErr_SetString(PyExc_OverflowError, + "zdict length does not fit in an unsigned int"); + PyBuffer_Release(&zdict_buf); + return -1; + } + int err; + err = inflateSetDictionary(&self->zst, + zdict_buf.buf, (unsigned int)zdict_buf.len); + PyBuffer_Release(&zdict_buf); + if (err != Z_OK) { + zlib_error(state, self->zst, err, "while setting zdict"); + return -1; + } + return 0; } static Py_ssize_t @@ -1470,8 +1483,8 @@ decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) Py_ssize_t hard_limit; Py_ssize_t obuflen; zlibstate *state = PyType_GetModuleState(Py_TYPE(self)); - - int err; + + int err = Z_OK; /* In Python 3.10 sometimes sys.maxsize is passed by default. In those cases we do want to use DEF_BUF_SIZE as start buffer. */ @@ -1492,7 +1505,7 @@ decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) } do { - arrange_input_buffer(&(self->zst.avail_in), &(self->avail_in_real)); + arrange_input_buffer(&(self->zst), &(self->avail_in_real)); do { obuflen = arrange_output_buffer_with_maximum(&(self->zst.avail_out), @@ -1620,7 +1633,7 @@ decompress(ZlibDecompressor *self, PyTypeObject *cls, uint8_t *data, if (self->avail_in_real > 0) { PyObject *unused_data = PyBytes_FromStringAndSize( - self->zst.next_in, self->avail_in_real); + (char *)self->zst.next_in, self->avail_in_real); if (unused_data == NULL) { goto error; } @@ -1738,7 +1751,6 @@ ZlibDecompressor__new__(PyTypeObject *cls, return NULL; } ZlibDecompressor *self = PyObject_New(ZlibDecompressor, cls); - int err; self->eof = 0; self->needs_input = 1; self->avail_in_real = 0; @@ -1759,7 +1771,7 @@ ZlibDecompressor__new__(PyTypeObject *cls, self->is_initialised = 1; if (self->zdict != NULL && wbits < 0) { #ifdef AT_LEAST_ZLIB_1_2_2_1 - if (set_inflate_zdict(state, self) < 0) { + if (set_inflate_zdict_ZlibDecompressor(state, self) < 0) { Py_DECREF(self); return NULL; } @@ -1963,7 +1975,7 @@ static PyType_Slot ZlibDecompressor_type_slots[] = { {Py_tp_dealloc, ZlibDecompressor_dealloc}, {Py_tp_members, ZlibDecompressor_members}, {Py_tp_new, ZlibDecompressor__new__}, - {Py_tp_doc, ZlibDecompressor__new____doc__}, + {Py_tp_doc, (char *)ZlibDecompressor__new____doc__}, {Py_tp_methods, ZlibDecompressor_methods}, {0, 0}, }; @@ -2056,7 +2068,7 @@ zlib_exec(PyObject *mod) } Py_INCREF(state->ZlibDecompressorType); if (PyModule_AddObject(mod, "_ZlibDecompressor", - state->ZlibDecompressorType) < 0) { + (PyObject *)state->ZlibDecompressorType) < 0) { Py_DECREF(state->ZlibDecompressorType); return -1; } From 69ff613e6036fe43e0882992da66179b0f7ca21e Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 28 Sep 2022 14:34:51 +0200 Subject: [PATCH 06/30] Do not use class input --- Modules/clinic/zlibmodule.c.h | 13 ++++++------- Modules/zlibmodule.c | 10 ++++------ 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/Modules/clinic/zlibmodule.c.h b/Modules/clinic/zlibmodule.c.h index 3a47d3bbe19f3b..65412b2435ade1 100644 --- a/Modules/clinic/zlibmodule.c.h +++ b/Modules/clinic/zlibmodule.c.h @@ -917,20 +917,19 @@ PyDoc_STRVAR(zlib_ZlibDecompressor_decompress__doc__, "the unused_data attribute."); #define ZLIB_ZLIBDECOMPRESSOR_DECOMPRESS_METHODDEF \ - {"decompress", _PyCFunction_CAST(zlib_ZlibDecompressor_decompress), METH_METHOD|METH_FASTCALL|METH_KEYWORDS, zlib_ZlibDecompressor_decompress__doc__}, + {"decompress", _PyCFunction_CAST(zlib_ZlibDecompressor_decompress), METH_FASTCALL|METH_KEYWORDS, zlib_ZlibDecompressor_decompress__doc__}, static PyObject * zlib_ZlibDecompressor_decompress_impl(ZlibDecompressor *self, - PyTypeObject *cls, Py_buffer *data, - Py_ssize_t max_length); + Py_buffer *data, Py_ssize_t max_length); static PyObject * -zlib_ZlibDecompressor_decompress(ZlibDecompressor *self, PyTypeObject *cls, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +zlib_ZlibDecompressor_decompress(ZlibDecompressor *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) - #define NUM_KEYWORDS 3 + #define NUM_KEYWORDS 2 static struct { PyGC_Head _this_is_not_used; PyObject_VAR_HEAD @@ -985,7 +984,7 @@ zlib_ZlibDecompressor_decompress(ZlibDecompressor *self, PyTypeObject *cls, PyOb max_length = ival; } skip_optional_pos: - return_value = zlib_ZlibDecompressor_decompress_impl(self, cls, &data, max_length); + return_value = zlib_ZlibDecompressor_decompress_impl(self, &data, max_length); exit: /* Cleanup for data */ @@ -1130,4 +1129,4 @@ zlib_crc32(PyObject *module, PyObject *const *args, Py_ssize_t nargs) #ifndef ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF #define ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF #endif /* !defined(ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF) */ -/*[clinic end generated code: output=3dd7ac714bdbde08 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=57ff7b511ab23132 input=a9049054013a1b77]*/ diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index b0f5d23b428f15..843cebf02bbe5e 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -1571,7 +1571,7 @@ decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) static PyObject * -decompress(ZlibDecompressor *self, PyTypeObject *cls, uint8_t *data, +decompress(ZlibDecompressor *self, uint8_t *data, size_t len, Py_ssize_t max_length) { char input_buffer_in_use; @@ -1685,7 +1685,6 @@ decompress(ZlibDecompressor *self, PyTypeObject *cls, uint8_t *data, /*[clinic input] zlib.ZlibDecompressor.decompress - cls: defining_class data: Py_buffer max_length: Py_ssize_t=-1 @@ -1707,9 +1706,8 @@ the unused_data attribute. static PyObject * zlib_ZlibDecompressor_decompress_impl(ZlibDecompressor *self, - PyTypeObject *cls, Py_buffer *data, - Py_ssize_t max_length) -/*[clinic end generated code: output=62a74845fde185c1 input=e16759033492a273]*/ + Py_buffer *data, Py_ssize_t max_length) +/*[clinic end generated code: output=990d32787b775f85 input=0b29d99715250b96]*/ { PyObject *result = NULL; @@ -1718,7 +1716,7 @@ zlib_ZlibDecompressor_decompress_impl(ZlibDecompressor *self, if (self->eof) PyErr_SetString(PyExc_EOFError, "End of stream already reached"); else - result = decompress(self, cls, data->buf, data->len, max_length); + result = decompress(self, data->buf, data->len, max_length); LEAVE_ZLIB(self); return result; } From 6fa43ae31f144fa8564275cca7a7042abe8a2ef7 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 28 Sep 2022 14:41:16 +0200 Subject: [PATCH 07/30] Fix lock stuff --- Modules/zlibmodule.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index 843cebf02bbe5e..b4ed6666580304 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -1763,6 +1763,12 @@ ZlibDecompressor__new__(PyTypeObject *cls, Py_CLEAR(self); return NULL; } + self->lock = PyThread_allocate_lock(); + if (self->lock == NULL) { + Py_DECREF(self); + PyErr_SetString(PyExc_MemoryError, "Unable to allocate lock"); + return NULL; + } int err = inflateInit2(&(self->zst), wbits); switch (err) { case Z_OK: From cdc597274b66ee123d8a5d3e9a5e39b04cb43200 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 28 Sep 2022 15:57:42 +0200 Subject: [PATCH 08/30] Fix incorrect error handling --- Modules/zlibmodule.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index b4ed6666580304..6ea17a0fe054a2 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -1524,7 +1524,6 @@ decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) Py_BEGIN_ALLOW_THREADS err = inflate(&self->zst, Z_SYNC_FLUSH); Py_END_ALLOW_THREADS - switch (err) { case Z_OK: /* fall through */ case Z_BUF_ERROR: /* fall through */ @@ -1558,14 +1557,14 @@ decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) self->avail_in_real += self->zst.avail_in; if (_PyBytes_Resize(&RetVal, self->zst.next_out - - (uint8_t *)PyBytes_AS_STRING(RetVal)) != 0) + (uint8_t *)PyBytes_AS_STRING(RetVal)) != 0) { goto error; + } goto success; error: - RetVal = NULL; -success: Py_CLEAR(RetVal); +success: return RetVal; } From 7820627adb1da292a4a0696ae69ab97e1c5b4832 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 28 Sep 2022 16:10:12 +0200 Subject: [PATCH 09/30] Rework _GzipReader to be more efficient --- Lib/gzip.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/Lib/gzip.py b/Lib/gzip.py index 8edcda4493c894..0492be30b11e10 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -21,6 +21,8 @@ _COMPRESS_LEVEL_TRADEOFF = 6 _COMPRESS_LEVEL_BEST = 9 +READ_BUFFER_SIZE = 128 * 1024 + def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST, encoding=None, errors=None, newline=None): @@ -446,7 +448,7 @@ def _read_gzip_header(fp): class _GzipReader(_compression.DecompressReader): def __init__(self, fp): - super().__init__(_PaddedFile(fp), zlib.decompressobj, + super().__init__(_PaddedFile(fp), zlib._ZlibDecompressor, wbits=-zlib.MAX_WBITS) # Set flag indicating start of a new member self._new_member = True @@ -494,12 +496,14 @@ def read(self, size=-1): self._new_member = False # Read a chunk of data from the file - buf = self._fp.read(io.DEFAULT_BUFFER_SIZE) + # Read a chunk of data from the file + if self._decompressor.needs_input: + buf = self._fp.read(READ_BUFFER_SIZE) + uncompress = self._decompressor.decompress(buf, size) + else: + uncompress = self._decompressor.decompress(b"", size) - uncompress = self._decompressor.decompress(buf, size) - if self._decompressor.unconsumed_tail != b"": - self._fp.prepend(self._decompressor.unconsumed_tail) - elif self._decompressor.unused_data != b"": + if self._decompressor.unused_data != b"": # Prepend the already read bytes to the fileobj so they can # be seen by _read_eof() and _read_gzip_header() self._fp.prepend(self._decompressor.unused_data) @@ -510,14 +514,11 @@ def read(self, size=-1): raise EOFError("Compressed file ended before the " "end-of-stream marker was reached") - self._add_read_data( uncompress ) + self._crc = zlib.crc32(uncompress, self._crc) + self._stream_size = self._stream_size + len(uncompress) self._pos += len(uncompress) return uncompress - def _add_read_data(self, data): - self._crc = zlib.crc32(data, self._crc) - self._stream_size = self._stream_size + len(data) - def _read_eof(self): # We've read to the end of the file # We check that the computed CRC and size of the From 6f8b64aff5bac693155b8c02092e03654b611e69 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 30 Sep 2022 08:47:20 +0200 Subject: [PATCH 10/30] Properly initialize zstate --- Modules/zlibmodule.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index 6ea17a0fe054a2..df7852f8ad563a 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -1738,7 +1738,7 @@ ZlibDecompressor__new__(PyTypeObject *cls, PyObject *kwargs) { static char *keywords[] = {"wbits", "zdict", NULL}; - static char *format = "|iO:IgzipDecompressor"; + static char *format = "|iO:ZlibDecompressor"; int wbits = MAX_WBITS; PyObject *zdict = NULL; zlibstate *state = PyType_GetModuleState(cls); @@ -1753,10 +1753,15 @@ ZlibDecompressor__new__(PyTypeObject *cls, self->avail_in_real = 0; self->input_buffer = NULL; self->input_buffer_size = 0; + if (zdict != NULL) { + Py_INCREF(zdict); + } self->zdict = zdict; self->zst.opaque = NULL; self->zst.zalloc = PyZlib_Malloc; self->zst.zfree = PyZlib_Free; + self->zst.next_in = NULL; + self->zst.avail_in = 0; self->unused_data = PyBytes_FromStringAndSize(NULL, 0); if (self->unused_data == NULL) { Py_CLEAR(self); From 3e2a4f5be2eff8593fb1d1ef014a658c5af86a33 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 30 Sep 2022 09:22:45 +0200 Subject: [PATCH 11/30] Add blurb for increased gzip read speed --- .../next/Library/2022-09-30-09-22-37.gh-issue-95534.ndEfPj.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2022-09-30-09-22-37.gh-issue-95534.ndEfPj.rst diff --git a/Misc/NEWS.d/next/Library/2022-09-30-09-22-37.gh-issue-95534.ndEfPj.rst b/Misc/NEWS.d/next/Library/2022-09-30-09-22-37.gh-issue-95534.ndEfPj.rst new file mode 100644 index 00000000000000..b99f94c07a0611 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-09-30-09-22-37.gh-issue-95534.ndEfPj.rst @@ -0,0 +1 @@ +`gzip.GzipFile` reads files 10% faster. From 070df1cfeb8d4d7f6be59e31342929baf566936a Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 30 Sep 2022 09:42:46 +0200 Subject: [PATCH 12/30] Make sure self->initialised is set to 0. Reword some comments. --- Modules/zlibmodule.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index df7852f8ad563a..bad0f7af39e9de 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -1469,10 +1469,12 @@ arrange_output_buffer(uint32_t *avail_out, return ret; } -/* Decompress data of length d->bzs_avail_in_real in d->state.next_in. The output - buffer is allocated dynamically and returned. At most max_length bytes are - returned, so some of the input may not be consumed. d->state.next_in and - d->bzs_avail_in_real are updated to reflect the consumed input. */ +/* Decompress data of length self->avail_in_real in self->state.next_in. The + output buffer is allocated dynamically and returned. If the max_length is + of sufficiently low size, max_length is allocated immediately. At most + max_length bytes are returned, so some of the input may not be consumed. + self->state.next_in and self->avail_in_real are updated to reflect the + consumed input. */ static PyObject* decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) { @@ -1486,8 +1488,9 @@ decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) int err = Z_OK; - /* In Python 3.10 sometimes sys.maxsize is passed by default. In those cases - we do want to use DEF_BUF_SIZE as start buffer. */ + /* When sys.maxsize is passed as default use DEF_BUF_SIZE as start buffer. + In this particular case the data may not necessarily be very big, so + it is better to grow dynamically.*/ if ((max_length < 0) || max_length == PY_SSIZE_T_MAX) { hard_limit = PY_SSIZE_T_MAX; obuflen = DEF_BUF_SIZE; @@ -1543,13 +1546,14 @@ decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) if (err == Z_STREAM_END) { self->eof = 1; self->is_initialised = 0; - /* Unlike the Decompress object we end here, since there is no - backwards-compatibility issues */ + /* Unlike the Decompress object we call inflateEnd here as there are no + backwards compatibility issues */ err = inflateEnd(&self->zst); if (err != Z_OK) { zlib_error(state, self->zst, err, "while finishing decompression"); goto error; } + self->is_initialised = 0; } else if (err != Z_OK && err != Z_BUF_ERROR) { zlib_error(state, self->zst, err, "while decompressing data"); } From 70b7d4d454b287bd70b2a8e4e185a6e3dde11b88 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 30 Sep 2022 09:54:35 +0200 Subject: [PATCH 13/30] Add appropriate doctype in blurb --- .../next/Library/2022-09-30-09-22-37.gh-issue-95534.ndEfPj.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2022-09-30-09-22-37.gh-issue-95534.ndEfPj.rst b/Misc/NEWS.d/next/Library/2022-09-30-09-22-37.gh-issue-95534.ndEfPj.rst index b99f94c07a0611..c16efe7b4fd2d1 100644 --- a/Misc/NEWS.d/next/Library/2022-09-30-09-22-37.gh-issue-95534.ndEfPj.rst +++ b/Misc/NEWS.d/next/Library/2022-09-30-09-22-37.gh-issue-95534.ndEfPj.rst @@ -1 +1 @@ -`gzip.GzipFile` reads files 10% faster. +:meth:`gzip.GzipFile` reads files 10% faster. From 18a769262dcc20f695aa9046c5cdd09a77a793a0 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 30 Sep 2022 11:40:18 +0200 Subject: [PATCH 14/30] Add missing NULL member to ZlibDecompressor_Members --- Modules/zlibmodule.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index bad0f7af39e9de..b1d22bbe4d6e6a 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -1863,6 +1863,7 @@ static PyMemberDef ZlibDecompressor_members[] = { READONLY, ZlibDecompressor_unused_data__doc__}, {"needs_input", T_BOOL, offsetof(ZlibDecompressor, needs_input), READONLY, ZlibDecompressor_needs_input_doc}, + {NULL}, }; From c90096fa33daf8d543052a491340b851d3fbd92b Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 30 Sep 2022 11:58:07 +0200 Subject: [PATCH 15/30] Remove double comment --- Lib/gzip.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/gzip.py b/Lib/gzip.py index 0492be30b11e10..10ea44038c4799 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -495,7 +495,6 @@ def read(self, size=-1): return b"" self._new_member = False - # Read a chunk of data from the file # Read a chunk of data from the file if self._decompressor.needs_input: buf = self._fp.read(READ_BUFFER_SIZE) From 1c1583960d1d670f4db1742d7e0fbc3da829d51e Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 30 Sep 2022 12:15:18 +0200 Subject: [PATCH 16/30] Use READ_BUFFER_SIZE in python -m gzip command line application --- Lib/gzip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/gzip.py b/Lib/gzip.py index 10ea44038c4799..fd84a5a62543ea 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -647,7 +647,7 @@ def main(): f = builtins.open(arg, "rb") g = open(arg + ".gz", "wb") while True: - chunk = f.read(io.DEFAULT_BUFFER_SIZE) + chunk = f.read(READ_BUFFER_SIZE) if not chunk: break g.write(chunk) From d0ff4f01594e3ec430d60f837175969df4e3412b Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 30 Sep 2022 13:34:44 +0200 Subject: [PATCH 17/30] Fix error in news entry --- .../next/Library/2022-09-30-09-22-37.gh-issue-95534.ndEfPj.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2022-09-30-09-22-37.gh-issue-95534.ndEfPj.rst b/Misc/NEWS.d/next/Library/2022-09-30-09-22-37.gh-issue-95534.ndEfPj.rst index c16efe7b4fd2d1..131d4ed69991a3 100644 --- a/Misc/NEWS.d/next/Library/2022-09-30-09-22-37.gh-issue-95534.ndEfPj.rst +++ b/Misc/NEWS.d/next/Library/2022-09-30-09-22-37.gh-issue-95534.ndEfPj.rst @@ -1 +1 @@ -:meth:`gzip.GzipFile` reads files 10% faster. +:meth:`gzip.GzipFile.read` reads 10% faster. From afd92ab6a8eb8da31a1c299b761d21bbd7255d7f Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Fri, 30 Sep 2022 11:08:59 -0700 Subject: [PATCH 18/30] minor edit, use += --- Lib/gzip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/gzip.py b/Lib/gzip.py index fd84a5a62543ea..75c6ddc3f2cffb 100644 --- a/Lib/gzip.py +++ b/Lib/gzip.py @@ -514,7 +514,7 @@ def read(self, size=-1): "end-of-stream marker was reached") self._crc = zlib.crc32(uncompress, self._crc) - self._stream_size = self._stream_size + len(uncompress) + self._stream_size += len(uncompress) self._pos += len(uncompress) return uncompress From 922ac5c40b8d81a11aa477521700f2a6566a0514 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Sun, 2 Oct 2022 09:49:21 +0200 Subject: [PATCH 19/30] Throw compile warning on zlib versions that are too old --- Modules/zlibmodule.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index b1d22bbe4d6e6a..1c2918581f9b99 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -9,6 +9,10 @@ #include "structmember.h" // PyMemberDef #include "zlib.h" +#if defined(ZLIB_VERNUM) && ZLIB_VERNUM < 0x1221 +#error "At least zlib version 1.2.2.1 is required" +#endif + // Blocks output buffer wrappers #include "pycore_blocks_output_buffer.h" @@ -171,9 +175,6 @@ OutputBuffer_WindowOnError(_BlocksOutputBuffer *buffer, _Uint32Window *window) } } while (0) #define LEAVE_ZLIB(obj) PyThread_release_lock((obj)->lock); -#if defined(ZLIB_VERNUM) && ZLIB_VERNUM >= 0x1221 -# define AT_LEAST_ZLIB_1_2_2_1 -#endif /* The following parameters are copied from zutil.h, version 0.95 */ #define DEFLATED 8 @@ -677,18 +678,10 @@ zlib_decompressobj_impl(PyObject *module, int wbits, PyObject *zdict) case Z_OK: self->is_initialised = 1; if (self->zdict != NULL && wbits < 0) { -#ifdef AT_LEAST_ZLIB_1_2_2_1 if (set_inflate_zdict(state, self) < 0) { Py_DECREF(self); return NULL; } -#else - PyErr_Format(state->ZlibError, - "zlib version %s does not allow raw inflate with dictionary", - ZLIB_VERSION); - Py_DECREF(self); - return NULL; -#endif } return (PyObject *)self; case Z_STREAM_ERROR: @@ -1782,18 +1775,10 @@ ZlibDecompressor__new__(PyTypeObject *cls, case Z_OK: self->is_initialised = 1; if (self->zdict != NULL && wbits < 0) { -#ifdef AT_LEAST_ZLIB_1_2_2_1 if (set_inflate_zdict_ZlibDecompressor(state, self) < 0) { Py_DECREF(self); return NULL; } -#else - PyErr_Format(state->ZlibError, - "zlib version %s does not allow raw inflate with dictionary", - ZLIB_VERSION); - Py_DECREF(self); - return NULL; -#endif } return (PyObject *)self; case Z_STREAM_ERROR: From dc7de612f3cfef7e3aa7ef4ec50c5c56fc9e5ae2 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Sun, 2 Oct 2022 09:51:48 +0200 Subject: [PATCH 20/30] Use bool instead of int --- Modules/zlibmodule.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index 1c2918581f9b99..433d776ca9ae70 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -8,6 +8,7 @@ #include "Python.h" #include "structmember.h" // PyMemberDef #include "zlib.h" +#include "stdbool.h" #if defined(ZLIB_VERNUM) && ZLIB_VERNUM < 0x1221 #error "At least zlib version 1.2.2.1 is required" @@ -212,7 +213,7 @@ typedef struct PyObject *unused_data; PyObject *unconsumed_tail; char eof; - int is_initialised; + bool is_initialised; PyObject *zdict; PyThread_type_lock lock; } compobject; @@ -1358,7 +1359,7 @@ typedef struct { separately. Conversion and looping is encapsulated in decompress_buf() */ Py_ssize_t avail_in_real; - int is_initialised; + bool is_initialised; char eof; /* T_BOOL expects a char */ char needs_input; } ZlibDecompressor; From ca12c1f185dd99dcfff976e0ce1bd6da973fa087 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Sun, 2 Oct 2022 09:52:26 +0200 Subject: [PATCH 21/30] Correct spelling of insufficient --- Modules/zlibmodule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index 433d776ca9ae70..1f0e81f0044314 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -1512,7 +1512,7 @@ decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) hard_limit); if (obuflen == -1){ PyErr_SetString(PyExc_MemoryError, - "Unsufficient memory for buffer allocation"); + "Insufficient memory for buffer allocation"); goto error; } else if (obuflen == -2) From 1ce342b7efb98c8b1fd39d5c6e10753d93e6a504 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Sun, 2 Oct 2022 09:53:05 +0200 Subject: [PATCH 22/30] Put brackets around if statement --- Modules/zlibmodule.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index 1f0e81f0044314..ed60372d8602c5 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -1515,9 +1515,9 @@ decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) "Insufficient memory for buffer allocation"); goto error; } - else if (obuflen == -2) + else if (obuflen == -2) { break; - + } Py_BEGIN_ALLOW_THREADS err = inflate(&self->zst, Z_SYNC_FLUSH); Py_END_ALLOW_THREADS From 0b7735e6cfc6ac40ec51ef848e52f809873f2f2b Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Sun, 2 Oct 2022 09:55:56 +0200 Subject: [PATCH 23/30] Remove strange default case --- Modules/zlibmodule.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index ed60372d8602c5..183728770331e8 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -1302,13 +1302,6 @@ zlib_Decompress_flush_impl(compobject *self, PyTypeObject *cls, case Z_STREAM_END: break; default: - if (err == Z_NEED_DICT && self->zdict != NULL) { - if (set_inflate_zdict(state, self) < 0) { - goto abort; - } - else - break; - } goto save; } From 043a376b919a9bcc13199eec49789c6049802015 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Sun, 2 Oct 2022 09:58:09 +0200 Subject: [PATCH 24/30] Remove unnecessary zero op --- Modules/zlibmodule.c | 1 - 1 file changed, 1 deletion(-) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index 183728770331e8..fa0c4382bb230f 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -1540,7 +1540,6 @@ decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) zlib_error(state, self->zst, err, "while finishing decompression"); goto error; } - self->is_initialised = 0; } else if (err != Z_OK && err != Z_BUF_ERROR) { zlib_error(state, self->zst, err, "while decompressing data"); } From 2a653a996e6c8b30af99d418842cddcc0efac4e1 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Sun, 2 Oct 2022 10:00:23 +0200 Subject: [PATCH 25/30] Change RetVal to return_value --- Modules/zlibmodule.c | 108 +++++++++++++++++++++---------------------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index fa0c4382bb230f..e0af0b9e5acd99 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -324,7 +324,7 @@ static PyObject * zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits) /*[clinic end generated code: output=46bd152fadd66df2 input=c4d06ee5782a7e3f]*/ { - PyObject *RetVal; + PyObject *return_value; int flush; z_stream zst; _BlocksOutputBuffer buffer = {.list = NULL}; @@ -391,11 +391,11 @@ zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits) err = deflateEnd(&zst); if (err == Z_OK) { - RetVal = OutputBuffer_Finish(&buffer, zst.avail_out); - if (RetVal == NULL) { + return_value = OutputBuffer_Finish(&buffer, zst.avail_out); + if (return_value == NULL) { goto error; } - return RetVal; + return return_value; } else zlib_error(state, zst, err, "while finishing compression"); @@ -423,7 +423,7 @@ zlib_decompress_impl(PyObject *module, Py_buffer *data, int wbits, Py_ssize_t bufsize) /*[clinic end generated code: output=77c7e35111dc8c42 input=a9ac17beff1f893f]*/ { - PyObject *RetVal; + PyObject *return_value; Byte *ibuf; Py_ssize_t ibuflen; int err, flush; @@ -518,9 +518,9 @@ zlib_decompress_impl(PyObject *module, Py_buffer *data, int wbits, goto error; } - RetVal = OutputBuffer_WindowFinish(&buffer, &window, zst.avail_out); - if (RetVal != NULL) { - return RetVal; + return_value = OutputBuffer_WindowFinish(&buffer, &window, zst.avail_out); + if (return_value != NULL) { + return return_value; } error: @@ -749,7 +749,7 @@ zlib_Compress_compress_impl(compobject *self, PyTypeObject *cls, Py_buffer *data) /*[clinic end generated code: output=6731b3f0ff357ca6 input=04d00f65ab01d260]*/ { - PyObject *RetVal; + PyObject *return_value; int err; _BlocksOutputBuffer buffer = {.list = NULL}; zlibstate *state = PyType_GetModuleState(cls); @@ -787,17 +787,17 @@ zlib_Compress_compress_impl(compobject *self, PyTypeObject *cls, } while (ibuflen != 0); - RetVal = OutputBuffer_Finish(&buffer, self->zst.avail_out); - if (RetVal != NULL) { + return_value = OutputBuffer_Finish(&buffer, self->zst.avail_out); + if (return_value != NULL) { goto success; } error: OutputBuffer_OnError(&buffer); - RetVal = NULL; + return_value = NULL; success: LEAVE_ZLIB(self); - return RetVal; + return return_value; } /* Helper for objdecompress() and flush(). Saves any unconsumed input data in @@ -871,7 +871,7 @@ zlib_Decompress_decompress_impl(compobject *self, PyTypeObject *cls, { int err = Z_OK; Py_ssize_t ibuflen; - PyObject *RetVal; + PyObject *return_value; _BlocksOutputBuffer buffer = {.list = NULL}; PyObject *module = PyType_GetModule(cls); @@ -949,17 +949,17 @@ zlib_Decompress_decompress_impl(compobject *self, PyTypeObject *cls, goto abort; } - RetVal = OutputBuffer_Finish(&buffer, self->zst.avail_out); - if (RetVal != NULL) { + return_value = OutputBuffer_Finish(&buffer, self->zst.avail_out); + if (return_value != NULL) { goto success; } abort: OutputBuffer_OnError(&buffer); - RetVal = NULL; + return_value = NULL; success: LEAVE_ZLIB(self); - return RetVal; + return return_value; } /*[clinic input] @@ -981,7 +981,7 @@ zlib_Compress_flush_impl(compobject *self, PyTypeObject *cls, int mode) /*[clinic end generated code: output=c7efd13efd62add2 input=286146e29442eb6c]*/ { int err; - PyObject *RetVal; + PyObject *return_value; _BlocksOutputBuffer buffer = {.list = NULL}; zlibstate *state = PyType_GetModuleState(cls); @@ -1038,17 +1038,17 @@ zlib_Compress_flush_impl(compobject *self, PyTypeObject *cls, int mode) goto error; } - RetVal = OutputBuffer_Finish(&buffer, self->zst.avail_out); - if (RetVal != NULL) { + return_value = OutputBuffer_Finish(&buffer, self->zst.avail_out); + if (return_value != NULL) { goto success; } error: OutputBuffer_OnError(&buffer); - RetVal = NULL; + return_value = NULL; success: LEAVE_ZLIB(self); - return RetVal; + return return_value; } #ifdef HAVE_ZLIB_COPY @@ -1067,14 +1067,14 @@ zlib_Compress_copy_impl(compobject *self, PyTypeObject *cls) { zlibstate *state = PyType_GetModuleState(cls); - compobject *retval = newcompobject(state->Comptype); - if (!retval) return NULL; + compobject *return_value = newcompobject(state->Comptype); + if (!return_value) return NULL; /* Copy the zstream state * We use ENTER_ZLIB / LEAVE_ZLIB to make this thread-safe */ ENTER_ZLIB(self); - int err = deflateCopy(&retval->zst, &self->zst); + int err = deflateCopy(&return_value->zst, &self->zst); switch (err) { case Z_OK: break; @@ -1090,22 +1090,22 @@ zlib_Compress_copy_impl(compobject *self, PyTypeObject *cls) goto error; } Py_INCREF(self->unused_data); - Py_XSETREF(retval->unused_data, self->unused_data); + Py_XSETREF(return_value->unused_data, self->unused_data); Py_INCREF(self->unconsumed_tail); - Py_XSETREF(retval->unconsumed_tail, self->unconsumed_tail); + Py_XSETREF(return_value->unconsumed_tail, self->unconsumed_tail); Py_XINCREF(self->zdict); - Py_XSETREF(retval->zdict, self->zdict); - retval->eof = self->eof; + Py_XSETREF(return_value->zdict, self->zdict); + return_value->eof = self->eof; /* Mark it as being initialized */ - retval->is_initialised = 1; + return_value->is_initialised = 1; LEAVE_ZLIB(self); - return (PyObject *)retval; + return (PyObject *)return_value; error: LEAVE_ZLIB(self); - Py_XDECREF(retval); + Py_XDECREF(return_value); return NULL; } @@ -1154,14 +1154,14 @@ zlib_Decompress_copy_impl(compobject *self, PyTypeObject *cls) { zlibstate *state = PyType_GetModuleState(cls); - compobject *retval = newcompobject(state->Decomptype); - if (!retval) return NULL; + compobject *return_value = newcompobject(state->Decomptype); + if (!return_value) return NULL; /* Copy the zstream state * We use ENTER_ZLIB / LEAVE_ZLIB to make this thread-safe */ ENTER_ZLIB(self); - int err = inflateCopy(&retval->zst, &self->zst); + int err = inflateCopy(&return_value->zst, &self->zst); switch (err) { case Z_OK: break; @@ -1178,22 +1178,22 @@ zlib_Decompress_copy_impl(compobject *self, PyTypeObject *cls) } Py_INCREF(self->unused_data); - Py_XSETREF(retval->unused_data, self->unused_data); + Py_XSETREF(return_value->unused_data, self->unused_data); Py_INCREF(self->unconsumed_tail); - Py_XSETREF(retval->unconsumed_tail, self->unconsumed_tail); + Py_XSETREF(return_value->unconsumed_tail, self->unconsumed_tail); Py_XINCREF(self->zdict); - Py_XSETREF(retval->zdict, self->zdict); - retval->eof = self->eof; + Py_XSETREF(return_value->zdict, self->zdict); + return_value->eof = self->eof; /* Mark it as being initialized */ - retval->is_initialised = 1; + return_value->is_initialised = 1; LEAVE_ZLIB(self); - return (PyObject *)retval; + return (PyObject *)return_value; error: LEAVE_ZLIB(self); - Py_XDECREF(retval); + Py_XDECREF(return_value); return NULL; } @@ -1248,7 +1248,7 @@ zlib_Decompress_flush_impl(compobject *self, PyTypeObject *cls, { int err, flush; Py_buffer data; - PyObject *RetVal; + PyObject *return_value; Py_ssize_t ibuflen; _BlocksOutputBuffer buffer = {.list = NULL}; _Uint32Window window; // output buffer's UINT32_MAX sliding window @@ -1325,18 +1325,18 @@ zlib_Decompress_flush_impl(compobject *self, PyTypeObject *cls, } } - RetVal = OutputBuffer_WindowFinish(&buffer, &window, self->zst.avail_out); - if (RetVal != NULL) { + return_value = OutputBuffer_WindowFinish(&buffer, &window, self->zst.avail_out); + if (return_value != NULL) { goto success; } abort: OutputBuffer_WindowOnError(&buffer, &window); - RetVal = NULL; + return_value = NULL; success: PyBuffer_Release(&data); LEAVE_ZLIB(self); - return RetVal; + return return_value; } @@ -1468,7 +1468,7 @@ decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) /* data_size is strictly positive, but because we repeatedly have to compare against max_length and PyBytes_GET_SIZE we declare it as signed */ - PyObject *RetVal = NULL; + PyObject *return_value = NULL; Py_ssize_t hard_limit; Py_ssize_t obuflen; zlibstate *state = PyType_GetModuleState(Py_TYPE(self)); @@ -1500,7 +1500,7 @@ decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) do { obuflen = arrange_output_buffer_with_maximum(&(self->zst.avail_out), &(self->zst.next_out), - &RetVal, + &return_value, obuflen, hard_limit); if (obuflen == -1){ @@ -1546,16 +1546,16 @@ decompress_buf(ZlibDecompressor *self, Py_ssize_t max_length) self->avail_in_real += self->zst.avail_in; - if (_PyBytes_Resize(&RetVal, self->zst.next_out - - (uint8_t *)PyBytes_AS_STRING(RetVal)) != 0) { + if (_PyBytes_Resize(&return_value, self->zst.next_out - + (uint8_t *)PyBytes_AS_STRING(return_value)) != 0) { goto error; } goto success; error: - Py_CLEAR(RetVal); + Py_CLEAR(return_value); success: - return RetVal; + return return_value; } From 475aef649ffa342e704a9477d3fb0e03b03e1389 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Sun, 2 Oct 2022 10:01:00 +0200 Subject: [PATCH 26/30] Change char to bool --- Modules/zlibmodule.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index e0af0b9e5acd99..7b2d649785a62c 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -1563,7 +1563,7 @@ static PyObject * decompress(ZlibDecompressor *self, uint8_t *data, size_t len, Py_ssize_t max_length) { - char input_buffer_in_use; + bool input_buffer_in_use; PyObject *result; /* Prepend unconsumed input if necessary */ From 41ba076d2e067805a16fecefe9cb221192b9075c Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Sun, 2 Oct 2022 10:01:53 +0200 Subject: [PATCH 27/30] Properly bracketify if-else clause --- Modules/zlibmodule.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index 7b2d649785a62c..88c71753e4cdd4 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -1702,10 +1702,12 @@ zlib_ZlibDecompressor_decompress_impl(ZlibDecompressor *self, PyObject *result = NULL; ENTER_ZLIB(self); - if (self->eof) + if (self->eof) { PyErr_SetString(PyExc_EOFError, "End of stream already reached"); - else + } + else { result = decompress(self, data->buf, data->len, max_length); + } LEAVE_ZLIB(self); return result; } From 5f1901d8cf575b90b40284bd99d93db0028635e2 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Sun, 2 Oct 2022 10:03:44 +0200 Subject: [PATCH 28/30] Prefix underscore to _ZlibDecompressor name --- Modules/zlibmodule.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index 88c71753e4cdd4..2b723f754ed1d9 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -1730,7 +1730,7 @@ ZlibDecompressor__new__(PyTypeObject *cls, PyObject *kwargs) { static char *keywords[] = {"wbits", "zdict", NULL}; - static char *format = "|iO:ZlibDecompressor"; + static char *format = "|iO:_ZlibDecompressor"; int wbits = MAX_WBITS; PyObject *zdict = NULL; zlibstate *state = PyType_GetModuleState(cls); @@ -1974,7 +1974,7 @@ static PyType_Slot ZlibDecompressor_type_slots[] = { }; static PyType_Spec ZlibDecompressor_type_spec = { - .name = "zlib.ZlibDecompressor", + .name = "zlib._ZlibDecompressor", .basicsize = sizeof(ZlibDecompressor), // Calling PyType_GetModuleState() on a subclass is not safe. // ZlibDecompressor_type_spec does not have Py_TPFLAGS_BASETYPE flag From c5d68889c7a4033f1cbad9f5beb88ae956704f42 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Sun, 2 Oct 2022 10:07:51 +0200 Subject: [PATCH 29/30] Copy explanation about zdict from python docs into function docstring --- Modules/zlibmodule.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index 2b723f754ed1d9..88d2dd5563536b 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -1720,8 +1720,12 @@ PyDoc_STRVAR(ZlibDecompressor__new____doc__, "\n" " wbits = 15\n" " zdict\n" -" The predefined compression dictionary. This must be the same\n" -" dictionary as used by the compressor that produced the input data.\n" +" The predefined compression dictionary. This is a sequence of bytes\n" +" (such as a bytes object) containing subsequences that are expected\n" +" to occur frequently in the data that is to be compressed. Those\n" +" subsequences that are expected to be most common should come at the\n" +" end of the dictionary. This must be the same dictionary as used by the\n" +" compressor that produced the input data.\n" "\n"); static PyObject * From e3da4155a2c10dd0d8e286c8d5b04c6f36f51a50 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 3 Oct 2022 10:24:43 +0200 Subject: [PATCH 30/30] Add tests for _ZlibDecompressor --- Lib/test/test_zlib.py | 167 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) diff --git a/Lib/test/test_zlib.py b/Lib/test/test_zlib.py index f20aad051da960..ae54f6c46891c6 100644 --- a/Lib/test/test_zlib.py +++ b/Lib/test/test_zlib.py @@ -944,6 +944,173 @@ def choose_lines(source, number, seed=None, generator=random): """ +class ZlibDecompressorTest(): + # Test adopted from test_bz2.py + TEXT = HAMLET_SCENE + DATA = zlib.compress(HAMLET_SCENE) + BAD_DATA = b"Not a valid deflate block" + def test_Constructor(self): + self.assertRaises(TypeError, zlib._ZlibDecompressor, 42) + + def testDecompress(self): + zlibd = zlib._ZlibDecompressor() + self.assertRaises(TypeError, zlibd.decompress) + text = zlibd.decompress(self.DATA) + self.assertEqual(text, self.TEXT) + + def testDecompressChunks10(self): + zlibd = zlib._ZlibDecompressor() + text = b'' + n = 0 + while True: + str = self.DATA[n*10:(n+1)*10] + if not str: + break + text += zlibd.decompress(str) + n += 1 + self.assertEqual(text, self.TEXT) + + def testDecompressUnusedData(self): + zlibd = zlib._ZlibDecompressor() + unused_data = b"this is unused data" + text = zlibd.decompress(self.DATA+unused_data) + self.assertEqual(text, self.TEXT) + self.assertEqual(zlibd.unused_data, unused_data) + + def testEOFError(self): + zlibd = zlib._ZlibDecompressor() + text = zlibd.decompress(self.DATA) + self.assertRaises(EOFError, zlibd.decompress, b"anything") + self.assertRaises(EOFError, zlibd.decompress, b"") + + @support.skip_if_pgo_task + @bigmemtest(size=_4G + 100, memuse=3.3) + def testDecompress4G(self, size): + # "Test zlib._ZlibDecompressor.decompress() with >4GiB input" + blocksize = 10 * 1024 * 1024 + block = random.randbytes(blocksize) + try: + data = block * (size // blocksize + 1) + compressed = zlib.compress(data) + zlibd = zlib._ZlibDecompressor() + decompressed = zlibd.decompress(compressed) + self.assertTrue(decompressed == data) + finally: + data = None + compressed = None + decompressed = None + + def testPickle(self): + for proto in range(pickle.HIGHEST_PROTOCOL + 1): + with self.assertRaises(TypeError): + pickle.dumps(zlib._ZlibDecompressor(), proto) + + def testDecompressorChunksMaxsize(self): + zlibd = zlib._ZlibDecompressor() + max_length = 100 + out = [] + + # Feed some input + len_ = len(self.BIG_DATA) - 64 + out.append(zlibd.decompress(self.BIG_DATA[:len_], + max_length=max_length)) + self.assertFalse(zlibd.needs_input) + self.assertEqual(len(out[-1]), max_length) + + # Retrieve more data without providing more input + out.append(zlibd.decompress(b'', max_length=max_length)) + self.assertFalse(zlibd.needs_input) + self.assertEqual(len(out[-1]), max_length) + + # Retrieve more data while providing more input + out.append(zlibd.decompress(self.BIG_DATA[len_:], + max_length=max_length)) + self.assertLessEqual(len(out[-1]), max_length) + + # Retrieve remaining uncompressed data + while not zlibd.eof: + out.append(zlibd.decompress(b'', max_length=max_length)) + self.assertLessEqual(len(out[-1]), max_length) + + out = b"".join(out) + self.assertEqual(out, self.BIG_TEXT) + self.assertEqual(zlibd.unused_data, b"") + + def test_decompressor_inputbuf_1(self): + # Test reusing input buffer after moving existing + # contents to beginning + zlibd = zlib._ZlibDecompressor() + out = [] + + # Create input buffer and fill it + self.assertEqual(zlibd.decompress(self.DATA[:100], + max_length=0), b'') + + # Retrieve some results, freeing capacity at beginning + # of input buffer + out.append(zlibd.decompress(b'', 2)) + + # Add more data that fits into input buffer after + # moving existing data to beginning + out.append(zlibd.decompress(self.DATA[100:105], 15)) + + # Decompress rest of data + out.append(zlibd.decompress(self.DATA[105:])) + self.assertEqual(b''.join(out), self.TEXT) + + def test_decompressor_inputbuf_2(self): + # Test reusing input buffer by appending data at the + # end right away + zlibd = zlib._ZlibDecompressor() + out = [] + + # Create input buffer and empty it + self.assertEqual(zlibd.decompress(self.DATA[:200], + max_length=0), b'') + out.append(zlibd.decompress(b'')) + + # Fill buffer with new data + out.append(zlibd.decompress(self.DATA[200:280], 2)) + + # Append some more data, not enough to require resize + out.append(zlibd.decompress(self.DATA[280:300], 2)) + + # Decompress rest of data + out.append(zlibd.decompress(self.DATA[300:])) + self.assertEqual(b''.join(out), self.TEXT) + + def test_decompressor_inputbuf_3(self): + # Test reusing input buffer after extending it + + zlibd = zlib._ZlibDecompressor() + out = [] + + # Create almost full input buffer + out.append(zlibd.decompress(self.DATA[:200], 5)) + + # Add even more data to it, requiring resize + out.append(zlibd.decompress(self.DATA[200:300], 5)) + + # Decompress rest of data + out.append(zlibd.decompress(self.DATA[300:])) + self.assertEqual(b''.join(out), self.TEXT) + + def test_failure(self): + zlibd = zlib._ZlibDecompressor() + self.assertRaises(Exception, zlibd.decompress, self.BAD_DATA * 30) + # Previously, a second call could crash due to internal inconsistency + self.assertRaises(Exception, zlibd.decompress, self.BAD_DATA * 30) + + @support.refcount_test + def test_refleaks_in___init__(self): + gettotalrefcount = support.get_attribute(sys, 'gettotalrefcount') + zlibd = zlib._ZlibDecompressor() + refs_before = gettotalrefcount() + for i in range(100): + zlibd.__init__() + self.assertAlmostEqual(gettotalrefcount() - refs_before, 0, delta=10) + + class CustomInt: def __index__(self): return 100