Skip to content

[WIP] bpo-23689: re module, fix memory leak when a match is terminated by a signal #32188

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 21 additions & 8 deletions Lib/sre_compile.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
_REPEAT_COUNT_OFFSET = 5

_REPEATING_CODES = {
MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE),
Expand Down Expand Up @@ -155,6 +156,8 @@ def _compile(code, pattern, flags):
skip = _len(code); emit(0)
emit(av[0])
emit(av[1])
emit(code[_REPEAT_COUNT_OFFSET]) # REPEAT index
code[_REPEAT_COUNT_OFFSET] += 1 # REPEAT count + 1
_compile(code, av[2], flags)
code[skip] = _len(code) - skip
emit(REPEATING_CODES[op][1])
Expand Down Expand Up @@ -551,7 +554,8 @@ def _compile_info(code, pattern, flags):
if hi > MAXCODE:
hi = MAXCODE
if lo == 0:
code.extend([INFO, 4, 0, lo, hi])
# INFO, skip, mask, lo, hi, repeat_count
code.extend([INFO, 5, 0, lo, hi, 0])
return
# look for a literal prefix
prefix = []
Expand Down Expand Up @@ -587,6 +591,9 @@ def _compile_info(code, pattern, flags):
emit(MAXCODE)
prefix = prefix[:MAXCODE]
emit(min(hi, MAXCODE))
# REPEAT count
assert len(code) == _REPEAT_COUNT_OFFSET
emit(0)
Comment on lines +594 to +596
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems to me that INFO is only used for optimization. If you ignore it, all will work, but maybe slower.

Essential information like the number of groups etc is passed to _sre.compile() as separate arguments. The REPEAT count is of such kind. If it is not known we cannot start matching.

I propose to pass it as a separate argument. And in _validate_inner() check that all repeat_index are less than that count (like we validate group indices and the MARK argument).

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds reasonable, I'll do these.
Please merge your PR first.

# add literal prefix
if prefix:
emit(len(prefix)) # length
Expand Down Expand Up @@ -719,8 +726,14 @@ def print_2(*args):
else:
print_(FAILURE)
i += 1
elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE,
POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE):
elif op in (REPEAT, POSSESSIVE_REPEAT):
skip, min, max, repeat_index = code[i: i+4]
if max == MAXREPEAT:
max = 'MAXREPEAT'
print_(op, skip, min, max, repeat_index, to=i+skip)
dis_(i+4, i+skip)
i += skip
elif op in (REPEAT_ONE, MIN_REPEAT_ONE, POSSESSIVE_REPEAT_ONE):
skip, min, max = code[i: i+3]
if max == MAXREPEAT:
max = 'MAXREPEAT'
Expand All @@ -742,15 +755,15 @@ def print_2(*args):
dis_(i+1, i+skip)
i += skip
elif op is INFO:
skip, flags, min, max = code[i: i+4]
skip, flags, min, max, repeat_count = code[i: i+5]
if max == MAXREPEAT:
max = 'MAXREPEAT'
print_(op, skip, bin(flags), min, max, to=i+skip)
start = i+4
print_(op, skip, bin(flags), min, max, repeat_count, to=i+skip)
start = i+5
if flags & SRE_INFO_PREFIX:
prefix_len, prefix_skip = code[i+4: i+6]
prefix_len, prefix_skip = code[i+5: i+7]
print_2(' prefix_skip', prefix_skip)
start = i + 6
start = i + 7
prefix = code[start: start+prefix_len]
print_2(' prefix',
'[%s]' % ', '.join('%#02x' % x for x in prefix),
Expand Down
2 changes: 1 addition & 1 deletion Lib/sre_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

# update when constants are added or removed

MAGIC = 20220318
MAGIC = 20220330

from _sre import MAXREPEAT, MAXGROUPS

Expand Down
80 changes: 40 additions & 40 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -2264,30 +2264,30 @@ def test_debug_flag(self):
LITERAL 58
LITERAL 32

0. INFO 8 0b1 2 5 (to 9)
0. INFO 9 0b1 2 5 0 (to 10)
prefix_skip 0
prefix [0x2e] ('.')
overlap [0]
9: MARK 0
11. LITERAL 0x2e ('.')
13. MARK 1
15. BRANCH 10 (to 26)
17. IN 6 (to 24)
19. LITERAL 0x63 ('c')
21. LITERAL 0x68 ('h')
23. FAILURE
24: JUMP 9 (to 34)
26: branch 7 (to 33)
27. LITERAL 0x70 ('p')
29. LITERAL 0x79 ('y')
31. JUMP 2 (to 34)
33: FAILURE
34: GROUPREF_EXISTS 0 6 (to 41)
37. AT END
39. JUMP 5 (to 45)
41: LITERAL 0x3a (':')
43. LITERAL 0x20 (' ')
45: SUCCESS
10: MARK 0
12. LITERAL 0x2e ('.')
14. MARK 1
16. BRANCH 10 (to 27)
18. IN 6 (to 25)
20. LITERAL 0x63 ('c')
22. LITERAL 0x68 ('h')
24. FAILURE
25: JUMP 9 (to 35)
27: branch 7 (to 34)
28. LITERAL 0x70 ('p')
30. LITERAL 0x79 ('y')
32. JUMP 2 (to 35)
34: FAILURE
35: GROUPREF_EXISTS 0 6 (to 42)
38. AT END
40. JUMP 5 (to 46)
42: LITERAL 0x3a (':')
44. LITERAL 0x20 (' ')
46: SUCCESS
'''
self.assertEqual(get_debug_out(pat), dump)
# Debug output is output again even a second time (bypassing
Expand All @@ -2298,26 +2298,26 @@ def test_atomic_group(self):
self.assertEqual(get_debug_out(r'(?>ab?)'), '''\
ATOMIC_GROUP [(LITERAL, 97), (MAX_REPEAT, (0, 1, [(LITERAL, 98)]))]

0. INFO 4 0b0 1 2 (to 5)
5: ATOMIC_GROUP 11 (to 17)
7. LITERAL 0x61 ('a')
9. REPEAT_ONE 6 0 1 (to 16)
13. LITERAL 0x62 ('b')
15. SUCCESS
16: SUCCESS
17: SUCCESS
0. INFO 5 0b0 1 2 0 (to 6)
6: ATOMIC_GROUP 11 (to 18)
8. LITERAL 0x61 ('a')
10. REPEAT_ONE 6 0 1 (to 17)
14. LITERAL 0x62 ('b')
16. SUCCESS
17: SUCCESS
18: SUCCESS
''')

def test_possesive_repeat_one(self):
self.assertEqual(get_debug_out(r'a?+'), '''\
POSSESSIVE_REPEAT 0 1
LITERAL 97

0. INFO 4 0b0 0 1 (to 5)
5: POSSESSIVE_REPEAT_ONE 6 0 1 (to 12)
9. LITERAL 0x61 ('a')
11. SUCCESS
12: SUCCESS
0. INFO 5 0b0 0 1 0 (to 6)
6: POSSESSIVE_REPEAT_ONE 6 0 1 (to 13)
10. LITERAL 0x61 ('a')
12. SUCCESS
13: SUCCESS
''')

def test_possesive_repeat(self):
Expand All @@ -2326,12 +2326,12 @@ def test_possesive_repeat(self):
LITERAL 97
LITERAL 98

0. INFO 4 0b0 0 2 (to 5)
5: POSSESSIVE_REPEAT 7 0 1 (to 13)
9. LITERAL 0x61 ('a')
11. LITERAL 0x62 ('b')
13: SUCCESS
14. SUCCESS
0. INFO 5 0b0 0 2 1 (to 6)
6: POSSESSIVE_REPEAT 8 0 1 0 (to 15)
11. LITERAL 0x61 ('a')
13. LITERAL 0x62 ('b')
15: SUCCESS
16. SUCCESS
''')


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
:mod:`re` module: fix memory leak when a match is terminated by a signal.
Patch by Ma Lin.
23 changes: 18 additions & 5 deletions Modules/_sre.c
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,12 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
state->lastmark = -1;
state->lastindex = -1;

state->repeats_array = PyMem_New(SRE_REPEAT, pattern->code[5]);
if (!state->repeats_array) {
PyErr_NoMemory();
goto err;
}

state->buffer.buf = NULL;
ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
if (!ptr)
Expand Down Expand Up @@ -476,6 +482,10 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
safely casted to `void*`, see bpo-39943 for details. */
PyMem_Free((void*) state->mark);
state->mark = NULL;

PyMem_Free((void*) state->repeats_array);
state->repeats_array = NULL;

if (state->buffer.buf)
PyBuffer_Release(&state->buffer);
return NULL;
Expand All @@ -490,6 +500,7 @@ state_fini(SRE_STATE* state)
data_stack_dealloc(state);
/* See above PyMem_Del for why we explicitly cast here. */
PyMem_Free((void*) state->mark);
PyMem_Free((void*) state->repeats_array);
state->mark = NULL;
}

Expand Down Expand Up @@ -1731,16 +1742,17 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
case SRE_OP_INFO:
{
/* A minimal info field is
<INFO> <1=skip> <2=flags> <3=min> <4=max>;
<INFO> <1=skip> <2=flags> <3=min> <4=max> <5=repeat_count>;
If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
more follows. */
SRE_CODE flags, i;
SRE_CODE *newcode;
GET_SKIP;
newcode = code+skip-1;
GET_ARG; flags = arg;
GET_ARG;
GET_ARG;
GET_ARG; // min
GET_ARG; // max
GET_ARG; // repeat count
/* Check that only valid flags are present */
if ((flags & ~(SRE_INFO_PREFIX |
SRE_INFO_LITERAL |
Expand Down Expand Up @@ -1841,13 +1853,14 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
GET_SKIP;
GET_ARG; min = arg;
GET_ARG; max = arg;
GET_ARG; // repeat index
if (min > max)
FAIL;
if (max > SRE_MAXREPEAT)
FAIL;
if (!_validate_inner(code, code+skip-3, groups))
if (!_validate_inner(code, code+skip-4, groups))
FAIL;
code += skip-3;
code += skip-4;
GET_OP;
if (op1 == SRE_OP_POSSESSIVE_REPEAT) {
if (op != SRE_OP_SUCCESS)
Expand Down
2 changes: 2 additions & 0 deletions Modules/sre.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ typedef struct {
size_t data_stack_base;
/* current repeat context */
SRE_REPEAT *repeat;
/* repeat contexts array */
SRE_REPEAT *repeats_array;
} SRE_STATE;

typedef struct {
Expand Down
2 changes: 1 addition & 1 deletion Modules/sre_constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
* See the _sre.c file for information on usage and redistribution.
*/

#define SRE_MAGIC 20220318
#define SRE_MAGIC 20220330
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
Expand Down
Loading