diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 0867200a59a230..6c5a2d1ccacbb6 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -20,6 +20,7 @@ _SUCCESS_CODES = {SUCCESS, FAILURE} _ASSERT_CODES = {ASSERT, ASSERT_NOT} _UNIT_CODES = _LITERAL_CODES | {ANY, IN} +_REPEAT_COUNT_OFFSET = 5 _REPEATING_CODES = { MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE), @@ -155,6 +156,8 @@ def _compile(code, pattern, flags): skip = _len(code); emit(0) emit(av[0]) emit(av[1]) + emit(code[_REPEAT_COUNT_OFFSET]) # REPEAT index + code[_REPEAT_COUNT_OFFSET] += 1 # REPEAT count + 1 _compile(code, av[2], flags) code[skip] = _len(code) - skip emit(REPEATING_CODES[op][1]) @@ -551,7 +554,8 @@ def _compile_info(code, pattern, flags): if hi > MAXCODE: hi = MAXCODE if lo == 0: - code.extend([INFO, 4, 0, lo, hi]) + # INFO, skip, mask, lo, hi, repeat_count + code.extend([INFO, 5, 0, lo, hi, 0]) return # look for a literal prefix prefix = [] @@ -587,6 +591,9 @@ def _compile_info(code, pattern, flags): emit(MAXCODE) prefix = prefix[:MAXCODE] emit(min(hi, MAXCODE)) + # REPEAT count + assert len(code) == _REPEAT_COUNT_OFFSET + emit(0) # add literal prefix if prefix: emit(len(prefix)) # length @@ -719,8 +726,14 @@ def print_2(*args): else: print_(FAILURE) i += 1 - elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE, - POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE): + elif op in (REPEAT, POSSESSIVE_REPEAT): + skip, min, max, repeat_index = code[i: i+4] + if max == MAXREPEAT: + max = 'MAXREPEAT' + print_(op, skip, min, max, repeat_index, to=i+skip) + dis_(i+4, i+skip) + i += skip + elif op in (REPEAT_ONE, MIN_REPEAT_ONE, POSSESSIVE_REPEAT_ONE): skip, min, max = code[i: i+3] if max == MAXREPEAT: max = 'MAXREPEAT' @@ -742,15 +755,15 @@ def print_2(*args): dis_(i+1, i+skip) i += skip elif op is INFO: - skip, flags, min, max = code[i: i+4] + skip, flags, min, max, repeat_count = code[i: i+5] if max == MAXREPEAT: max = 'MAXREPEAT' - print_(op, skip, bin(flags), min, max, to=i+skip) - start = i+4 + print_(op, skip, bin(flags), min, max, repeat_count, to=i+skip) + start = i+5 if flags & SRE_INFO_PREFIX: - prefix_len, prefix_skip = code[i+4: i+6] + prefix_len, prefix_skip = code[i+5: i+7] print_2(' prefix_skip', prefix_skip) - start = i + 6 + start = i + 7 prefix = code[start: start+prefix_len] print_2(' prefix', '[%s]' % ', '.join('%#02x' % x for x in prefix), diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index a00b0170607b59..d2f547340209fe 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20220318 +MAGIC = 20220330 from _sre import MAXREPEAT, MAXGROUPS diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 85716fbe2a8e8d..1428f83615ab7b 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2264,30 +2264,30 @@ def test_debug_flag(self): LITERAL 58 LITERAL 32 - 0. INFO 8 0b1 2 5 (to 9) + 0. INFO 9 0b1 2 5 0 (to 10) prefix_skip 0 prefix [0x2e] ('.') overlap [0] - 9: MARK 0 -11. LITERAL 0x2e ('.') -13. MARK 1 -15. BRANCH 10 (to 26) -17. IN 6 (to 24) -19. LITERAL 0x63 ('c') -21. LITERAL 0x68 ('h') -23. FAILURE -24: JUMP 9 (to 34) -26: branch 7 (to 33) -27. LITERAL 0x70 ('p') -29. LITERAL 0x79 ('y') -31. JUMP 2 (to 34) -33: FAILURE -34: GROUPREF_EXISTS 0 6 (to 41) -37. AT END -39. JUMP 5 (to 45) -41: LITERAL 0x3a (':') -43. LITERAL 0x20 (' ') -45: SUCCESS +10: MARK 0 +12. LITERAL 0x2e ('.') +14. MARK 1 +16. BRANCH 10 (to 27) +18. IN 6 (to 25) +20. LITERAL 0x63 ('c') +22. LITERAL 0x68 ('h') +24. FAILURE +25: JUMP 9 (to 35) +27: branch 7 (to 34) +28. LITERAL 0x70 ('p') +30. LITERAL 0x79 ('y') +32. JUMP 2 (to 35) +34: FAILURE +35: GROUPREF_EXISTS 0 6 (to 42) +38. AT END +40. JUMP 5 (to 46) +42: LITERAL 0x3a (':') +44. LITERAL 0x20 (' ') +46: SUCCESS ''' self.assertEqual(get_debug_out(pat), dump) # Debug output is output again even a second time (bypassing @@ -2298,14 +2298,14 @@ def test_atomic_group(self): self.assertEqual(get_debug_out(r'(?>ab?)'), '''\ ATOMIC_GROUP [(LITERAL, 97), (MAX_REPEAT, (0, 1, [(LITERAL, 98)]))] - 0. INFO 4 0b0 1 2 (to 5) - 5: ATOMIC_GROUP 11 (to 17) - 7. LITERAL 0x61 ('a') - 9. REPEAT_ONE 6 0 1 (to 16) -13. LITERAL 0x62 ('b') -15. SUCCESS -16: SUCCESS -17: SUCCESS + 0. INFO 5 0b0 1 2 0 (to 6) + 6: ATOMIC_GROUP 11 (to 18) + 8. LITERAL 0x61 ('a') +10. REPEAT_ONE 6 0 1 (to 17) +14. LITERAL 0x62 ('b') +16. SUCCESS +17: SUCCESS +18: SUCCESS ''') def test_possesive_repeat_one(self): @@ -2313,11 +2313,11 @@ def test_possesive_repeat_one(self): POSSESSIVE_REPEAT 0 1 LITERAL 97 - 0. INFO 4 0b0 0 1 (to 5) - 5: POSSESSIVE_REPEAT_ONE 6 0 1 (to 12) - 9. LITERAL 0x61 ('a') -11. SUCCESS -12: SUCCESS + 0. INFO 5 0b0 0 1 0 (to 6) + 6: POSSESSIVE_REPEAT_ONE 6 0 1 (to 13) +10. LITERAL 0x61 ('a') +12. SUCCESS +13: SUCCESS ''') def test_possesive_repeat(self): @@ -2326,12 +2326,12 @@ def test_possesive_repeat(self): LITERAL 97 LITERAL 98 - 0. INFO 4 0b0 0 2 (to 5) - 5: POSSESSIVE_REPEAT 7 0 1 (to 13) - 9. LITERAL 0x61 ('a') -11. LITERAL 0x62 ('b') -13: SUCCESS -14. SUCCESS + 0. INFO 5 0b0 0 2 1 (to 6) + 6: POSSESSIVE_REPEAT 8 0 1 0 (to 15) +11. LITERAL 0x61 ('a') +13. LITERAL 0x62 ('b') +15: SUCCESS +16. SUCCESS ''') diff --git a/Misc/NEWS.d/next/Library/2022-03-30-15-50-46.bpo-23689.kQj4p0.rst b/Misc/NEWS.d/next/Library/2022-03-30-15-50-46.bpo-23689.kQj4p0.rst new file mode 100644 index 00000000000000..4acb602f3f7298 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-03-30-15-50-46.bpo-23689.kQj4p0.rst @@ -0,0 +1,2 @@ +:mod:`re` module: fix memory leak when a match is terminated by a signal. +Patch by Ma Lin. diff --git a/Modules/_sre.c b/Modules/_sre.c index 48193f82475a42..b17adccee2916d 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -427,6 +427,12 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, state->lastmark = -1; state->lastindex = -1; + state->repeats_array = PyMem_New(SRE_REPEAT, pattern->code[5]); + if (!state->repeats_array) { + PyErr_NoMemory(); + goto err; + } + state->buffer.buf = NULL; ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer); if (!ptr) @@ -476,6 +482,10 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, safely casted to `void*`, see bpo-39943 for details. */ PyMem_Free((void*) state->mark); state->mark = NULL; + + PyMem_Free((void*) state->repeats_array); + state->repeats_array = NULL; + if (state->buffer.buf) PyBuffer_Release(&state->buffer); return NULL; @@ -490,6 +500,7 @@ state_fini(SRE_STATE* state) data_stack_dealloc(state); /* See above PyMem_Del for why we explicitly cast here. */ PyMem_Free((void*) state->mark); + PyMem_Free((void*) state->repeats_array); state->mark = NULL; } @@ -1731,7 +1742,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) case SRE_OP_INFO: { /* A minimal info field is - <1=skip> <2=flags> <3=min> <4=max>; + <1=skip> <2=flags> <3=min> <4=max> <5=repeat_count>; If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags, more follows. */ SRE_CODE flags, i; @@ -1739,8 +1750,9 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) GET_SKIP; newcode = code+skip-1; GET_ARG; flags = arg; - GET_ARG; - GET_ARG; + GET_ARG; // min + GET_ARG; // max + GET_ARG; // repeat count /* Check that only valid flags are present */ if ((flags & ~(SRE_INFO_PREFIX | SRE_INFO_LITERAL | @@ -1841,13 +1853,14 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) GET_SKIP; GET_ARG; min = arg; GET_ARG; max = arg; + GET_ARG; // repeat index if (min > max) FAIL; if (max > SRE_MAXREPEAT) FAIL; - if (!_validate_inner(code, code+skip-3, groups)) + if (!_validate_inner(code, code+skip-4, groups)) FAIL; - code += skip-3; + code += skip-4; GET_OP; if (op1 == SRE_OP_POSSESSIVE_REPEAT) { if (op != SRE_OP_SUCCESS) diff --git a/Modules/sre.h b/Modules/sre.h index 785adbd003e7fd..4d6886d5021943 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -83,6 +83,8 @@ typedef struct { size_t data_stack_base; /* current repeat context */ SRE_REPEAT *repeat; + /* repeat contexts array */ + SRE_REPEAT *repeats_array; } SRE_STATE; typedef struct { diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index 8b9125b75b4568..2670e338b5d853 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -11,7 +11,7 @@ * See the _sre.c file for information on usage and redistribution. */ -#define SRE_MAGIC 20220318 +#define SRE_MAGIC 20220330 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 8e4e714eada389..8110760476f935 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -546,7 +546,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) if (ctx->pattern[0] == SRE_OP_INFO) { /* optimization info block */ - /* <1=skip> <2=flags> <3=min> ... */ + /* <1=skip> <2=flags> <3=min> <4=max> + <5=repeat_count> ... */ if (ctx->pattern[3] && (uintptr_t)(end - ctx->ptr) < ctx->pattern[3]) { TRACE(("reject (got %zd chars, need %zd)\n", end - ctx->ptr, (Py_ssize_t) ctx->pattern[3])); @@ -1032,16 +1033,14 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case SRE_OP_REPEAT: /* create repeat context. all the hard work is done by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ - /* <1=min> <2=max> item tail */ - TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr, - ctx->pattern[1], ctx->pattern[2])); + /* <1=min> <2=max> + <3=repeat_index> item tail */ + TRACE(("|%p|%p|REPEAT %d %d %d\n", ctx->pattern, ctx->ptr, + ctx->pattern[1], ctx->pattern[2], ctx->pattern[3])); + + /* install repeat context */ + ctx->u.rep = &state->repeats_array[ctx->pattern[3]]; - /* install new repeat context */ - ctx->u.rep = (SRE_REPEAT*) PyObject_Malloc(sizeof(*ctx->u.rep)); - if (!ctx->u.rep) { - PyErr_NoMemory(); - RETURN_FAILURE; - } ctx->u.rep->count = -1; ctx->u.rep->pattern = ctx->pattern; ctx->u.rep->prev = state->repeat; @@ -1051,7 +1050,6 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) state->ptr = ctx->ptr; DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]); state->repeat = ctx->u.rep->prev; - PyObject_Free(ctx->u.rep); if (ret) { RETURN_ON_ERROR(ret); @@ -1061,7 +1059,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case SRE_OP_MAX_UNTIL: /* maximizing repeat */ - /* <1=min> <2=max> item tail */ + /* <1=min> <2=max> + <3=repeat_index> item tail */ /* FIXME: we probably need to deal with zero-width matches in here... */ @@ -1081,7 +1080,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* not enough matches */ ctx->u.rep->count = ctx->count; DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1, - ctx->u.rep->pattern+3); + ctx->u.rep->pattern+4); if (ret) { RETURN_ON_ERROR(ret); RETURN_SUCCESS; @@ -1103,7 +1102,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) DATA_PUSH(&ctx->u.rep->last_ptr); ctx->u.rep->last_ptr = state->ptr; DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2, - ctx->u.rep->pattern+3); + ctx->u.rep->pattern+4); DATA_POP(&ctx->u.rep->last_ptr); if (ret) { MARK_POP_DISCARD(ctx->lastmark); @@ -1128,7 +1127,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case SRE_OP_MIN_UNTIL: /* minimizing repeat */ - /* <1=min> <2=max> item tail */ + /* <1=min> <2=max> + <3=repeat_index> item tail */ ctx->u.rep = state->repeat; if (!ctx->u.rep) @@ -1145,7 +1145,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* not enough matches */ ctx->u.rep->count = ctx->count; DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1, - ctx->u.rep->pattern+3); + ctx->u.rep->pattern+4); if (ret) { RETURN_ON_ERROR(ret); RETURN_SUCCESS; @@ -1188,7 +1188,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) DATA_PUSH(&ctx->u.rep->last_ptr); ctx->u.rep->last_ptr = state->ptr; DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3, - ctx->u.rep->pattern+3); + ctx->u.rep->pattern+4); DATA_POP(&ctx->u.rep->last_ptr); if (ret) { RETURN_ON_ERROR(ret); @@ -1200,10 +1200,10 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case SRE_OP_POSSESSIVE_REPEAT: /* create possessive repeat contexts. */ - /* <1=min> <2=max> pattern - tail */ - TRACE(("|%p|%p|POSSESSIVE_REPEAT %d %d\n", ctx->pattern, - ctx->ptr, ctx->pattern[1], ctx->pattern[2])); + /* <1=min> <2=max> + <3=repeat_index> pattern tail */ + TRACE(("|%p|%p|POSSESSIVE_REPEAT %d %d %d\n", ctx->pattern, + ctx->ptr, ctx->pattern[1], ctx->pattern[2], ctx->pattern[3])); /* Set the global Input pointer to this context's Input pointer */ @@ -1216,7 +1216,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) while (ctx->count < (Py_ssize_t)ctx->pattern[1]) { /* not enough matches */ DO_JUMP(JUMP_POSS_REPEAT_1, jump_poss_repeat_1, - &ctx->pattern[3]); + &ctx->pattern[4]); if (ret) { RETURN_ON_ERROR(ret); ctx->count++; @@ -1263,7 +1263,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* We have not reached the maximin matches, so try to match once more. */ DO_JUMP(JUMP_POSS_REPEAT_2, jump_poss_repeat_2, - &ctx->pattern[3]); + &ctx->pattern[4]); /* Check to see if the last attempted match succeeded. */ @@ -1593,7 +1593,8 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) if (pattern[0] == SRE_OP_INFO) { /* optimization info block */ - /* <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ + /* <1=skip> <2=flags> <3=min> <4=max> + <5=repeat_count> <6=prefix info> */ flags = pattern[2]; @@ -1613,14 +1614,14 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) if (flags & SRE_INFO_PREFIX) { /* pattern starts with a known prefix */ /* */ - prefix_len = pattern[5]; - prefix_skip = pattern[6]; - prefix = pattern + 7; + prefix_len = pattern[6]; + prefix_skip = pattern[7]; + prefix = pattern + 8; overlap = prefix + prefix_len - 1; } else if (flags & SRE_INFO_CHARSET) /* pattern starts with a character from a known set */ /* */ - charset = pattern + 5; + charset = pattern + 6; pattern += 1 + pattern[1]; }