From 584a190c0c929ca1227468ae835b3966446c473e Mon Sep 17 00:00:00 2001 From: Mark Dickinson Date: Sat, 6 Nov 2021 10:40:10 +0000 Subject: [PATCH 1/2] Support underscore separators when formatting Decimal objects --- Doc/library/decimal.rst | 5 +++ Doc/whatsnew/3.11.rst | 9 +++++ Lib/_pydecimal.py | 2 +- Lib/test/test_decimal.py | 34 ++++++++++++++++++- .../2021-11-06-10-33-43.bpo-45708.LEc5wE.rst | 3 ++ Modules/_decimal/libmpdec/io.c | 6 ++++ 6 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2021-11-06-10-33-43.bpo-45708.LEc5wE.rst diff --git a/Doc/library/decimal.rst b/Doc/library/decimal.rst index e759c5cf23b9e7..c6f34486cb89df 100644 --- a/Doc/library/decimal.rst +++ b/Doc/library/decimal.rst @@ -403,6 +403,11 @@ Decimal objects Underscores are allowed for grouping, as with integral and floating-point literals in code. + .. versionchanged:: 3.11 + The underscore grouping option in the :ref:`formatting mini-language + <formatspec>` is now supported for :class:`Decimal` objects: + ``f"{Decimal(1234567):_}"`` gives ``'1_234_567'``. + Decimal floating point objects share many properties with the other built-in numeric types such as :class:`float` and :class:`int`. All of the usual math operations and special methods apply. Likewise, decimal objects can be diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst index 7b15ace4031703..327b90eaaea501 100644 --- a/Doc/whatsnew/3.11.rst +++ b/Doc/whatsnew/3.11.rst @@ -190,6 +190,15 @@ New Modules Improved Modules ================ +decimal +------- + +* Formatting of :class:`~decimal.Decimal` objects now supports underscores + for grouping, as outlined in :PEP:`515`. For example, + ``f"{Decimal(1234567):_}"`` gives ``'1_234_567'``. Previously, only commas + were supported for grouping. (Contributed by Mark Dickinson in + :issue:`45708`.)
+ fractions --------- diff --git a/Lib/_pydecimal.py b/Lib/_pydecimal.py index f6d9ddf42e4734..9dec04ed4ea3d9 100644 --- a/Lib/_pydecimal.py +++ b/Lib/_pydecimal.py @@ -6154,7 +6154,7 @@ def _convert_for_comparison(self, other, equality_op=False): (?P\#)? (?P0)? (?P(?!0)\d+)? -(?P,)? +(?P[_,])? (?:\.(?P0|(?!0)\d+))? (?P[eEfFgGn%])? \Z diff --git a/Lib/test/test_decimal.py b/Lib/test/test_decimal.py index b6173a5ffec96c..86957098df2bf5 100644 --- a/Lib/test/test_decimal.py +++ b/Lib/test/test_decimal.py @@ -1050,7 +1050,7 @@ def test_formatting(self): ('\x00>10', '1.2345', '\x00\x00\x00\x001.2345'), ('\x00<10', '1.2345', '1.2345\x00\x00\x00\x00'), - # thousands separator + # thousands separator: ',' (',', '1234567', '1,234,567'), (',', '123456', '123,456'), (',', '12345', '12,345'), @@ -1082,6 +1082,38 @@ def test_formatting(self): (',e', '123456', '1.23456e+5'), (',E', '123456', '1.23456E+5'), + # thousands separator: '_' + ('_', '1234567', '1_234_567'), + ('_', '123456', '123_456'), + ('_', '12345', '12_345'), + ('_', '1234', '1_234'), + ('_', '123', '123'), + ('_', '12', '12'), + ('_', '1', '1'), + ('_', '0', '0'), + ('_', '-1234567', '-1_234_567'), + ('_', '-123456', '-123_456'), + ('7_', '123456', '123_456'), + ('8_', '123456', ' 123_456'), + ('08_', '123456', '0_123_456'), # special case: extra 0 needed + ('+08_', '123456', '+123_456'), # but not if there's a sign + (' 08_', '123456', ' 123_456'), + ('08_', '-123456', '-123_456'), + ('+09_', '123456', '+0_123_456'), + # ... with fractional part... 
+ ('07_', '1234.56', '1_234.56'), + ('08_', '1234.56', '1_234.56'), + ('09_', '1234.56', '01_234.56'), + ('010_', '1234.56', '001_234.56'), + ('011_', '1234.56', '0_001_234.56'), + ('012_', '1234.56', '0_001_234.56'), + ('08_.1f', '1234.5', '01_234.5'), + # no thousands separators in fraction part + ('_', '1.23456789', '1.23456789'), + ('_%', '123.456789', '12_345.6789%'), + ('_e', '123456', '1.23456e+5'), + ('_E', '123456', '1.23456E+5'), + # issue 6850 ('a=-7.0', '0.12345', 'aaaa0.1'), diff --git a/Misc/NEWS.d/next/Library/2021-11-06-10-33-43.bpo-45708.LEc5wE.rst b/Misc/NEWS.d/next/Library/2021-11-06-10-33-43.bpo-45708.LEc5wE.rst new file mode 100644 index 00000000000000..09447a6da4e04b --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-11-06-10-33-43.bpo-45708.LEc5wE.rst @@ -0,0 +1,3 @@ +Support underscores in :class:`~decimal.Decimal` object formatting: for +example, ``f"{Decimal(1234567):_}"`` now gives ``'1_234_567'``. (Previously, +this gave an "invalid format string" :exc:`ValueError`.) 
diff --git a/Modules/_decimal/libmpdec/io.c b/Modules/_decimal/libmpdec/io.c index e7bd6aee170056..7503ffdf554e4d 100644 --- a/Modules/_decimal/libmpdec/io.c +++ b/Modules/_decimal/libmpdec/io.c @@ -861,6 +861,12 @@ mpd_parse_fmt_str(mpd_spec_t *spec, const char *fmt, int caps) spec->grouping = "\003\003"; cp++; } + else if (*cp == '_') { + spec->dot = "."; + spec->sep = "_"; + spec->grouping = "\003\003"; + cp++; + } /* fraction digits or significant digits */ if (*cp == '.') { From 55820a6833a20ab8d6cf96c9a61312161599463c Mon Sep 17 00:00:00 2001 From: Mark Dickinson Date: Sun, 7 Nov 2021 08:59:10 +0000 Subject: [PATCH 2/2] Do format string parsing in _decimal.c instead of libmpdec --- Modules/_decimal/_decimal.c | 241 ++++++++++++++++++++++++++++++++- Modules/_decimal/libmpdec/io.c | 6 - 2 files changed, 238 insertions(+), 9 deletions(-) diff --git a/Modules/_decimal/_decimal.c b/Modules/_decimal/_decimal.c index 7fc7315603e7ad..458229548266ee 100644 --- a/Modules/_decimal/_decimal.c +++ b/Modules/_decimal/_decimal.c @@ -35,6 +35,7 @@ #include "mpdecimal.h" #include +#include #include "docstrings.h" @@ -3208,6 +3209,240 @@ dict_get_item_string(PyObject *dict, const char *key, PyObject **valueobj, const return 0; } +/* Mini-language format string parser. + + This is a version of libmpdec's mpd_parse_fmt_str, adapted for CPython. It + currently differs from the libmpdec version in only one respect: it + supports the use of underscores for thousands separators. */ + +/* Copy a single UTF-8 char to dest. See: The Unicode Standard, version 5.2, + chapter 3.9: Well-formed UTF-8 byte sequences.
*/ +static int +_mpd_copy_utf8(char dest[5], const char *s) +{ + const unsigned char *cp = (const unsigned char *)s; + unsigned char lb, ub; + int count, i; + + + if (*cp == 0) { + /* empty string */ + dest[0] = '\0'; + return 0; + } + else if (*cp <= 0x7f) { + /* ascii */ + dest[0] = *cp; + dest[1] = '\0'; + return 1; + } + else if (0xc2 <= *cp && *cp <= 0xdf) { + lb = 0x80; ub = 0xbf; + count = 2; + } + else if (*cp == 0xe0) { + lb = 0xa0; ub = 0xbf; + count = 3; + } + else if (*cp <= 0xec) { + lb = 0x80; ub = 0xbf; + count = 3; + } + else if (*cp == 0xed) { + lb = 0x80; ub = 0x9f; + count = 3; + } + else if (*cp <= 0xef) { + lb = 0x80; ub = 0xbf; + count = 3; + } + else if (*cp == 0xf0) { + lb = 0x90; ub = 0xbf; + count = 4; + } + else if (*cp <= 0xf3) { + lb = 0x80; ub = 0xbf; + count = 4; + } + else if (*cp == 0xf4) { + lb = 0x80; ub = 0x8f; + count = 4; + } + else { + /* invalid */ + goto error; + } + + dest[0] = *cp++; + if (*cp < lb || ub < *cp) { + goto error; + } + dest[1] = *cp++; + for (i = 2; i < count; i++) { + if (*cp < 0x80 || 0xbf < *cp) { + goto error; + } + dest[i] = *cp++; + } + dest[i] = '\0'; + + return count; + +error: + dest[0] = '\0'; + return -1; +} + +#if SIZE_MAX == MPD_SIZE_MAX + #define mpd_strtossize _mpd_strtossize +#else +#include + +static inline mpd_ssize_t +mpd_strtossize(const char *s, char **end, int base) +{ + int64_t retval; + + errno = 0; + retval = _mpd_strtossize(s, end, base); + if (errno == 0 && (retval > MPD_SSIZE_MAX || retval < MPD_SSIZE_MIN)) { + errno = ERANGE; + } + if (errno == ERANGE) { + return (retval < 0) ? MPD_SSIZE_MIN : MPD_SSIZE_MAX; + } + + return (mpd_ssize_t)retval; +} +#endif + +static int +mpd_parse_fmt_str_ex(mpd_spec_t *spec, const char *fmt, int caps) +{ + char *cp = (char *)fmt; + int have_align = 0, n; + + /* defaults */ + spec->min_width = 0; + spec->prec = -1; + spec->type = caps ? 
'G' : 'g'; + spec->align = '>'; + spec->sign = '-'; + spec->dot = ""; + spec->sep = ""; + spec->grouping = ""; + + + /* presume that the first character is a UTF-8 fill character */ + if ((n = _mpd_copy_utf8(spec->fill, cp)) < 0) { + return 0; + } + + /* alignment directive, prefixed by a fill character */ + if (*cp && (*(cp+n) == '<' || *(cp+n) == '>' || + *(cp+n) == '=' || *(cp+n) == '^')) { + cp += n; + spec->align = *cp++; + have_align = 1; + } /* alignment directive */ + else { + /* default fill character */ + spec->fill[0] = ' '; + spec->fill[1] = '\0'; + if (*cp == '<' || *cp == '>' || + *cp == '=' || *cp == '^') { + spec->align = *cp++; + have_align = 1; + } + } + + /* sign formatting */ + if (*cp == '+' || *cp == '-' || *cp == ' ') { + spec->sign = *cp++; + } + + /* zero padding */ + if (*cp == '0') { + /* zero padding implies alignment, which should not be + * specified twice. */ + if (have_align) { + return 0; + } + spec->align = 'z'; + spec->fill[0] = *cp++; + spec->fill[1] = '\0'; + } + + /* minimum width */ + if (isdigit((unsigned char)*cp)) { + if (*cp == '0') { + return 0; + } + errno = 0; + spec->min_width = mpd_strtossize(cp, &cp, 10); + if (errno == ERANGE || errno == EINVAL) { + return 0; + } + } + + /* thousands separator */ + if (*cp == ',') { + spec->dot = "."; + spec->sep = ","; + spec->grouping = "\003\003"; + cp++; + } + else if (*cp == '_') { + spec->dot = "."; + spec->sep = "_"; + spec->grouping = "\003\003"; + cp++; + } + + /* fraction digits or significant digits */ + if (*cp == '.') { + cp++; + if (!isdigit((unsigned char)*cp)) { + return 0; + } + errno = 0; + spec->prec = mpd_strtossize(cp, &cp, 10); + if (errno == ERANGE || errno == EINVAL) { + return 0; + } + } + + /* type */ + if (*cp == 'E' || *cp == 'e' || *cp == 'F' || *cp == 'f' || + *cp == 'G' || *cp == 'g' || *cp == '%') { + spec->type = *cp++; + } + else if (*cp == 'N' || *cp == 'n') { + /* locale specific conversion */ + struct lconv *lc; + /* separator has already been 
specified */ + if (*spec->sep) { + return 0; + } + spec->type = *cp++; + spec->type = (spec->type == 'N') ? 'G' : 'g'; + lc = localeconv(); + spec->dot = lc->decimal_point; + spec->sep = lc->thousands_sep; + spec->grouping = lc->grouping; + if (mpd_validate_lconv(spec) < 0) { + return 0; /* GCOV_NOT_REACHED */ + } + } + + /* check correctness */ + if (*cp != '\0') { + return 0; + } + + return 1; +} + /* Formatted representation of a PyDecObject. */ static PyObject * dec_format(PyObject *dec, PyObject *args) @@ -3239,7 +3474,7 @@ dec_format(PyObject *dec, PyObject *args) } if (size > 0 && fmt[0] == '\0') { /* NUL fill character: must be replaced with a valid UTF-8 char - before calling mpd_parse_fmt_str(). */ + before calling mpd_parse_fmt_str_ex(). */ replace_fillchar = 1; fmt = dec_strdup(fmt, size); if (fmt == NULL) { @@ -3254,7 +3489,7 @@ dec_format(PyObject *dec, PyObject *args) return NULL; } - if (!mpd_parse_fmt_str(&spec, fmt, CtxCaps(context))) { + if (!mpd_parse_fmt_str_ex(&spec, fmt, CtxCaps(context))) { PyErr_SetString(PyExc_ValueError, "invalid format string"); goto finish; @@ -3271,7 +3506,7 @@ dec_format(PyObject *dec, PyObject *args) /* Values for decimal_point, thousands_sep and grouping can be explicitly specified in the override dict. These values take precedence over the values obtained from localeconv() - in mpd_parse_fmt_str(). The feature is not documented and + in mpd_parse_fmt_str_ex(). The feature is not documented and is only used in test_decimal. 
*/ if (!PyDict_Check(override)) { PyErr_SetString(PyExc_TypeError, diff --git a/Modules/_decimal/libmpdec/io.c b/Modules/_decimal/libmpdec/io.c index 7503ffdf554e4d..e7bd6aee170056 100644 --- a/Modules/_decimal/libmpdec/io.c +++ b/Modules/_decimal/libmpdec/io.c @@ -861,12 +861,6 @@ mpd_parse_fmt_str(mpd_spec_t *spec, const char *fmt, int caps) spec->grouping = "\003\003"; cp++; } - else if (*cp == '_') { - spec->dot = "."; - spec->sep = "_"; - spec->grouping = "\003\003"; - cp++; - } /* fraction digits or significant digits */ if (*cp == '.') {