diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 8ecb0df2f8e5dd..3f569e6baee676 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -616,6 +616,34 @@ def test_issue10254(self): b = 'C\u0338' * 20 + '\xC7' self.assertEqual(self.db.normalize('NFC', a), b) + def test_long_combining_mark_run(self): + # GH-XXXXX: avoid quadratic canonical ordering. + payload = "a" + ("\u0300\u0327" * 32) + nfd = "a" + ("\u0327" * 32) + ("\u0300" * 32) + nfc = "\u00e0" + ("\u0327" * 32) + ("\u0300" * 31) + + self.assertEqual(self.db.normalize("NFD", payload), nfd) + self.assertEqual(self.db.normalize("NFKD", payload), nfd) + self.assertEqual(self.db.normalize("NFC", payload), nfc) + self.assertEqual(self.db.normalize("NFKC", payload), nfc) + + def test_combining_mark_run_fast_paths(self): + # GH-XXXXX: cover short runs and already-sorted long runs. + short_payload = "a" + ("\u0300\u0327" * 9) + "\u0300" + short_nfd = "a" + ("\u0327" * 9) + ("\u0300" * 10) + short_nfc = "\u00e0" + ("\u0327" * 9) + ("\u0300" * 9) + long_sorted = "a" + ("\u0327" * 30) + ("\u0300" * 30) + long_sorted_nfc = "\u00e0" + ("\u0327" * 30) + ("\u0300" * 29) + + self.assertEqual(self.db.normalize("NFD", short_payload), short_nfd) + self.assertEqual(self.db.normalize("NFKD", short_payload), short_nfd) + self.assertEqual(self.db.normalize("NFC", short_payload), short_nfc) + self.assertEqual(self.db.normalize("NFKC", short_payload), short_nfc) + self.assertEqual(self.db.normalize("NFD", long_sorted), long_sorted) + self.assertEqual(self.db.normalize("NFKD", long_sorted), long_sorted) + self.assertEqual(self.db.normalize("NFC", long_sorted), long_sorted_nfc) + self.assertEqual(self.db.normalize("NFKC", long_sorted), long_sorted_nfc) + def test_issue29456(self): # Fix #29456 u1176_str_a = '\u1100\u1176\u11a8' diff --git a/Misc/NEWS.d/next/Security/2026-04-27-16-36-11.gh-issue-149079.vKl-LM.rst b/Misc/NEWS.d/next/Security/2026-04-27-16-36-11.gh-issue-149079.vKl-LM.rst new file mode 100644 index 00000000000000..72e8374acffb1f --- /dev/null +++ b/Misc/NEWS.d/next/Security/2026-04-27-16-36-11.gh-issue-149079.vKl-LM.rst @@ -0,0 +1,5 @@ +Fix a potential denial of service in :func:`unicodedata.normalize`. The +canonical ordering step of Unicode normalization used an O(n²) insertion +sort for reordering combining characters, which could be exploited with +crafted input containing many combining characters in non-canonical order. +Replaced with a linear-time counting sort for long runs. diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 55b33a76e7af8a..9ac2faffb40bfb 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -552,19 +552,88 @@ get_decomp_record(PyObject *self, Py_UCS4 code, (*index)++; } +/* Small combining runs are usually cheaper with insertion sort. */ +#define CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD 20 + +static void +canonical_ordering_sort_insertion(int kind, void *data, + Py_ssize_t start, Py_ssize_t end) +{ + for (Py_ssize_t i = start + 1; i < end; i++) { + Py_UCS4 code = PyUnicode_READ(kind, data, i); + unsigned char combining = _getrecord_ex(code)->combining; + Py_ssize_t j = i; + + while (j > start) { + Py_UCS4 previous = PyUnicode_READ(kind, data, j - 1); + if (_getrecord_ex(previous)->combining <= combining) { + break; + } + PyUnicode_WRITE(kind, data, j, previous); + j--; + } + if (j != i) { + PyUnicode_WRITE(kind, data, j, code); + } + } +} + +static void +canonical_ordering_sort_counting(int kind, void *data, + Py_ssize_t start, Py_ssize_t end, + Py_UCS4 *sortbuf) +{ + Py_ssize_t counts[256] = {0}; + Py_ssize_t run_length = end - start; + Py_ssize_t total = 0; + unsigned char min_combining = 255; + unsigned char max_combining = 0; + + for (Py_ssize_t i = start; i < end; i++) { + Py_UCS4 code = PyUnicode_READ(kind, data, i); + unsigned char combining = _getrecord_ex(code)->combining; + counts[combining]++; + if (combining < min_combining) { + min_combining = combining; + } + if (combining > max_combining) { + max_combining = combining; + } + } + + for (Py_ssize_t i = min_combining; i <= max_combining; i++) { + Py_ssize_t count = counts[i]; + counts[i] = total; + total += count; + } + + /* Reuse counts[] as the next output slot for each CCC. */ + for (Py_ssize_t i = start; i < end; i++) { + Py_UCS4 code = PyUnicode_READ(kind, data, i); + unsigned char combining = _getrecord_ex(code)->combining; + sortbuf[counts[combining]++] = code; + } + for (Py_ssize_t i = 0; i < run_length; i++) { + PyUnicode_WRITE(kind, data, start + i, sortbuf[i]); + } +} + static PyObject* nfd_nfkd(PyObject *self, PyObject *input, int k) { PyObject *result; Py_UCS4 *output; Py_ssize_t i, o, osize; - int kind; - const void *data; + int input_kind, result_kind; + const void *input_data; + void *result_data; /* Longest decomposition in Unicode 3.2: U+FDFA */ Py_UCS4 stack[20]; - Py_ssize_t space, isize; + Py_ssize_t space, isize, length; int index, prefix, count, stackptr; unsigned char prev, cur; + Py_UCS4 *sortbuf = NULL; + Py_ssize_t sortbuflen = 0; stackptr = 0; isize = PyUnicode_GET_LENGTH(input); @@ -584,11 +653,11 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) return NULL; } i = o = 0; - kind = PyUnicode_KIND(input); - data = PyUnicode_DATA(input); + input_kind = PyUnicode_KIND(input); + input_data = PyUnicode_DATA(input); while (i < isize) { - stack[stackptr++] = PyUnicode_READ(kind, data, i++); + stack[stackptr++] = PyUnicode_READ(input_kind, input_data, i++); while(stackptr) { Py_UCS4 code = stack[--stackptr]; /* Hangul Decomposition adds three characters in @@ -656,34 +725,64 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) if (!result) return NULL; - kind = PyUnicode_KIND(result); - data = PyUnicode_DATA(result); + result_kind = PyUnicode_KIND(result); + result_data = PyUnicode_DATA(result); + length = PyUnicode_GET_LENGTH(result); - /* Sort canonically. */ + /* Sort each consecutive combining-character run canonically. */ i = 0; - prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining; - for (i++; i < PyUnicode_GET_LENGTH(result); i++) { - cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining; - if (prev == 0 || cur == 0 || prev <= cur) { - prev = cur; + while (i < length) { + Py_ssize_t run_length, run_start; + int needs_sort = 0; + + prev = _getrecord_ex( + PyUnicode_READ(result_kind, result_data, i))->combining; + if (prev == 0) { + i++; continue; } - /* Non-canonical order. Need to switch *i with previous. */ - o = i - 1; - while (1) { - Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1); - PyUnicode_WRITE(kind, data, o+1, - PyUnicode_READ(kind, data, o)); - PyUnicode_WRITE(kind, data, o, tmp); - o--; - if (o < 0) - break; - prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining; - if (prev == 0 || prev <= cur) + + run_start = i++; + while (i < length) { + cur = _getrecord_ex( + PyUnicode_READ(result_kind, result_data, i))->combining; + if (cur == 0) { break; + } + if (prev > cur) { + needs_sort = 1; + } + prev = cur; + i++; + } + if (!needs_sort) { + continue; + } + + run_length = i - run_start; + if (run_length < CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD) { + canonical_ordering_sort_insertion(result_kind, result_data, + run_start, i); + continue; } - prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining; + + if (run_length > sortbuflen) { + Py_UCS4 *new_sortbuf = PyMem_Realloc(sortbuf, + run_length * sizeof(Py_UCS4)); + if (new_sortbuf == NULL) { + PyErr_NoMemory(); + PyMem_Free(sortbuf); + Py_DECREF(result); + return NULL; + } + sortbuf = new_sortbuf; + sortbuflen = run_length; + } + + canonical_ordering_sort_counting(result_kind, result_data, + run_start, i, sortbuf); } + PyMem_Free(sortbuf); return result; }