Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,34 @@ def test_issue10254(self):
b = 'C\u0338' * 20 + '\xC7'
self.assertEqual(self.db.normalize('NFC', a), b)

def test_long_combining_mark_run(self):
# GH-XXXXX: avoid quadratic canonical ordering.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-        # GH-XXXXX: avoid quadratic canonical ordering.
+        # gh-149079: avoid quadratic canonical ordering.

payload = "a" + ("\u0300\u0327" * 32)
nfd = "a" + ("\u0327" * 32) + ("\u0300" * 32)
nfc = "\u00e0" + ("\u0327" * 32) + ("\u0300" * 31)

self.assertEqual(self.db.normalize("NFD", payload), nfd)
self.assertEqual(self.db.normalize("NFKD", payload), nfd)
self.assertEqual(self.db.normalize("NFC", payload), nfc)
self.assertEqual(self.db.normalize("NFKC", payload), nfc)

def test_combining_mark_run_fast_paths(self):
# GH-XXXXX: cover short runs and already-sorted long runs.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-        # GH-XXXXX: cover short runs and already-sorted long runs.
+        # gh-149079: cover short runs and already-sorted long runs.

short_payload = "a" + ("\u0300\u0327" * 9) + "\u0300"
short_nfd = "a" + ("\u0327" * 9) + ("\u0300" * 10)
short_nfc = "\u00e0" + ("\u0327" * 9) + ("\u0300" * 9)
long_sorted = "a" + ("\u0327" * 30) + ("\u0300" * 30)
long_sorted_nfc = "\u00e0" + ("\u0327" * 30) + ("\u0300" * 29)

self.assertEqual(self.db.normalize("NFD", short_payload), short_nfd)
self.assertEqual(self.db.normalize("NFKD", short_payload), short_nfd)
self.assertEqual(self.db.normalize("NFC", short_payload), short_nfc)
self.assertEqual(self.db.normalize("NFKC", short_payload), short_nfc)
self.assertEqual(self.db.normalize("NFD", long_sorted), long_sorted)
self.assertEqual(self.db.normalize("NFKD", long_sorted), long_sorted)
self.assertEqual(self.db.normalize("NFC", long_sorted), long_sorted_nfc)
self.assertEqual(self.db.normalize("NFKC", long_sorted), long_sorted_nfc)

def test_issue29456(self):
# Fix #29456
u1176_str_a = '\u1100\u1176\u11a8'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Fix a potential denial of service in :func:`unicodedata.normalize`. The
canonical ordering step of Unicode normalization used a quadratic-time
insertion sort for reordering combining characters, which could be exploited
with crafted input containing many combining characters in non-canonical
order. Replaced with a linear-time counting sort for long runs.
153 changes: 126 additions & 27 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -552,19 +552,88 @@ get_decomp_record(PyObject *self, Py_UCS4 code,
(*index)++;
}

/* Small combining runs are usually cheaper with insertion sort. */
#define CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD 20

/* Reorder the code points in [start, end) of the string `data` (of the
   given PyUnicode kind) by canonical combining class, using an in-place
   insertion sort.  The sort is stable, as required by the Unicode
   canonical ordering algorithm: characters with equal combining classes
   keep their original relative order. */
static void
canonical_ordering_sort_insertion(int kind, void *data,
                                  Py_ssize_t start, Py_ssize_t end)
{
    for (Py_ssize_t pos = start + 1; pos < end; pos++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        unsigned char ccc = _getrecord_ex(ch)->combining;

        /* Shift every preceding character with a strictly greater
           combining class one slot to the right, opening a hole for
           `ch` at its canonical position. */
        Py_ssize_t slot = pos;
        while (slot > start) {
            Py_UCS4 before = PyUnicode_READ(kind, data, slot - 1);
            if (_getrecord_ex(before)->combining <= ccc) {
                break;
            }
            PyUnicode_WRITE(kind, data, slot, before);
            slot--;
        }
        /* Only write back when the character actually moved. */
        if (slot != pos) {
            PyUnicode_WRITE(kind, data, slot, ch);
        }
    }
}

/* Reorder the code points in [start, end) of the string `data` (of the
   given PyUnicode kind) by canonical combining class, using a stable
   counting sort.  `sortbuf` is caller-provided scratch space with room
   for at least (end - start) Py_UCS4 values.  Runs in O(n) time for a
   run of length n, which keeps normalization linear even for crafted
   input with very long combining-character runs (gh-149079). */
static void
canonical_ordering_sort_counting(int kind, void *data,
                                 Py_ssize_t start, Py_ssize_t end,
                                 Py_UCS4 *sortbuf)
{
    /* Canonical combining classes fit in an unsigned char (0..255). */
    Py_ssize_t counts[256] = {0};
    Py_ssize_t run_length = end - start;
    Py_ssize_t total = 0;

    /* Histogram of combining classes in the run. */
    for (Py_ssize_t i = start; i < end; i++) {
        Py_UCS4 code = PyUnicode_READ(kind, data, i);
        counts[_getrecord_ex(code)->combining]++;
    }

    /* Exclusive prefix sum: counts[c] becomes the first output index
       for class c.  Scanning all 256 classes is constant work, so no
       min/max tracking is needed. */
    for (int c = 0; c < 256; c++) {
        Py_ssize_t count = counts[c];
        counts[c] = total;
        total += count;
    }

    /* Stable distribution pass; counts[] now tracks the next output
       slot for each combining class. */
    for (Py_ssize_t i = start; i < end; i++) {
        Py_UCS4 code = PyUnicode_READ(kind, data, i);
        sortbuf[counts[_getrecord_ex(code)->combining]++] = code;
    }
    for (Py_ssize_t i = 0; i < run_length; i++) {
        PyUnicode_WRITE(kind, data, start + i, sortbuf[i]);
    }
}

static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
PyObject *result;
Py_UCS4 *output;
Py_ssize_t i, o, osize;
int kind;
const void *data;
int input_kind, result_kind;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not reuse the same variable?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC, I asked to have two different variables for readability purposes. We could reuse it but when reading the code, it was cleaner when I saw the separation. But it can be reverted if you insist.

const void *input_data;
void *result_data;
/* Longest decomposition in Unicode 3.2: U+FDFA */
Py_UCS4 stack[20];
Py_ssize_t space, isize;
Py_ssize_t space, isize, length;
int index, prefix, count, stackptr;
unsigned char prev, cur;
Py_UCS4 *sortbuf = NULL;
Py_ssize_t sortbuflen = 0;

stackptr = 0;
isize = PyUnicode_GET_LENGTH(input);
Expand All @@ -584,11 +653,11 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
return NULL;
}
i = o = 0;
kind = PyUnicode_KIND(input);
data = PyUnicode_DATA(input);
input_kind = PyUnicode_KIND(input);
input_data = PyUnicode_DATA(input);

while (i < isize) {
stack[stackptr++] = PyUnicode_READ(kind, data, i++);
stack[stackptr++] = PyUnicode_READ(input_kind, input_data, i++);
while(stackptr) {
Py_UCS4 code = stack[--stackptr];
/* Hangul Decomposition adds three characters in
Expand Down Expand Up @@ -656,34 +725,64 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
if (!result)
return NULL;

kind = PyUnicode_KIND(result);
data = PyUnicode_DATA(result);
result_kind = PyUnicode_KIND(result);
result_data = PyUnicode_DATA(result);
length = PyUnicode_GET_LENGTH(result);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is the same as o.


/* Sort canonically. */
/* Sort each consecutive combining-character run canonically. */
i = 0;
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
if (prev == 0 || cur == 0 || prev <= cur) {
prev = cur;
while (i < length) {
Py_ssize_t run_length, run_start;
int needs_sort = 0;

prev = _getrecord_ex(
PyUnicode_READ(result_kind, result_data, i))->combining;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And here, for readability, I suggested storing PyUnicode_READ() in a temp var (same some lines below).

if (prev == 0) {
i++;
continue;
}
/* Non-canonical order. Need to switch *i with previous. */
o = i - 1;
while (1) {
Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
PyUnicode_WRITE(kind, data, o+1,
PyUnicode_READ(kind, data, o));
PyUnicode_WRITE(kind, data, o, tmp);
o--;
if (o < 0)
break;
prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
if (prev == 0 || prev <= cur)

run_start = i++;
while (i < length) {
cur = _getrecord_ex(
PyUnicode_READ(result_kind, result_data, i))->combining;
if (cur == 0) {
break;
}
if (prev > cur) {
needs_sort = 1;
}
prev = cur;
i++;
}
if (!needs_sort) {
continue;
}

run_length = i - run_start;
if (run_length < CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD) {
canonical_ordering_sort_insertion(result_kind, result_data,
run_start, i);
continue;
}
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;

if (run_length > sortbuflen) {
Py_UCS4 *new_sortbuf = PyMem_Realloc(sortbuf,
run_length * sizeof(Py_UCS4));
Copy link
Copy Markdown
Contributor

@maurycy maurycy Apr 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe PyMem_Resize instead of calculating manually?

cpython/Include/pymem.h

Lines 58 to 60 in 005555a

* or NULL if the request was too large or memory allocation failed. Use
* these macros rather than doing the multiplication yourself so that proper
* overflow checking is always done.

if (new_sortbuf == NULL) {
PyErr_NoMemory();
PyMem_Free(sortbuf);
Py_DECREF(result);
return NULL;
}
sortbuf = new_sortbuf;
sortbuflen = run_length;
}

canonical_ordering_sort_counting(result_kind, result_data,
run_start, i, sortbuf);
}
PyMem_Free(sortbuf);
return result;
}

Expand Down
Loading