-
-
Notifications
You must be signed in to change notification settings - Fork 34.5k
gh-149079: Fix O(n^2) canonical ordering in unicodedata.normalize() #149080
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -616,6 +616,34 @@ def test_issue10254(self): | |
| b = 'C\u0338' * 20 + '\xC7' | ||
| self.assertEqual(self.db.normalize('NFC', a), b) | ||
|
|
||
| def test_long_combining_mark_run(self): | ||
| # GH-XXXXX: avoid quadratic canonical ordering. | ||
| payload = "a" + ("\u0300\u0327" * 32) | ||
| nfd = "a" + ("\u0327" * 32) + ("\u0300" * 32) | ||
| nfc = "\u00e0" + ("\u0327" * 32) + ("\u0300" * 31) | ||
|
|
||
| self.assertEqual(self.db.normalize("NFD", payload), nfd) | ||
| self.assertEqual(self.db.normalize("NFKD", payload), nfd) | ||
| self.assertEqual(self.db.normalize("NFC", payload), nfc) | ||
| self.assertEqual(self.db.normalize("NFKC", payload), nfc) | ||
|
|
||
| def test_combining_mark_run_fast_paths(self): | ||
| # GH-XXXXX: cover short runs and already-sorted long runs. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. - # GH-XXXXX: cover short runs and already-sorted long runs.
+ # gh-149079: cover short runs and already-sorted long runs. |
||
| short_payload = "a" + ("\u0300\u0327" * 9) + "\u0300" | ||
| short_nfd = "a" + ("\u0327" * 9) + ("\u0300" * 10) | ||
| short_nfc = "\u00e0" + ("\u0327" * 9) + ("\u0300" * 9) | ||
| long_sorted = "a" + ("\u0327" * 30) + ("\u0300" * 30) | ||
| long_sorted_nfc = "\u00e0" + ("\u0327" * 30) + ("\u0300" * 29) | ||
|
|
||
| self.assertEqual(self.db.normalize("NFD", short_payload), short_nfd) | ||
| self.assertEqual(self.db.normalize("NFKD", short_payload), short_nfd) | ||
| self.assertEqual(self.db.normalize("NFC", short_payload), short_nfc) | ||
| self.assertEqual(self.db.normalize("NFKC", short_payload), short_nfc) | ||
| self.assertEqual(self.db.normalize("NFD", long_sorted), long_sorted) | ||
| self.assertEqual(self.db.normalize("NFKD", long_sorted), long_sorted) | ||
| self.assertEqual(self.db.normalize("NFC", long_sorted), long_sorted_nfc) | ||
| self.assertEqual(self.db.normalize("NFKC", long_sorted), long_sorted_nfc) | ||
|
|
||
| def test_issue29456(self): | ||
| # Fix #29456 | ||
| u1176_str_a = '\u1100\u1176\u11a8' | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,5 @@ | ||||||
| Fix a potential denial of service in :func:`unicodedata.normalize`. The | ||||||
| canonical ordering step of Unicode normalization used an O(n²) insertion | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
I suggested to avoid possible rendering issues (we also say "linear-time" afterwards) |
||||||
| sort for reordering combining characters, which could be exploited with | ||||||
| crafted input containing many combining characters in non-canonical order. | ||||||
| Replaced with a linear-time counting sort for long runs. | ||||||
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -552,19 +552,88 @@ get_decomp_record(PyObject *self, Py_UCS4 code, | |||||||
| (*index)++; | ||||||||
| } | ||||||||
|
|
||||||||
| /* Small combining runs are usually cheaper with insertion sort. */ | ||||||||
| #define CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD 20 | ||||||||
|
|
||||||||
| static void | ||||||||
| canonical_ordering_sort_insertion(int kind, void *data, | ||||||||
| Py_ssize_t start, Py_ssize_t end) | ||||||||
| { | ||||||||
| for (Py_ssize_t i = start + 1; i < end; i++) { | ||||||||
| Py_UCS4 code = PyUnicode_READ(kind, data, i); | ||||||||
| unsigned char combining = _getrecord_ex(code)->combining; | ||||||||
| Py_ssize_t j = i; | ||||||||
|
|
||||||||
| while (j > start) { | ||||||||
| Py_UCS4 previous = PyUnicode_READ(kind, data, j - 1); | ||||||||
| if (_getrecord_ex(previous)->combining <= combining) { | ||||||||
| break; | ||||||||
| } | ||||||||
| PyUnicode_WRITE(kind, data, j, previous); | ||||||||
| j--; | ||||||||
| } | ||||||||
| if (j != i) { | ||||||||
| PyUnicode_WRITE(kind, data, j, code); | ||||||||
| } | ||||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| static void | ||||||||
| canonical_ordering_sort_counting(int kind, void *data, | ||||||||
| Py_ssize_t start, Py_ssize_t end, | ||||||||
| Py_UCS4 *sortbuf) | ||||||||
| { | ||||||||
| Py_ssize_t counts[256] = {0}; | ||||||||
| Py_ssize_t run_length = end - start; | ||||||||
| Py_ssize_t total = 0; | ||||||||
| unsigned char min_combining = 255; | ||||||||
| unsigned char max_combining = 0; | ||||||||
|
|
||||||||
| for (Py_ssize_t i = start; i < end; i++) { | ||||||||
| Py_UCS4 code = PyUnicode_READ(kind, data, i); | ||||||||
| unsigned char combining = _getrecord_ex(code)->combining; | ||||||||
| counts[combining]++; | ||||||||
| if (combining < min_combining) { | ||||||||
| min_combining = combining; | ||||||||
| } | ||||||||
| if (combining > max_combining) { | ||||||||
| max_combining = combining; | ||||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| for (Py_ssize_t i = min_combining; i <= max_combining; i++) { | ||||||||
| Py_ssize_t count = counts[i]; | ||||||||
| counts[i] = total; | ||||||||
| total += count; | ||||||||
| } | ||||||||
|
Comment on lines
+592
to
+608
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So we can drop the min/max stuff as I suggest privately. |
||||||||
|
|
||||||||
| /* Reuse counts[] as the next output slot for each CCC. */ | ||||||||
| for (Py_ssize_t i = start; i < end; i++) { | ||||||||
| Py_UCS4 code = PyUnicode_READ(kind, data, i); | ||||||||
| unsigned char combining = _getrecord_ex(code)->combining; | ||||||||
| sortbuf[counts[combining]++] = code; | ||||||||
| } | ||||||||
| for (Py_ssize_t i = 0; i < run_length; i++) { | ||||||||
| PyUnicode_WRITE(kind, data, start + i, sortbuf[i]); | ||||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| static PyObject* | ||||||||
| nfd_nfkd(PyObject *self, PyObject *input, int k) | ||||||||
| { | ||||||||
| PyObject *result; | ||||||||
| Py_UCS4 *output; | ||||||||
| Py_ssize_t i, o, osize; | ||||||||
| int kind; | ||||||||
| const void *data; | ||||||||
| int input_kind, result_kind; | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not reuse the same variable?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIRC, I asked to have two different variables for readability purposes. We could reuse it but when reading the code, it was cleaner when I saw the separation. But it can be reverted if you insist. |
||||||||
| const void *input_data; | ||||||||
| void *result_data; | ||||||||
| /* Longest decomposition in Unicode 3.2: U+FDFA */ | ||||||||
| Py_UCS4 stack[20]; | ||||||||
| Py_ssize_t space, isize; | ||||||||
| Py_ssize_t space, isize, length; | ||||||||
| int index, prefix, count, stackptr; | ||||||||
| unsigned char prev, cur; | ||||||||
| Py_UCS4 *sortbuf = NULL; | ||||||||
| Py_ssize_t sortbuflen = 0; | ||||||||
|
|
||||||||
| stackptr = 0; | ||||||||
| isize = PyUnicode_GET_LENGTH(input); | ||||||||
|
|
@@ -584,11 +653,11 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) | |||||||
| return NULL; | ||||||||
| } | ||||||||
| i = o = 0; | ||||||||
| kind = PyUnicode_KIND(input); | ||||||||
| data = PyUnicode_DATA(input); | ||||||||
| input_kind = PyUnicode_KIND(input); | ||||||||
| input_data = PyUnicode_DATA(input); | ||||||||
|
|
||||||||
| while (i < isize) { | ||||||||
| stack[stackptr++] = PyUnicode_READ(kind, data, i++); | ||||||||
| stack[stackptr++] = PyUnicode_READ(input_kind, input_data, i++); | ||||||||
| while(stackptr) { | ||||||||
| Py_UCS4 code = stack[--stackptr]; | ||||||||
| /* Hangul Decomposition adds three characters in | ||||||||
|
|
@@ -656,34 +725,64 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) | |||||||
| if (!result) | ||||||||
| return NULL; | ||||||||
|
|
||||||||
| kind = PyUnicode_KIND(result); | ||||||||
| data = PyUnicode_DATA(result); | ||||||||
| result_kind = PyUnicode_KIND(result); | ||||||||
| result_data = PyUnicode_DATA(result); | ||||||||
| length = PyUnicode_GET_LENGTH(result); | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is the same as |
||||||||
|
|
||||||||
| /* Sort canonically. */ | ||||||||
| /* Sort each consecutive combining-character run canonically. */ | ||||||||
| i = 0; | ||||||||
| prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining; | ||||||||
| for (i++; i < PyUnicode_GET_LENGTH(result); i++) { | ||||||||
| cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining; | ||||||||
| if (prev == 0 || cur == 0 || prev <= cur) { | ||||||||
| prev = cur; | ||||||||
| while (i < length) { | ||||||||
| Py_ssize_t run_length, run_start; | ||||||||
| int needs_sort = 0; | ||||||||
|
|
||||||||
| prev = _getrecord_ex( | ||||||||
| PyUnicode_READ(result_kind, result_data, i))->combining; | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And here, for readability, I suggested storing PyUnicode_READ() in a temp var (same some lines below). |
||||||||
| if (prev == 0) { | ||||||||
| i++; | ||||||||
| continue; | ||||||||
| } | ||||||||
| /* Non-canonical order. Need to switch *i with previous. */ | ||||||||
| o = i - 1; | ||||||||
| while (1) { | ||||||||
| Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1); | ||||||||
| PyUnicode_WRITE(kind, data, o+1, | ||||||||
| PyUnicode_READ(kind, data, o)); | ||||||||
| PyUnicode_WRITE(kind, data, o, tmp); | ||||||||
| o--; | ||||||||
| if (o < 0) | ||||||||
| break; | ||||||||
| prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining; | ||||||||
| if (prev == 0 || prev <= cur) | ||||||||
|
|
||||||||
| run_start = i++; | ||||||||
| while (i < length) { | ||||||||
| cur = _getrecord_ex( | ||||||||
| PyUnicode_READ(result_kind, result_data, i))->combining; | ||||||||
| if (cur == 0) { | ||||||||
| break; | ||||||||
| } | ||||||||
| if (prev > cur) { | ||||||||
| needs_sort = 1; | ||||||||
| } | ||||||||
| prev = cur; | ||||||||
| i++; | ||||||||
| } | ||||||||
| if (!needs_sort) { | ||||||||
| continue; | ||||||||
| } | ||||||||
|
|
||||||||
| run_length = i - run_start; | ||||||||
| if (run_length < CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD) { | ||||||||
| canonical_ordering_sort_insertion(result_kind, result_data, | ||||||||
| run_start, i); | ||||||||
| continue; | ||||||||
| } | ||||||||
| prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining; | ||||||||
|
|
||||||||
| if (run_length > sortbuflen) { | ||||||||
| Py_UCS4 *new_sortbuf = PyMem_Realloc(sortbuf, | ||||||||
| run_length * sizeof(Py_UCS4)); | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe Lines 58 to 60 in 005555a
|
||||||||
| if (new_sortbuf == NULL) { | ||||||||
| PyErr_NoMemory(); | ||||||||
| PyMem_Free(sortbuf); | ||||||||
| Py_DECREF(result); | ||||||||
| return NULL; | ||||||||
| } | ||||||||
| sortbuf = new_sortbuf; | ||||||||
| sortbuflen = run_length; | ||||||||
| } | ||||||||
|
|
||||||||
| canonical_ordering_sort_counting(result_kind, result_data, | ||||||||
| run_start, i, sortbuf); | ||||||||
| } | ||||||||
| PyMem_Free(sortbuf); | ||||||||
| return result; | ||||||||
| } | ||||||||
|
|
||||||||
|
|
||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.