gh-119182: Add PyUnicodeWriter_DecodeUTF8Stateful()#120639

serhiy-storchaka · 2024-06-20T13:09:54Z

This does nothing in a non-debug build.

Assertions are always built in _testcapi.c: the NDEBUG macro is undefined early in parts.h.

serhiy-storchaka · 2024-06-20T13:21:57Z

Also test surrogate pairs and non-BMP characters.
Since the code depends on the kind of the buffer string, you need to test different combinations: write different strings after writing a UCS2 or UCS4 string.
I suggest implementing a helper function in C that creates a PyUnicodeWriter, writes the first argument as a Python string, then converts the second argument to a wchar_t* string and writes it with the size given by an optional third argument, and returns the result. This helper function can be called from Python code with different arguments. The result will then be checked even in a non-debug build, and you can test many more cases.

-Original file line number
+Diff line change
@@ Expand Up / @@ -1551,9 +1551,17 @@ object. @@
  On success, return ``0``.
  On error, set an exception, leave the writer unchanged, and return ``-1``.
- To use a different error handler than ``strict``,
- :c:func:`PyUnicode_DecodeUTF8` can be used with
- :c:func:`PyUnicodeWriter_WriteStr`.
+ See also :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
+.. c:function:: int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, const wchar_t *str, Py_ssize_t size)
+ Write the wide string *str* into *writer*.
+ *size* is the number of wide characters. If *size* is equal to ``-1``, call
+ ``wcslen(str)`` to get the string length.
+ On success, return ``0``.
+ On error, set an exception, leave the writer unchanged, and return ``-1``.
 .. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
@@ Expand DownExpand Up / @@ -1586,3 +1594,24 @@ object. @@
  On success, return ``0``.
  On error, set an exception, leave the writer unchanged, and return ``-1``.
+.. c:function:: int PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer, const char *string, Py_ssize_t length, const char *errors, Py_ssize_t *consumed)
+ Decode the string *string* from UTF-8 with *errors* error handler and write the
+ output into *writer*.
+ *length* is the string length in bytes. If *length* is equal to ``-1``, call
+ ``strlen(string)`` to get the string length.
+ *errors* is an error handler name, such as ``"replace"``. If *errors* is
+ ``NULL``, use the strict error handler.
+ If *consumed* is not ``NULL``, set *\*consumed* to the number of decoded
+ bytes on success.
+ If *consumed* is ``NULL``, treat trailing incomplete UTF-8 byte sequences
+ as an error.
+ On success, return ``0``.
+ On error, set an exception, leave the writer unchanged, and return ``-1``.
+ See also :c:func:`PyUnicodeWriter_WriteUTF8`.

-Original file line number
+Diff line change
@@ Expand Up / @@ -291,10 +291,12 @@ New Features @@
  * :c:func:`PyUnicodeWriter_Finish`.
  * :c:func:`PyUnicodeWriter_WriteChar`.
  * :c:func:`PyUnicodeWriter_WriteUTF8`.
+ * :c:func:`PyUnicodeWriter_WriteWideChar`.
  * :c:func:`PyUnicodeWriter_WriteStr`.
  * :c:func:`PyUnicodeWriter_WriteRepr`.
  * :c:func:`PyUnicodeWriter_WriteSubstring`.
  * :c:func:`PyUnicodeWriter_Format`.
+ * :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
  (Contributed by Victor Stinner in :gh:`119182`.)
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -459,6 +459,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8( @@
  PyUnicodeWriter *writer,
  const char *str,
  Py_ssize_t size);
+PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar(
+ PyUnicodeWriter *writer,
+ const wchar_t *str,
+ Py_ssize_t size);
 PyAPI_FUNC(int) PyUnicodeWriter_WriteStr(
  PyUnicodeWriter *writer,
@@ Expand All / @@ -475,6 +479,12 @@ PyAPI_FUNC(int) PyUnicodeWriter_Format( @@
  PyUnicodeWriter *writer,
  const char *format,
  ...);
+PyAPI_FUNC(int) PyUnicodeWriter_DecodeUTF8Stateful(
+ PyUnicodeWriter *writer,
+ const char *string, /* UTF-8 encoded string */
+ Py_ssize_t length, /* size of string */
+ const char *errors, /* error handling */
+ Py_ssize_t *consumed); /* bytes consumed */
 /* --- Private _PyUnicodeWriter API --------------------------------------- */
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up @@
 }
+static PyObject *
+test_unicodewriter_decode_utf8(PyObject *self, PyObject *Py_UNUSED(args))
+{
+ // Test PyUnicodeWriter_DecodeUTF8Stateful() with consumed=NULL, using the ignore and replace error handlers; the decoded pieces are joined with a hyphen and the final string is checked with assert().
+ PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
+ if (writer == NULL){
+ return NULL;
+ }
+ if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "ign\xFFore", -1, "ignore", NULL) < 0){ // ignore handler: the invalid 0xFF byte is expected to be dropped
+ goto error;
+ }
+ if (PyUnicodeWriter_WriteChar(writer, '-') < 0){
+ goto error;
+ }
+ if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "replace\xFF", -1, "replace", NULL) < 0){ // replace handler: 0xFF is expected to become U+FFFD
+ goto error;
+ }
+ if (PyUnicodeWriter_WriteChar(writer, '-') < 0){
+ goto error;
+ }
+ // incomplete trailing UTF-8 sequence: with consumed=NULL the lone 0xC3 lead byte goes through the error handler (expected U+FFFD with the replace handler)
+ if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "incomplete\xC3", -1, "replace", NULL) < 0){
+ goto error;
+ }
+ PyObject *result = PyUnicodeWriter_Finish(writer);
+ if (result == NULL){
+ return NULL; // NOTE(review): no Discard on this path; presumably Finish() releases the writer even on failure -- confirm
+ }
+ assert(PyUnicode_EqualToUTF8(result,
+ "ignore-replace\xef\xbf\xbd"
+ "-incomplete\xef\xbf\xbd")); // EF BF BD is the UTF-8 encoding of U+FFFD
+ Py_DECREF(result);
+ Py_RETURN_NONE;
+error:
+ PyUnicodeWriter_Discard(writer); // failure path: drop the partially written writer and propagate the exception
+ return NULL;
+}
+static PyObject *
+test_unicodewriter_decode_utf8_consumed(PyObject *self, PyObject *Py_UNUSED(args))
+{
+ // Test PyUnicodeWriter_DecodeUTF8Stateful() with a non-NULL consumed argument; consumed is preset to the sentinel 12345 before each call so that the following assert proves the call stored a value.
+ PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
+ if (writer == NULL){
+ return NULL;
+ }
+ Py_ssize_t consumed;
+ // valid ASCII string: all 4 bytes are expected to be consumed
+ consumed = 12345;
+ if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "text", -1, NULL, &consumed) < 0){
+ goto error;
+ }
+ assert(consumed == 4);
+ if (PyUnicodeWriter_WriteChar(writer, '-') < 0){
+ goto error;
+ }
+ // non-ASCII: U+00E9 (2 bytes), a hyphen and U+20AC (3 bytes) make 6 bytes, passed as an explicit length
+ consumed = 12345;
+ if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "\xC3\xA9-\xE2\x82\xAC", 6, NULL, &consumed) < 0){
+ goto error;
+ }
+ assert(consumed == 6);
+ if (PyUnicodeWriter_WriteChar(writer, '-') < 0){
+ goto error;
+ }
+ // strict handler (errors=NULL) on the invalid byte 0xFF: the call must fail and consumed is expected to be reset to 0. The call itself sits inside assert(), so it only runs when assertions are compiled in.
+ consumed = 12345;
+ assert(PyUnicodeWriter_DecodeUTF8Stateful(writer, "invalid\xFF", -1, NULL, &consumed) < 0);
+ PyErr_Clear(); // the failure above is expected; clear its exception before continuing
+ assert(consumed == 0);
+ // ignore error handler: the invalid 0xFF byte is dropped from the output but still counted, so consumed is expected to be 5
+ consumed = 12345;
+ if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "more\xFF", -1, "ignore", &consumed) < 0){
+ goto error;
+ }
+ assert(consumed == 5);
+ if (PyUnicodeWriter_WriteChar(writer, '-') < 0){
+ goto error;
+ }
+ // incomplete trailing UTF-8 sequence: with consumed non-NULL the lone 0xC3 lead byte is expected to be left unconsumed (10 of 11 bytes) and not written
+ consumed = 12345;
+ if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "incomplete\xC3", -1, "ignore", &consumed) < 0){
+ goto error;
+ }
+ assert(consumed == 10);
+ PyObject *result = PyUnicodeWriter_Finish(writer);
+ if (result == NULL){
+ return NULL; // NOTE(review): no Discard on this path; presumably Finish() releases the writer even on failure -- confirm
+ }
+ assert(PyUnicode_EqualToUTF8(result,
+ "text-\xC3\xA9-\xE2\x82\xAC-"
+ "more-incomplete")); // nothing from the failed strict call and no trailing 0xC3 in the output
+ Py_DECREF(result);
+ Py_RETURN_NONE;
+error:
+ PyUnicodeWriter_Discard(writer); // failure path: drop the partially written writer and propagate the exception
+ return NULL;
+}
 static PyObject *
 test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args))
 {
@@ Expand DownExpand Up @@
 }
+static PyObject *
+test_unicodewriter_widechar(PyObject *self, PyObject *Py_UNUSED(args))
+{ // Test PyUnicodeWriter_WriteWideChar() with explicit sizes and with size=-1; the final string is checked with assert().
+ PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
+ if (writer == NULL){
+ return NULL;
+ }
+ if (PyUnicodeWriter_WriteWideChar(writer, L"latin1=\xE9 IGNORED", 8) < 0){ // size=8 ends right after U+00E9, so the rest of the literal must not be written
+ goto error;
+ }
+ if (PyUnicodeWriter_WriteWideChar(writer, L"-", 1) < 0){
+ goto error;
+ }
+ if (PyUnicodeWriter_WriteWideChar(writer, L"euro=\u20AC", -1) < 0){ // size=-1: the whole NUL-terminated wide string is written
+ goto error;
+ }
+ if (PyUnicodeWriter_WriteChar(writer, '.') < 0){
+ goto error;
+ }
+ PyObject *result = PyUnicodeWriter_Finish(writer);
+ if (result == NULL){
+ return NULL; // NOTE(review): no Discard on this path; presumably Finish() releases the writer even on failure -- confirm
+ }
+ assert(PyUnicode_EqualToUTF8(result,
+ "latin1=\xC3\xA9-euro=\xE2\x82\xAC.")); // expected value spelled as UTF-8 bytes: U+00E9 is C3 A9, U+20AC is E2 82 AC
+ Py_DECREF(result);
+ Py_RETURN_NONE;
+error:
+ PyUnicodeWriter_Discard(writer); // failure path: drop the partially written writer and propagate the exception
+ return NULL;
+}
 static PyMethodDef TestMethods[] ={
 {"unicode_new", unicode_new, METH_VARARGS},
 {"unicode_fill", unicode_fill, METH_VARARGS},
@@ Expand All / @@ -448,8 +597,11 @@ static PyMethodDef TestMethods[] ={ @@
 {"test_unicodewriter_utf8", test_unicodewriter_utf8, METH_NOARGS},
 {"test_unicodewriter_invalid_utf8", test_unicodewriter_invalid_utf8, METH_NOARGS},
 {"test_unicodewriter_recover_error", test_unicodewriter_recover_error, METH_NOARGS},
+{"test_unicodewriter_decode_utf8", test_unicodewriter_decode_utf8, METH_NOARGS},
+{"test_unicodewriter_decode_utf8_consumed", test_unicodewriter_decode_utf8_consumed, METH_NOARGS},
 {"test_unicodewriter_format", test_unicodewriter_format, METH_NOARGS},
 {"test_unicodewriter_format_recover_error", test_unicodewriter_format_recover_error, METH_NOARGS},
+{"test_unicodewriter_widechar", test_unicodewriter_widechar, METH_NOARGS},
 {NULL},
 };
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

gh-119182: Add PyUnicodeWriter_DecodeUTF8Stateful()#120639

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Uh oh!

Uh oh!

serhiy-storchakaJun 20, 2024

Uh oh!

vstinnerJun 20, 2024

Uh oh!

Uh oh!

serhiy-storchakaJun 20, 2024

Uh oh!

Uh oh!

Uh oh!

gh-119182: Add PyUnicodeWriter_DecodeUTF8Stateful()#120639

Uh oh!

gh-119182: Add PyUnicodeWriter_DecodeUTF8Stateful() #120639

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Uh oh!

Uh oh!

serhiy-storchakaJun 20, 2024

Choose a reason for hiding this comment

Uh oh!

vstinnerJun 20, 2024

Choose a reason for hiding this comment

Uh oh!

Uh oh!

serhiy-storchakaJun 20, 2024

Choose a reason for hiding this comment

Uh oh!

Uh oh!