pFad - Phone/Frame/Anonymizer/Declutterfier! Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

URL: http://github.com/python/cpython/commit/eb50cd37eac47dd4dc71ab42d0582dfb6eac4515

gh-110289: C API: Add PyUnicode_EqualToUTF8() and PyUnicode_EqualToUTF8AndSize() functions (GH-110297) · python/cpython@eb50cd3 · GitHub
Skip to content

Commit eb50cd3

Browse files
gh-110289: C API: Add PyUnicode_EqualToUTF8() and PyUnicode_EqualToUTF8AndSize() functions (GH-110297)
1 parent d1f7fae commit eb50cd3

File tree

11 files changed

+280
-0
lines changed

11 files changed

+280
-0
lines changed

Doc/c-api/unicode.rst

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1396,6 +1396,28 @@ They all return ``NULL`` or ``-1`` if an exception occurs.
13961396
:c:func:`PyErr_Occurred` to check for errors.
13971397
13981398
1399+
.. c:function:: int PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *string, Py_ssize_t size)
1400+
1401+
Compare a Unicode object with a char buffer which is interpreted as
1402+
being UTF-8 or ASCII encoded and return true (``1``) if they are equal,
1403+
or false (``0``) otherwise.
1404+
If the Unicode object contains surrogate characters or
1405+
the C string is not valid UTF-8, false (``0``) is returned.
1406+
1407+
This function does not raise exceptions.
1408+
1409+
.. versionadded:: 3.13
1410+
1411+
1412+
.. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string)
1413+
1414+
Similar to :c:func:`PyUnicode_EqualToUTF8AndSize`, but compute *string*
1415+
length using :c:func:`!strlen`.
1416+
If the Unicode object contains null characters, false (``0``) is returned.
1417+
1418+
.. versionadded:: 3.13
1419+
1420+
13991421
.. c:function:: int PyUnicode_CompareWithASCIIString(PyObject *uni, const char *string)
14001422
14011423
Compare a Unicode object, *uni*, with *string* and return ``-1``, ``0``, ``1`` for less

Doc/data/stable_abi.dat

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Doc/whatsnew/3.13.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1024,6 +1024,12 @@ New Features
10241024
functions on Python 3.11 and 3.12.
10251025
(Contributed by Victor Stinner in :gh:`107073`.)
10261026

1027+
* Add :c:func:`PyUnicode_EqualToUTF8AndSize` and :c:func:`PyUnicode_EqualToUTF8`
1028+
functions: compare Unicode object with a :c:expr:`const char*` UTF-8 encoded
1029+
string and return true (``1``) if they are equal, or false (``0``) otherwise.
1030+
These functions do not raise exceptions.
1031+
(Contributed by Serhiy Storchaka in :gh:`110289`.)
1032+
10271033
* Add :c:func:`PyThreadState_GetUnchecked()` function: similar to
10281034
:c:func:`PyThreadState_Get()`, but don't kill the process with a fatal error
10291035
if it is NULL. The caller is responsible to check if the result is NULL.

Include/unicodeobject.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -957,6 +957,15 @@ PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
957957
const char *right /* ASCII-encoded string */
958958
);
959959

960+
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000
/* Visible in the full API, or in the limited API when Py_LIMITED_API
   selects version 0x030D0000 or newer. */
961+
/* Compare a Unicode object with UTF-8 encoded C string.
962+
Return 1 if they are equal, or 0 otherwise.
963+
This function does not raise exceptions. */
964+
965+
/* Takes the Unicode object and a C string; no length argument. */
PyAPI_FUNC(int) PyUnicode_EqualToUTF8(PyObject *, const char *);
966+
/* Same comparison, but the length of the C string (in bytes) is passed
   explicitly as the last argument. */
PyAPI_FUNC(int) PyUnicode_EqualToUTF8AndSize(PyObject *, const char *, Py_ssize_t);
967+
#endif
968+
960969
/* Rich compare two strings and return one of the following:
961970
962971
- NULL in case an exception was raised

Lib/test/test_capi/test_unicode.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1297,6 +1297,118 @@ def test_comparewithasciistring(self):
12971297
# CRASHES comparewithasciistring([], b'abc')
12981298
# CRASHES comparewithasciistring(NULL, b'abc')
12991299

1300+
@support.cpython_only
1301+
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
1302+
def test_equaltoutf8(self):
1303+
# Test PyUnicode_EqualToUTF8()
# The helper returns the C result as an int: 1 for equal, 0 for not equal.
1304+
from _testcapi import unicode_equaltoutf8 as equaltoutf8
1305+
from _testcapi import unicode_asutf8andsize as asutf8andsize
1306+
1307+
# Sample strings whose UTF-8 encodings use 1, 2, 3 and 4 bytes per
# character, plus the highest code point.
strings = [
1308+
'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16',
1309+
'\U0001f600\U0001f601\U0001f602',
1310+
'\U0010ffff',
1311+
]
1312+
for s in strings:
1313+
# Call PyUnicode_AsUTF8AndSize() which creates the UTF-8
1314+
# encoded string cached in the Unicode object.
1315+
asutf8andsize(s, 0)
1316+
b = s.encode()
1317+
self.assertEqual(equaltoutf8(s, b), 1) # Use the UTF-8 cache.
1318+
s2 = b.decode() # New Unicode object without the UTF-8 cache.
1319+
self.assertEqual(equaltoutf8(s2, b), 1)
1320+
self.assertEqual(equaltoutf8(s + 'x', b + b'x'), 1)
1321+
self.assertEqual(equaltoutf8(s + 'x', b + b'y'), 0)
1322+
# The C string length is taken with strlen(), so a trailing null byte
# in the buffer is not part of the compared data ...
self.assertEqual(equaltoutf8(s, b + b'\0'), 1)
1323+
self.assertEqual(equaltoutf8(s2, b + b'\0'), 1)
1324+
# ... and a str that contains a null character never compares equal.
self.assertEqual(equaltoutf8(s + '\0', b + b'\0'), 0)
1325+
self.assertEqual(equaltoutf8(s + '\0', b), 0)
1326+
# Buffer longer than, shorter than, or differing in its last byte from
# the expected encoding.
self.assertEqual(equaltoutf8(s2, b + b'x'), 0)
1327+
self.assertEqual(equaltoutf8(s2, b[:-1]), 0)
1328+
self.assertEqual(equaltoutf8(s2, b[:-1] + b'x'), 0)
1329+
1330+
# Empty string against an empty C string.
self.assertEqual(equaltoutf8('', b''), 1)
1331+
self.assertEqual(equaltoutf8('', b'\0'), 1)
1332+
1333+
# embedded null chars/bytes
1334+
self.assertEqual(equaltoutf8('abc', b'abc\0def\0'), 1)
1335+
self.assertEqual(equaltoutf8('a\0bc', b'abc'), 0)
1336+
self.assertEqual(equaltoutf8('abc', b'a\0bc'), 0)
1337+
1338+
# Surrogate characters are always treated as not equal
1339+
self.assertEqual(equaltoutf8('\udcfe',
1340+
'\udcfe'.encode("utf8", "surrogateescape")), 0)
1341+
self.assertEqual(equaltoutf8('\udcfe',
1342+
'\udcfe'.encode("utf8", "surrogatepass")), 0)
1343+
self.assertEqual(equaltoutf8('\ud801',
1344+
'\ud801'.encode("utf8", "surrogatepass")), 0)
1345+
1346+
@support.cpython_only
1347+
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
1348+
def test_equaltoutf8andsize(self):
1349+
# Test PyUnicode_EqualToUTF8AndSize()
# The helper takes an optional third argument, the size passed to the C
# function; when it is omitted the full length of the bytes object is used.
1350+
from _testcapi import unicode_equaltoutf8andsize as equaltoutf8andsize
1351+
from _testcapi import unicode_asutf8andsize as asutf8andsize
1352+
1353+
# Sample strings whose UTF-8 encodings use 1, 2, 3 and 4 bytes per
# character, plus the highest code point.
strings = [
1354+
'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16',
1355+
'\U0001f600\U0001f601\U0001f602',
1356+
'\U0010ffff',
1357+
]
1358+
for s in strings:
1359+
# Call PyUnicode_AsUTF8AndSize() which creates the UTF-8
1360+
# encoded string cached in the Unicode object.
1361+
asutf8andsize(s, 0)
1362+
b = s.encode()
1363+
self.assertEqual(equaltoutf8andsize(s, b), 1) # Use the UTF-8 cache.
1364+
s2 = b.decode() # New Unicode object without the UTF-8 cache.
1365+
self.assertEqual(equaltoutf8andsize(s2, b), 1)
1366+
self.assertEqual(equaltoutf8andsize(s + 'x', b + b'x'), 1)
1367+
self.assertEqual(equaltoutf8andsize(s + 'x', b + b'y'), 0)
1368+
# With an explicit size, a trailing null byte is part of the compared
# data, so it must be matched by a null character in the str.
self.assertEqual(equaltoutf8andsize(s, b + b'\0'), 0)
1369+
self.assertEqual(equaltoutf8andsize(s2, b + b'\0'), 0)
1370+
self.assertEqual(equaltoutf8andsize(s + '\0', b + b'\0'), 1)
1371+
self.assertEqual(equaltoutf8andsize(s + '\0', b), 0)
1372+
# Buffer longer than, shorter than, or differing in its last byte from
# the expected encoding.
self.assertEqual(equaltoutf8andsize(s2, b + b'x'), 0)
1373+
self.assertEqual(equaltoutf8andsize(s2, b[:-1]), 0)
1374+
self.assertEqual(equaltoutf8andsize(s2, b[:-1] + b'x'), 0)
1375+
# Not null-terminated: the explicit size covers only a prefix of the buffer.
1376+
self.assertEqual(equaltoutf8andsize(s, b + b'x', len(b)), 1)
1377+
self.assertEqual(equaltoutf8andsize(s2, b + b'x', len(b)), 1)
1378+
self.assertEqual(equaltoutf8andsize(s + '\0', b + b'\0x', len(b) + 1), 1)
1379+
self.assertEqual(equaltoutf8andsize(s2, b, len(b) - 1), 0)
1380+
1381+
# Empty string: equal only to a zero-length buffer.
self.assertEqual(equaltoutf8andsize('', b''), 1)
1382+
self.assertEqual(equaltoutf8andsize('', b'\0'), 0)
1383+
self.assertEqual(equaltoutf8andsize('', b'x', 0), 1)
1384+
1385+
# embedded null chars/bytes
1386+
self.assertEqual(equaltoutf8andsize('abc\0def', b'abc\0def'), 1)
1387+
self.assertEqual(equaltoutf8andsize('abc\0def\0', b'abc\0def\0'), 1)
1388+
1389+
# Surrogate characters are always treated as not equal
1390+
self.assertEqual(equaltoutf8andsize('\udcfe',
1391+
'\udcfe'.encode("utf8", "surrogateescape")), 0)
1392+
self.assertEqual(equaltoutf8andsize('\udcfe',
1393+
'\udcfe'.encode("utf8", "surrogatepass")), 0)
1394+
self.assertEqual(equaltoutf8andsize('\ud801',
1395+
'\ud801'.encode("utf8", "surrogatepass")), 0)
1396+
1397+
# Helper: text encoded with a non-UTF-8 codec must compare as not equal;
# the second assertion checks that the codec really produces different
# bytes than UTF-8 for this text.
def check_not_equal_encoding(text, encoding):
1398+
self.assertEqual(equaltoutf8andsize(text, text.encode(encoding)), 0)
1399+
self.assertNotEqual(text.encode(encoding), text.encode("utf8"))
1400+
1401+
# Strings encoded with other encodings are not equal to the expected UTF-8 encoded string
1402+
check_not_equal_encoding('Stéphane', 'latin1')
1403+
check_not_equal_encoding('Stéphane', 'utf-16-le') # embedded null characters
1404+
check_not_equal_encoding('北京市', 'gbk')
1405+
1406+
# The following calls are known to crash and are therefore left disabled.
# CRASHES equaltoutf8andsize('abc', b'abc', -1)
1407+
# CRASHES equaltoutf8andsize(b'abc', b'abc')
1408+
# CRASHES equaltoutf8andsize([], b'abc')
1409+
# CRASHES equaltoutf8andsize(NULL, b'abc')
1410+
# CRASHES equaltoutf8andsize('abc', NULL)
1411+
13001412
@support.cpython_only
13011413
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
13021414
def test_richcompare(self):

Lib/test/test_stable_abi_ctypes.py

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add :c:func:`PyUnicode_EqualToUTF8AndSize` and :c:func:`PyUnicode_EqualToUTF8` functions.

Misc/stable_abi.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2462,3 +2462,7 @@
24622462
added = '3.13'
24632463
[function.Py_IsFinalizing]
24642464
added = '3.13'
2465+
[function.PyUnicode_EqualToUTF8]
2466+
added = '3.13'
2467+
[function.PyUnicode_EqualToUTF8AndSize]
2468+
added = '3.13'

Modules/_testcapi/unicode.c

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1429,6 +1429,48 @@ unicode_comparewithasciistring(PyObject *self, PyObject *args)
14291429
return PyLong_FromLong(result);
14301430
}
14311431

1432+
/* Test PyUnicode_EqualToUTF8() */
/* Python-callable wrapper: args is (left, right).
   Returns the int result of PyUnicode_EqualToUTF8() as a Python int,
   or NULL if argument parsing fails. */
1433+
static PyObject *
1434+
unicode_equaltoutf8(PyObject *self, PyObject *args)
1435+
{
1436+
PyObject *left;
1437+
const char *right = NULL;
1438+
/* Filled in by the "z#" format unit but not forwarded: only the pointer
   is passed to the function under test. */
Py_ssize_t right_len;
1439+
int result;
1440+
1441+
/* "O" stores the first argument as-is; "z#" yields a char pointer plus
   its length for the second one. */
if (!PyArg_ParseTuple(args, "Oz#", &left, &right, &right_len)) {
1442+
return NULL;
1443+
}
1444+
1445+
/* NOTE(review): NULLABLE is defined outside this view; presumably it maps
   a placeholder argument to NULL -- confirm against its definition. */
NULLABLE(left);
1446+
result = PyUnicode_EqualToUTF8(left, right);
1447+
/* The function under test must not leave an exception set. */
assert(!PyErr_Occurred());
1448+
return PyLong_FromLong(result);
1449+
}
1450+
1451+
/* Test PyUnicode_EqualToUTF8AndSize() */
/* Python-callable wrapper: args is (left, right[, size]).
   Returns the int result of PyUnicode_EqualToUTF8AndSize() as a Python int,
   or NULL if argument parsing fails. */
1452+
static PyObject *
1453+
unicode_equaltoutf8andsize(PyObject *self, PyObject *args)
1454+
{
1455+
PyObject *left;
1456+
const char *right = NULL;
1457+
Py_ssize_t right_len;
1458+
/* -100 is a sentinel meaning "size argument was not supplied". */
Py_ssize_t size = -100;
1459+
int result;
1460+
1461+
/* "O" stores the first argument as-is; "z#" yields a char pointer plus
   its length; "|n" is the optional explicit size. */
if (!PyArg_ParseTuple(args, "Oz#|n", &left, &right, &right_len, &size)) {
1462+
return NULL;
1463+
}
1464+
1465+
/* NOTE(review): NULLABLE is defined outside this view; presumably it maps
   a placeholder argument to NULL -- confirm against its definition. */
NULLABLE(left);
1466+
/* No explicit size given: fall back to the length of the parsed buffer. */
if (size == -100) {
1467+
size = right_len;
1468+
}
1469+
result = PyUnicode_EqualToUTF8AndSize(left, right, size);
1470+
/* The function under test must not leave an exception set. */
assert(!PyErr_Occurred());
1471+
return PyLong_FromLong(result);
1472+
}
1473+
14321474
/* Test PyUnicode_RichCompare() */
14331475
static PyObject *
14341476
unicode_richcompare(PyObject *self, PyObject *args)
@@ -2044,6 +2086,8 @@ static PyMethodDef TestMethods[] = {
20442086
{"unicode_replace", unicode_replace, METH_VARARGS},
20452087
{"unicode_compare", unicode_compare, METH_VARARGS},
20462088
{"unicode_comparewithasciistring",unicode_comparewithasciistring,METH_VARARGS},
2089+
{"unicode_equaltoutf8", unicode_equaltoutf8, METH_VARARGS},
2090+
{"unicode_equaltoutf8andsize",unicode_equaltoutf8andsize, METH_VARARGS},
20472091
{"unicode_richcompare", unicode_richcompare, METH_VARARGS},
20482092
{"unicode_format", unicode_format, METH_VARARGS},
20492093
{"unicode_contains", unicode_contains, METH_VARARGS},

Objects/unicodeobject.c

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10673,6 +10673,82 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
1067310673
}
1067410674
}
1067510675

10676+
/* Compare a Unicode object with a NUL-terminated UTF-8 C string.
   Thin wrapper around PyUnicode_EqualToUTF8AndSize(): the length of str is
   computed with strlen(), so the comparison covers only the bytes before
   the first NUL byte.  Returns whatever that function returns. */
int
10677+
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
10678+
{
10679+
return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
10680+
}
10681+
10682+
/* Compare a Unicode object with the first `size` bytes of `str`,
   interpreted as UTF-8.

   Returns 1 when the bytes are exactly the UTF-8 encoding of `unicode`,
   and 0 otherwise.  Strings containing surrogate code points always
   compare as not equal.  No encoded copy of the string is built: the
   function either uses memcmp() on existing data or encodes each code
   point on the fly and compares it against the buffer. */
int
10683+
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
10684+
{
10685+
assert(_PyUnicode_CHECK(unicode));
10686+
assert(str);
10687+
10688+
/* ASCII string: its 1-byte data is compared directly with the buffer. */
if (PyUnicode_IS_ASCII(unicode)) {
10689+
Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
10690+
return size == len &&
10691+
memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
10692+
}
10693+
/* A UTF-8 representation is already available on the object:
   compare against it directly. */
if (PyUnicode_UTF8(unicode) != NULL) {
10694+
Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
10695+
return size == len &&
10696+
memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
10697+
}
10698+
10699+
/* Cheap length filter before the per-character comparison.  The ASCII
   case returned above, and each code point accounts for 1 to 4 bytes in
   the loop below, so a matching buffer must be longer than `len` bytes
   and no longer than 4 * len bytes. */
Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
10700+
if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
10701+
return 0;
10702+
}
10703+
/* `s` walks the buffer as unsigned bytes; `ends` is one past its last byte. */
const unsigned char *s = (const unsigned char *)str;
10704+
const unsigned char *ends = s + (size_t)size;
10705+
int kind = PyUnicode_KIND(unicode);
10706+
const void *data = PyUnicode_DATA(unicode);
10707+
/* Compare Unicode string and UTF-8 string */
10708+
for (Py_ssize_t i = 0; i < len; i++) {
10709+
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10710+
/* ch < 0x80: one byte, the code point itself. */
if (ch < 0x80) {
10711+
if (ends == s || s[0] != ch) {
10712+
return 0;
10713+
}
10714+
s += 1;
10715+
}
10716+
/* ch < 0x800: two bytes, 110xxxxx 10xxxxxx. */
else if (ch < 0x800) {
10717+
if ((ends - s) < 2 ||
10718+
s[0] != (0xc0 | (ch >> 6)) ||
10719+
s[1] != (0x80 | (ch & 0x3f)))
10720+
{
10721+
return 0;
10722+
}
10723+
s += 2;
10724+
}
10725+
/* ch < 0x10000: three bytes, 1110xxxx 10xxxxxx 10xxxxxx.
   Surrogate code points are rejected here regardless of the buffer. */
else if (ch < 0x10000) {
10726+
if (Py_UNICODE_IS_SURROGATE(ch) ||
10727+
(ends - s) < 3 ||
10728+
s[0] != (0xe0 | (ch >> 12)) ||
10729+
s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
10730+
s[2] != (0x80 | (ch & 0x3f)))
10731+
{
10732+
return 0;
10733+
}
10734+
s += 3;
10735+
}
10736+
/* Remaining code points: four bytes, 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
else {
10737+
assert(ch <= MAX_UNICODE);
10738+
if ((ends - s) < 4 ||
10739+
s[0] != (0xf0 | (ch >> 18)) ||
10740+
s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
10741+
s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
10742+
s[3] != (0x80 | (ch & 0x3f)))
10743+
{
10744+
return 0;
10745+
}
10746+
s += 4;
10747+
}
10748+
}
10749+
/* Equal only if the whole buffer was consumed; leftover bytes mean the
   buffer is longer than the string's encoding. */
return s == ends;
10750+
}
10751+
1067610752
int
1067710753
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
1067810754
{

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad © 2024 Your Company Name. All rights reserved.





Check this box to remove all script contents from the fetched content.



Check this box to remove all images from the fetched content.


Check this box to remove all CSS styles from the fetched content.


Check this box to keep images inefficiently compressed and original size.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy