pFad - Phone/Frame/Anonymizer/Declutterfier! Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

URL: http://github.com/python/cpython/commit/d3cc68900dc99966007112f884779895daefc7db

[3.9] bpo-43667: Fix broken Unicode encoding in non-UTF locales on So… · python/cpython@d3cc689 · GitHub
Skip to content

Commit d3cc689

Browse files
authored
[3.9] bpo-43667: Fix broken Unicode encoding in non-UTF locales on Solaris (GH-25096) (GH-25847)
(cherry picked from commit 9032cf5) Co-authored-by: Jakub Kulík <Kulikjak@gmail.com>
1 parent 0593ae8 commit d3cc689

6 files changed

Lines changed: 194 additions & 0 deletions

File tree

Include/internal/pycore_fileutils.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,18 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric(
4848
PyObject **decimal_point,
4949
PyObject **thousands_sep);
5050

51+
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
52+
extern int _Py_LocaleUsesNonUnicodeWchar(void);
53+
54+
extern wchar_t* _Py_DecodeNonUnicodeWchar(
55+
const wchar_t* native,
56+
Py_ssize_t size);
57+
58+
extern int _Py_EncodeNonUnicodeWchar_InPlace(
59+
wchar_t* unicode,
60+
Py_ssize_t size);
61+
#endif
62+
5163
#ifdef __cplusplus
5264
}
5365
#endif

Objects/unicodeobject.c

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
5656
#include <windows.h>
5757
#endif
5858

59+
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
60+
#include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
61+
#endif
62+
5963
/* Uncomment to display statistics on interned strings at exit when
6064
using Valgrind or Insecure++. */
6165
/* #define INTERNED_STATS 1 */
@@ -2211,6 +2215,20 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
22112215
if (size == 0)
22122216
_Py_RETURN_UNICODE_EMPTY();
22132217

2218+
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2219+
/* Oracle Solaris uses non-Unicode internal wchar_t form for
2220+
non-Unicode locales and hence needs conversion to UCS-4 first. */
2221+
if (_Py_LocaleUsesNonUnicodeWchar()) {
2222+
wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2223+
if (!converted) {
2224+
return NULL;
2225+
}
2226+
PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2227+
PyMem_Free(converted);
2228+
return unicode;
2229+
}
2230+
#endif
2231+
22142232
/* Single character Unicode objects in the Latin-1 range are
22152233
shared when using this constructor */
22162234
if (size == 1 && (Py_UCS4)*u < 256)
@@ -3223,6 +3241,17 @@ PyUnicode_AsWideChar(PyObject *unicode,
32233241
res = size;
32243242
}
32253243
unicode_copy_as_widechar(unicode, w, size);
3244+
3245+
#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3246+
/* Oracle Solaris uses non-Unicode internal wchar_t form for
3247+
non-Unicode locales and hence needs conversion first. */
3248+
if (_Py_LocaleUsesNonUnicodeWchar()) {
3249+
if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3250+
return -1;
3251+
}
3252+
}
3253+
#endif
3254+
32263255
return res;
32273256
}
32283257

@@ -3249,6 +3278,17 @@ PyUnicode_AsWideCharString(PyObject *unicode,
32493278
return NULL;
32503279
}
32513280
unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3281+
3282+
#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3283+
/* Oracle Solaris uses non-Unicode internal wchar_t form for
3284+
non-Unicode locales and hence needs conversion first. */
3285+
if (_Py_LocaleUsesNonUnicodeWchar()) {
3286+
if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3287+
return NULL;
3288+
}
3289+
}
3290+
#endif
3291+
32523292
if (size != NULL) {
32533293
*size = buflen;
32543294
}

Python/fileutils.c

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ extern int winerror_to_errno(int);
1717
#include <sys/ioctl.h>
1818
#endif
1919

20+
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
21+
#include <iconv.h>
22+
#endif
23+
2024
#ifdef HAVE_FCNTL_H
2125
#include <fcntl.h>
2226
#endif /* HAVE_FCNTL_H */
@@ -96,6 +100,12 @@ _Py_device_encoding(int fd)
96100
static size_t
97101
is_valid_wide_char(wchar_t ch)
98102
{
103+
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
104+
/* Oracle Solaris doesn't use Unicode code points as wchar_t encoding
105+
for non-Unicode locales, which makes values higher than MAX_UNICODE
106+
possibly valid. */
107+
return 1;
108+
#endif
99109
if (Py_UNICODE_IS_SURROGATE(ch)) {
100110
// Reject lone surrogate characters
101111
return 0;
@@ -859,6 +869,102 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str,
859869
current_locale, errors);
860870
}
861871

872+
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
873+
874+
/* Check whether the current locale uses a non-Unicode internal wchar_t form. */
875+
int
876+
_Py_LocaleUsesNonUnicodeWchar(void)
877+
{
878+
/* Oracle Solaris uses non-Unicode internal wchar_t form for
879+
non-Unicode locales and hence needs conversion to UCS-4 first. */
880+
char* codeset = nl_langinfo(CODESET);
881+
if (!codeset) {
882+
return 0;
883+
}
884+
/* 646 refers to ISO/IEC 646 standard that corresponds to ASCII encoding */
885+
return (strcmp(codeset, "UTF-8") != 0 && strcmp(codeset, "646") != 0);
886+
}
887+
888+
static wchar_t *
889+
_Py_ConvertWCharForm(const wchar_t *source, Py_ssize_t size,
890+
const char *tocode, const char *fromcode)
891+
{
892+
Py_BUILD_ASSERT(sizeof(wchar_t) == 4);
893+
894+
/* Ensure we won't overflow the size. */
895+
if (size > (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t))) {
896+
PyErr_NoMemory();
897+
return NULL;
898+
}
899+
900+
/* the string doesn't have to be NULL terminated */
901+
wchar_t* target = PyMem_Malloc(size * sizeof(wchar_t));
902+
if (target == NULL) {
903+
PyErr_NoMemory();
904+
return NULL;
905+
}
906+
907+
iconv_t cd = iconv_open(tocode, fromcode);
908+
if (cd == (iconv_t)-1) {
909+
PyErr_Format(PyExc_ValueError, "iconv_open() failed");
910+
PyMem_Free(target);
911+
return NULL;
912+
}
913+
914+
char *inbuf = (char *) source;
915+
char *outbuf = (char *) target;
916+
size_t inbytesleft = sizeof(wchar_t) * size;
917+
size_t outbytesleft = inbytesleft;
918+
919+
size_t ret = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
920+
if (ret == DECODE_ERROR) {
921+
PyErr_Format(PyExc_ValueError, "iconv() failed");
922+
PyMem_Free(target);
923+
iconv_close(cd);
924+
return NULL;
925+
}
926+
927+
iconv_close(cd);
928+
return target;
929+
}
930+
931+
/* Convert a wide character string to a UCS-4 encoded string. This
932+
is necessary on systems where the internal form of wchar_t is not Unicode
933+
code points (e.g. Oracle Solaris).
934+
935+
Return a pointer to a newly allocated string, use PyMem_Free() to free
936+
the memory. Return NULL and raise an exception on conversion or memory
937+
allocation error. */
938+
wchar_t *
939+
_Py_DecodeNonUnicodeWchar(const wchar_t *native, Py_ssize_t size)
940+
{
941+
return _Py_ConvertWCharForm(native, size, "UCS-4-INTERNAL", "wchar_t");
942+
}
943+
944+
/* Convert a UCS-4 encoded string to a native wide character string. This
945+
is necessary on systems where the internal form of wchar_t is not Unicode
946+
code points (e.g. Oracle Solaris).
947+
948+
The conversion is done in place. This can be done because both wchar_t
949+
and UCS-4 use 4-byte encoding, and one wchar_t symbol always corresponds
950+
to a single UCS-4 symbol and vice versa. (This is true for Oracle Solaris,
951+
which is currently the only system using these functions; it doesn't have
952+
to be for other systems).
953+
954+
Return 0 on success. Return -1 and raise an exception on conversion
955+
or memory allocation error. */
956+
int
957+
_Py_EncodeNonUnicodeWchar_InPlace(wchar_t *unicode, Py_ssize_t size)
958+
{
959+
wchar_t* result = _Py_ConvertWCharForm(unicode, size, "wchar_t", "UCS-4-INTERNAL");
960+
if (!result) {
961+
return -1;
962+
}
963+
memcpy(unicode, result, size * sizeof(wchar_t));
964+
PyMem_Free(result);
965+
return 0;
966+
}
967+
#endif /* HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */
862968

863969
#ifdef MS_WINDOWS
864970
static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */

configure

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15123,6 +15123,22 @@ else
1512315123
$as_echo "no" >&6; }
1512415124
fi
1512515125

15126+
case $ac_sys_system/$ac_sys_release in
15127+
SunOS/*)
15128+
if test -f /etc/os-release; then
15129+
OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
15130+
if test "x$OS_NAME" = "xOracle Solaris"; then
15131+
# bpo-43667: In Oracle Solaris, the internal form of wchar_t in
15132+
# non-Unicode locales is not Unicode and hence cannot be used directly.
15133+
# https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
15134+
15135+
$as_echo "#define HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION 1" >>confdefs.h
15136+
15137+
fi
15138+
fi
15139+
;;
15140+
esac
15141+
1512615142
# check for endianness
1512715143
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5
1512815144
$as_echo_n "checking whether byte ordering is bigendian... " >&6; }

configure.ac

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4759,6 +4759,22 @@ else
47594759
AC_MSG_RESULT(no)
47604760
fi
47614761

4762+
case $ac_sys_system/$ac_sys_release in
4763+
SunOS/*)
4764+
if test -f /etc/os-release; then
4765+
OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
4766+
if test "x$OS_NAME" = "xOracle Solaris"; then
4767+
# bpo-43667: In Oracle Solaris, the internal form of wchar_t in
4768+
# non-Unicode locales is not Unicode and hence cannot be used directly.
4769+
# https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
4770+
AC_DEFINE(HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION, 1,
4771+
[Define if the internal form of wchar_t in non-Unicode locales
4772+
is not Unicode.])
4773+
fi
4774+
fi
4775+
;;
4776+
esac
4777+
47624778
# check for endianness
47634779
AC_C_BIGENDIAN
47644780

pyconfig.h.in

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -733,6 +733,10 @@
733733
/* Define to 1 if you have the `nice' function. */
734734
#undef HAVE_NICE
735735

736+
/* Define if the internal form of wchar_t in non-Unicode locales is not
737+
Unicode. */
738+
#undef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
739+
736740
/* Define to 1 if you have the `openat' function. */
737741
#undef HAVE_OPENAT
738742

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad © 2024 Your Company Name. All rights reserved.





Check this box to remove all script contents from the fetched content.



Check this box to remove all images from the fetched content.


Check this box to remove all CSS styles from the fetched content.


Check this box to keep images inefficiently compressed and original size.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy