@@ -17,6 +17,10 @@ extern int winerror_to_errno(int);
1717#include <sys/ioctl.h>
1818#endif
1919
20+ #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
21+ #include <iconv.h>
22+ #endif
23+
2024#ifdef HAVE_FCNTL_H
2125#include <fcntl.h>
2226#endif /* HAVE_FCNTL_H */
@@ -96,6 +100,12 @@ _Py_device_encoding(int fd)
96100static size_t
97101is_valid_wide_char (wchar_t ch )
98102{
103+ #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
104+ /* Oracle Solaris doesn't use Unicode code points as wchar_t encoding
105+ for non-Unicode locales, which makes values higher than MAX_UNICODE
106+ possibly valid. */
107+ return 1 ;
108+ #endif
99109 if (Py_UNICODE_IS_SURROGATE (ch )) {
100110 // Reject lone surrogate characters
101111 return 0 ;
@@ -859,6 +869,102 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str,
859869 current_locale , errors );
860870}
861871
872+ #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
873+
/* Report whether the current locale keeps wchar_t in a non-Unicode
   internal form.

   Return 1 when the locale codeset is neither "UTF-8" nor "646".
   Return 0 otherwise, including when nl_langinfo() yields no codeset. */
int
_Py_LocaleUsesNonUnicodeWchar(void)
{
    /* On Oracle Solaris, non-Unicode locales use a non-Unicode internal
       wchar_t form, so such strings need conversion to UTF first. */
    const char *charset = nl_langinfo(CODESET);

    if (charset == NULL) {
        return 0;
    }
    if (strcmp(charset, "UTF-8") == 0) {
        return 0;
    }
    /* "646" names the ISO/IEC 646 standard, which corresponds to ASCII. */
    if (strcmp(charset, "646") == 0) {
        return 0;
    }
    return 1;
}
887+
888+ static wchar_t *
889+ _Py_ConvertWCharForm (const wchar_t * source , Py_ssize_t size ,
890+ const char * tocode , const char * fromcode )
891+ {
892+ Py_BUILD_ASSERT (sizeof (wchar_t ) == 4 );
893+
894+ /* Ensure we won't overflow the size. */
895+ if (size > (PY_SSIZE_T_MAX / (Py_ssize_t )sizeof (wchar_t ))) {
896+ PyErr_NoMemory ();
897+ return NULL ;
898+ }
899+
900+ /* the string doesn't have to be NULL terminated */
901+ wchar_t * target = PyMem_Malloc (size * sizeof (wchar_t ));
902+ if (target == NULL ) {
903+ PyErr_NoMemory ();
904+ return NULL ;
905+ }
906+
907+ iconv_t cd = iconv_open (tocode , fromcode );
908+ if (cd == (iconv_t )- 1 ) {
909+ PyErr_Format (PyExc_ValueError , "iconv_open() failed" );
910+ PyMem_Free (target );
911+ return NULL ;
912+ }
913+
914+ char * inbuf = (char * ) source ;
915+ char * outbuf = (char * ) target ;
916+ size_t inbytesleft = sizeof (wchar_t ) * size ;
917+ size_t outbytesleft = inbytesleft ;
918+
919+ size_t ret = iconv (cd , & inbuf , & inbytesleft , & outbuf , & outbytesleft );
920+ if (ret == DECODE_ERROR ) {
921+ PyErr_Format (PyExc_ValueError , "iconv() failed" );
922+ PyMem_Free (target );
923+ iconv_close (cd );
924+ return NULL ;
925+ }
926+
927+ iconv_close (cd );
928+ return target ;
929+ }
930+
931+ /* Convert a wide character string to the UCS-4 encoded string. This
932+ is necessary on systems where internal form of wchar_t are not Unicode
933+ code points (e.g. Oracle Solaris).
934+
935+ Return a pointer to a newly allocated string, use PyMem_Free() to free
936+ the memory. Return NULL and raise exception on conversion or memory
937+ allocation error. */
938+ wchar_t *
939+ _Py_DecodeNonUnicodeWchar (const wchar_t * native , Py_ssize_t size )
940+ {
941+ return _Py_ConvertWCharForm (native , size , "UCS-4-INTERNAL" , "wchar_t" );
942+ }
943+
944+ /* Convert a UCS-4 encoded string to native wide character string. This
945+ is necessary on systems where internal form of wchar_t are not Unicode
946+ code points (e.g. Oracle Solaris).
947+
948+ The conversion is done in place. This can be done because both wchar_t
949+ and UCS-4 use 4-byte encoding, and one wchar_t symbol always correspond
950+ to a single UCS-4 symbol and vice versa. (This is true for Oracle Solaris,
951+ which is currently the only system using these functions; it doesn't have
952+ to be for other systems).
953+
954+ Return 0 on success. Return -1 and raise exception on conversion
955+ or memory allocation error. */
956+ int
957+ _Py_EncodeNonUnicodeWchar_InPlace (wchar_t * unicode , Py_ssize_t size )
958+ {
959+ wchar_t * result = _Py_ConvertWCharForm (unicode , size , "wchar_t" , "UCS-4-INTERNAL" );
960+ if (!result ) {
961+ return -1 ;
962+ }
963+ memcpy (unicode , result , size * sizeof (wchar_t ));
964+ PyMem_Free (result );
965+ return 0 ;
966+ }
967+ #endif /* HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */
862968
863969#ifdef MS_WINDOWS
864970static __int64 secs_between_epochs = 11644473600 ; /* Seconds between 1.1.1601 and 1.1.1970 */
0 commit comments