From 5333773fbdd870cf3bf57d3be59ba65e7e94d886 Mon Sep 17 00:00:00 2001 From: "Penn, John M 047828115" Date: Tue, 27 Nov 2018 14:24:48 -0600 Subject: [PATCH] Return output length even if output array is NULL. Ref #708 --- include/trick/unicode_utils.h | 50 ++- .../trick_utils/unicode/src/unicode_utils.c | 397 ++++++++++-------- .../trick_utils/unicode/test/Makefile | 2 +- .../unicode/test/unicode_utils_test.cpp | 245 +++++++---- 4 files changed, 433 insertions(+), 261 deletions(-) diff --git a/include/trick/unicode_utils.h b/include/trick/unicode_utils.h index 1d966e5d..9f7a468f 100644 --- a/include/trick/unicode_utils.h +++ b/include/trick/unicode_utils.h @@ -2,14 +2,15 @@ #define UNITCODE_UTILS_H #include -/* Maintainer: John M. Penn */ +/* Author: John M. Penn */ #ifdef __cplusplus extern "C" { #endif /* Convert Unicode codepoint to UTF-32. Validates that it's a legal unicode value. - Returns 1, if successful, 0 otherwise. */ + Returns 1, if successful, 0 otherwise. + */ size_t ucodepoint_to_utf32(unsigned int codePoint, int32_t *out); /* Convert Unicode codepoint to UTF-16. @@ -24,27 +25,46 @@ size_t ucodepoint_to_utf16(unsigned int codePoint, int16_t (*out)[2]); */ size_t ucodepoint_to_utf8(unsigned int codePoint, char (*out)[4]); - - -/* Un-escape C escape sequences, including \u and \U Unicode escape sequences, - in an ASCII character array, producing a UTF-8 character array. Return the - number of elements in the character string. -*/ -size_t ascii_to_utf8(const char *in, char *out, size_t outSize); - /* Escape ('\' escape codes) all unicode and non-printable ASCII characters - in a UTF-8 character string. Return the number of elements in the character string. + in a UTF-8 character string to an all-ASCII representation. + Returns the number of elements in the character string, or 0 on failure. 
+ */ +size_t escape_to_ascii(const char *in, char *out, size_t outSize); + +/* Return the length of the array that would be produced if it were converted, + or 0 on failure. + */ +size_t escape_to_ascii_len(const char *in); + +/* Un-escape C-language escape sequences, including \u and \U Unicode escape sequences, + in an ASCII character array, producing a UTF-8 character array. + Returns the number of elements in the character string, or 0 on failure. */ -size_t utf8_to_printable_ascii(const char *in, char *out, size_t outSize); +size_t unescape_to_utf8(const char *in, char *out, size_t outSize); + +/* Return the length of the array that would be produced if it were converted, + or 0 on failure. + */ +size_t unescape_to_utf8_len(const char *in); /* Convert a UTF-8 character array to a wchar_t array. Supports 16, and 32 bit wchar_t. - Return the number of elements in the wchar_t string. */ + Returns the number of elements in the wchar_t string, or 0 on failure. */ size_t utf8_to_wchar(const char *in, wchar_t *out, size_t outSize); -/* Convert wchar_t character array to UTF-8. Return the number of elements in - the character (utf-8) string.*/ +/* Return the length of the array that would be produced if in were converted, + or 0 on failure. + */ +size_t utf8_to_wchar_len(const char *in); + +/* Convert wchar_t character array to UTF-8. + Returns the number of elements in the character (utf-8) string, + or 0 on failure. +*/ size_t wchar_to_utf8(const wchar_t *in, char *out, size_t outSize); +/* Return the length of the array that would be produced if in were converted, or 0 on failure. 
*/ +size_t wchar_to_utf8_len(const wchar_t *in); + #ifdef __cplusplus } #endif diff --git a/trick_source/trick_utils/unicode/src/unicode_utils.c b/trick_source/trick_utils/unicode/src/unicode_utils.c index e2229bef..fbab7e62 100644 --- a/trick_source/trick_utils/unicode/src/unicode_utils.c +++ b/trick_source/trick_utils/unicode/src/unicode_utils.c @@ -6,7 +6,9 @@ #include #include "trick/unicode_utils.h" -/* Maintainer: John M. Penn */ +/* Author: John M. Penn */ + +#define ERROR_STATE 99 size_t ucodepoint_to_utf32(unsigned int codePoint, int32_t *out) { @@ -69,24 +71,20 @@ size_t ucodepoint_to_utf8(unsigned int codePoint, char (*out)[4]) { return 0; } -size_t utf8_to_printable_ascii(const char *in, char *out, size_t outSize) { - - int state = 0; +size_t escape_to_ascii(const char *in, char *out, size_t outSize) { unsigned int codePoint; - char wks[11]; - - if (out == NULL) { - fprintf(stderr,"%s:ERROR: ASCII char pointer (out) is NULL. No conversion performed.\n", __FUNCTION__); - return 0; - } - out[0] = 0; + size_t out_len = 0; + int state = 0; + char ascii_elements[11]; if (in == NULL) { fprintf(stderr,"%s:ERROR: UTF8 char-pointer (in) is NULL. No conversion performed.\n", __FUNCTION__); return 0; } - while (*in != 0) { + if (out != NULL) out[out_len] = 0; + + while ((*in != 0) && (state != ERROR_STATE)) { unsigned char ch = *in; switch (state) { case 0: { @@ -101,55 +99,61 @@ size_t utf8_to_printable_ascii(const char *in, char *out, size_t outSize) { state = 1; } else if (ch >= 0x80) { // We should never find a continuation byte in isolation. 
fprintf(stderr,"%s:ERROR: UTF8 string (in) appears to be corrupted.\n", __FUNCTION__); - state = 99; + state = ERROR_STATE; } else { // ASCII if (ch == '\a') { - sprintf(wks,"\\a"); + sprintf(ascii_elements, "\\a"); } else if (ch == '\b') { - sprintf(wks,"\\b"); + sprintf(ascii_elements, "\\b"); } else if (ch == '\f') { - sprintf(wks,"\\f"); + sprintf(ascii_elements, "\\f"); } else if (ch == '\n') { - sprintf(wks,"\\n"); + sprintf(ascii_elements, "\\n"); } else if (ch == '\r') { - sprintf(wks,"\\r"); + sprintf(ascii_elements, "\\r"); } else if (ch == '\t') { - sprintf(wks,"\\t"); + sprintf(ascii_elements, "\\t"); } else if (ch == '\v') { - sprintf(wks,"\\v"); + sprintf(ascii_elements, "\\v"); } else if (isprint(ch)) { - sprintf(wks,"%c",ch); + sprintf(ascii_elements, "%c",ch); } else { - sprintf(wks,"\\x%02x",ch); + sprintf(ascii_elements, "\\x%02x",ch); } - if ((strlen(out)+strlen(wks)) < outSize-1) { - strcat(out, wks); - } else { - fprintf(stderr,"%s:ERROR: Insufficient room in (out) array.\n", __FUNCTION__); - state = 99; + size_t n_elements = strlen(ascii_elements); + if (out != NULL) { + if ((out_len + n_elements) < outSize) { + strcat(out, ascii_elements); + } else { + fprintf(stderr,"%s:ERROR: Insufficient room in (out) array.\n", __FUNCTION__); + state = ERROR_STATE; + } } + out_len += n_elements; } } break; case 1: { // Expecting one continuation byte. if ((ch & 0xc0) == 0x80) { // If the next char is a continuation byte .. 
codePoint = (codePoint << 6) | (ch & 0x3f); // Extract low 6 bits state = 0; - if (codePoint <= 0xffff) { - sprintf(wks,"\\u%04x", codePoint); + sprintf(ascii_elements, "\\u%04x", codePoint); } else { - sprintf(wks,"\\U%08x", codePoint); + sprintf(ascii_elements, "\\U%08x", codePoint); } - if ((strlen(out)+strlen(wks)) < outSize-1) { - strcat(out, wks); - } else { - fprintf(stderr,"%s:ERROR: Insufficient room in (out) array.\n", __FUNCTION__); - state = 99; + size_t n_elements = strlen(ascii_elements); + if (out != NULL) { + if ((out_len + n_elements) < outSize) { + strcat(out, ascii_elements); + } else { + fprintf(stderr,"%s:ERROR: Insufficient room in (out) array.\n", __FUNCTION__); + state = ERROR_STATE; + } } - + out_len += n_elements; } else { fprintf(stderr,"%s:ERROR: UTF8 string appears to be corrupted.\n", __FUNCTION__); - state = 99; + state = ERROR_STATE; } } break; case 2: { /* Expecting two continuation bytes. */ @@ -158,7 +162,7 @@ size_t utf8_to_printable_ascii(const char *in, char *out, size_t outSize) { state = 1; } else { fprintf(stderr,"%s:ERROR: UTF8 string appears to be corrupted.\n", __FUNCTION__); - state = 99; + state = ERROR_STATE; } } break; case 3: { /* Expecting three continuation bytes. */ @@ -167,60 +171,60 @@ size_t utf8_to_printable_ascii(const char *in, char *out, size_t outSize) { state = 2; } else { fprintf(stderr,"%s:ERROR: UTF8 string appears to be corrupted.\n", __FUNCTION__); - state = 99; + state = ERROR_STATE; } } break; default: { - out[0] = 0; - return 0; + state = ERROR_STATE; } break; } in ++; } + /* If we didn't finished in state 0, then we had an error. */ + if (state != 0) { + out_len = 0; + } + if (out != NULL) out[out_len] = 0; /* NULL termination of string. */ + return out_len; +} - /* If we finished in state 0, then we're good. Just - terminate the string, otherwise we had an error. 
*/ - if (state == 0) { - return strlen(out); - } else { - out[0] = 0; - return 0; - } +size_t escape_to_ascii_len(const char *in) { + return escape_to_ascii( in, NULL, (size_t)0); } /* Un-escapes ASCII and Unicode escape sequences, and encodes them into UTF-8. */ -size_t ascii_to_utf8(const char *in, char *out, size_t outSize) { +size_t unescape_to_utf8(const char *in, char *out, size_t outSize) { unsigned int codePoint = 0; - size_t len = 0; + size_t out_len = 0; int state = 0; int digitsExpected = 0; - if (out == NULL) { - fprintf(stderr,"%s:ERROR: ASCII char pointer (out) is NULL. No conversion performed.\n", __FUNCTION__); - return 0; - } - out[0] = 0; - if (in == NULL) { - fprintf(stderr,"%s:ERROR: UTF8 char-pointer (in) is NULL. No conversion performed.\n", __FUNCTION__); + fprintf(stderr,"%s:ERROR: char-pointer (in) is NULL. No conversion performed.\n", __FUNCTION__); return 0; } - while (*in != 0) { + if (out != NULL) out[out_len] = 0; + + while ((*in != 0) && (state != ERROR_STATE )) { unsigned char ch = *in; - if (ch > 0x7f) { /* All input characters must be ASCII. */ - fprintf(stderr,"%s:ERROR: ASCII string (in) contains non-ASCII values.\n", __FUNCTION__); - out[0] = 0; - return 0; - } - /* All escaped characters will be un-escaped. */ switch(state) { case 0: { // Normal State - if (ch =='\\') { + if (ch >= 0xf0) { // Start of a 4-byte UTF-8 sequence. + if (out != NULL) out[out_len] = ch; out_len++; state = 3; + } else if (ch >= 0xe0) { // Start of a 3-byte UTF-8 sequence. + if (out != NULL) out[out_len] = ch; out_len++; state = 4; + } else if (ch >= 0xc0) { // Start of a 2-byte UTF-8 sequence. + if (out != NULL) out[out_len] = ch; out_len++; state = 5; + } else if (ch >= 0x80) { // We should never find a UTF-8 continuation byte in isolation. 
+ fprintf(stderr,"%s:ERROR: Input string (in) appears to be corrupted.\n", __FUNCTION__); + state = ERROR_STATE; + } else if (ch =='\\') { state = 1; } else { - out[len++] = ch; + if (out != NULL) out[out_len] = ch; + out_len++; } } break; case 1: { // Escaped State ( that is: we've found a '\' character.) @@ -228,26 +232,24 @@ size_t ascii_to_utf8(const char *in, char *out, size_t outSize) { case '\'': case '\"': case '\?': - case '\\': { - out[len++] = ch; state = 0; - } break; - - case 'a': { out[len++] = '\a'; state = 0; } break; - case 'b': { out[len++] = '\b'; state = 0; } break; - case 'f': { out[len++] = '\f'; state = 0; } break; - case 'n': { out[len++] = '\n'; state = 0; } break; - case 'r': { out[len++] = '\r'; state = 0; } break; - case 't': { out[len++] = '\t'; state = 0; } break; - case 'v': { out[len++] = '\b'; state = 0; } break; + case '\\': { if (out != NULL) out[out_len] = ch; out_len++; state = 0; } break; + case 'a': { if (out != NULL) out[out_len] = '\a'; out_len++; state = 0; } break; + case 'b': { if (out != NULL) out[out_len] = '\b'; out_len++; state = 0; } break; + case 'f': { if (out != NULL) out[out_len] = '\f'; out_len++; state = 0; } break; + case 'n': { if (out != NULL) out[out_len] = '\n'; out_len++; state = 0; } break; + case 'r': { if (out != NULL) out[out_len] = '\r'; out_len++; state = 0; } break; + case 't': { if (out != NULL) out[out_len] = '\t'; out_len++; state = 0; } break; + case 'v': { if (out != NULL) out[out_len] = '\b'; out_len++; state = 0; } break; case 'x': { digitsExpected = 2; state = 2; } break; case 'u': { digitsExpected = 4; state = 2; } break; case 'U': { digitsExpected = 8; state = 2; } break; default : { + state = ERROR_STATE; } } // switch ch } break; case 2: { // Escaped Unicode ( that is: we've found '\x', '\u' or '\U'.) 
- int digit = 0; + int digit = -1; if (ch >= '0' && ch <= '9') { digit = ch - (int)'0'; } else if (ch >= 'A' && ch <= 'F') { @@ -257,108 +259,161 @@ size_t ascii_to_utf8(const char *in, char *out, size_t outSize) { } else { fprintf(stderr,"%s:ERROR: Insufficient hexidecimal digits following" " \\x, \\u, or \\U escape code in char string (in).\n", __FUNCTION__); - out[0] = 0; - return 0; + state = ERROR_STATE; } - codePoint = codePoint * 16 + digit; - digitsExpected -- ; - if ( digitsExpected == 0 ) { - char temp[4]; - size_t count = ucodepoint_to_utf8(codePoint, &temp); - if (count < (outSize-len)) { - memcpy( &out[len], temp, sizeof(char) * count ); - len += count; + if (digit >= 0) { + codePoint = codePoint * 16 + digit; + digitsExpected -- ; + if ( digitsExpected == 0 ) { + char utf8_bytes[4]; + size_t n_elements = ucodepoint_to_utf8(codePoint, &utf8_bytes); state = 0; - } else { - fprintf(stderr,"%s:ERROR: Insufficient room in char array (out).\n", __FUNCTION__); - out[0] = 0; - return 0; - } - codePoint = 0; + if (out != NULL) { + if (out_len + n_elements < outSize) { + memcpy( &out[out_len], utf8_bytes, sizeof(char) * n_elements ); + } else { + fprintf(stderr,"%s:ERROR: Insufficient room in char array (out).\n", __FUNCTION__); + state = ERROR_STATE; + } + } + out_len += n_elements; + codePoint = 0; + } } } break; + + case 3: { /* Expecting 3 UTF-8 continuation bytes. */ + if ((ch & 0xc0) == 0x80) { + if (out != NULL) out[out_len] = ch; out_len++; state = 4; + } else { + fprintf(stderr,"%s:ERROR: Input (in) appears to be corrupted.\n", __FUNCTION__); + state = ERROR_STATE; + } + } break; + + case 4: { /* Expecting 2 UTF-8 continuation bytes. */ + if ((ch & 0xc0) == 0x80) { + if (out != NULL) out[out_len] = ch; out_len++; state = 5; + } else { + fprintf(stderr,"%s:ERROR: Input (in) appears to be corrupted.\n", __FUNCTION__); + state = ERROR_STATE; + } + } break; + + case 5: { /* Expecting 1 UTF-8 continuation byte. 
*/ + if ((ch & 0xc0) == 0x80) { + if (out != NULL) out[out_len] = ch; out_len++; state = 0; + } else { + fprintf(stderr,"%s:ERROR: Input (in) appears to be corrupted.\n", __FUNCTION__); + state = ERROR_STATE; + } + } break; + default: { - out[0] = 0; - return 0; + state = ERROR_STATE; } break; } in ++; } - out[len] = 0; /* NULL termination of string. */ - return len; + if (state != 0) { /* If we didn't finished in state 0, then we had an error. */ + out_len = 0; + } + if (out != NULL) out[out_len] = 0; /* NULL termination of string. */ + return out_len; +} + +size_t unescape_to_utf8_len(const char *in) { + return unescape_to_utf8( in, NULL, (size_t)0); } size_t utf8_to_wchar(const char *in, wchar_t *out, size_t outSize) { unsigned int codePoint = 0; - size_t len = 0; + size_t out_len = 0; int state = 0; - while (*in != 0) { + if (in == NULL) { + fprintf(stderr,"%s:ERROR: UTF8 char-pointer (in) is NULL. No conversion performed.\n", __FUNCTION__); + return 0; + } + + if (out != NULL) out[out_len] = 0; + + while ((*in != 0) && (state != ERROR_STATE)) { unsigned char ch = *in; switch (state) { case 0: { - if (ch >= 0xf0) { // Start of a 4-byte sequence. + if (ch >= 0xf0) { // Start of a 4-byte UTF-8 sequence. codePoint = ch & 0x07; // Extract low 3 bits state = 3; - } else if (ch >= 0xe0) { // Start of a 3-byte sequence. + } else if (ch >= 0xe0) { // Start of a 3-byte UTF-8 sequence. codePoint = ch & 0x0f; // Extract low 4 bits state = 2; - } else if (ch >= 0xc0) { // Start of a 2-byte sequence. + } else if (ch >= 0xc0) { // Start of a 2-byte UTF-8 sequence. codePoint = ch & 0x1f; // Extract low 5 bits state = 1; - } else if (ch >= 0x80) { // We should never find a continuation byte in isolation. + } else if (ch >= 0x80) { // We should never find a UTF-8 continuation byte in isolation. 
fprintf(stderr,"%s:ERROR: UTF8 string (in) appears to be corrupted.\n", __FUNCTION__); - state = 99; + state = ERROR_STATE; } else { codePoint = ch; // ASCII - if ((outSize-len) > 1) { - out[len++] = (wchar_t)codePoint; - } else { - fprintf(stderr,"%s:ERROR: Insufficient room in wchar_t array (out).\n", __FUNCTION__); - state = 99; + if (out != NULL) { + if ((out_len + 1) < outSize) { + out[out_len] = (wchar_t)codePoint; + } else { + fprintf(stderr,"%s:ERROR: Insufficient room in wchar_t array (out).\n", __FUNCTION__); + state = ERROR_STATE; + } } + out_len++; } } break; - case 1: { // Expecting one continuation byte. - if ((ch & 0xc0) == 0x80) { // If the next char is a continuation byte .. + case 1: { /* Expecting one continuation byte. */ + if ((ch & 0xc0) == 0x80) { codePoint = (codePoint << 6) | (ch & 0x3f); // Extract lower 6 bits state = 0; if (sizeof(wchar_t) == 4) { // wchar_t is UTF-32 - int32_t temp; - if ( ucodepoint_to_utf32(codePoint, &temp) > 0) { - if ((outSize-len) > 1) { - out[len++] = (wchar_t)temp; - } else { - fprintf(stderr,"%s:ERROR: Insufficient room in wchar_t array (out).\n", __FUNCTION__); - state = 99; + int32_t utf32_element; + if ( ucodepoint_to_utf32(codePoint, &utf32_element) > 0) { + if (out != NULL) { + if ((out_len + 1) < outSize) { + out[out_len] = (wchar_t)utf32_element; + } else { + fprintf(stderr,"%s:ERROR: Insufficient room in wchar_t array (out).\n", __FUNCTION__); + state = ERROR_STATE; + } } + out_len++; } else { - state = 99; + /* ucodepoint_to_utf32() will have, in this case produced an error message. 
*/ + state = ERROR_STATE; } - } else if (sizeof(wchar_t) == 2) { // wchar_t is UTF-16 - int16_t temp[2]; - size_t count; - if (( count = ucodepoint_to_utf16(codePoint, &temp)) > 0) { - if (count < (outSize-len)) { - memcpy( &out[len], temp, sizeof(int16_t) * count ); - len += count; - } else { - fprintf(stderr,"%s:ERROR: Insufficient room in wchar_t array (out).\n", __FUNCTION__); - state = 99; + int16_t utf16_elements[2]; + size_t n_elements; + if (( n_elements = ucodepoint_to_utf16(codePoint, &utf16_elements)) > 0) { + if (out != NULL) { + if ((out_len + n_elements) < outSize) { + memcpy( &out[out_len], utf16_elements, sizeof(int16_t) * n_elements); + } else { + fprintf(stderr,"%s:ERROR: Insufficient room in wchar_t array (out).\n", __FUNCTION__); + state = ERROR_STATE; + } } + out_len += n_elements; + } else { + /* ucodepoint_to_utf16() will have, in this case produced an error message. */ + state = ERROR_STATE; } - } else { fprintf(stderr,"%s:ERROR: Unsupported wchar_t size.\n", __FUNCTION__); - state = 99; + state = ERROR_STATE; } } else { fprintf(stderr,"%s:ERROR: UTF8 string appears to be corrupted.\n", __FUNCTION__); - state = 99; + state = ERROR_STATE; } } break; case 2: { /* Expecting two continuation bytes. */ @@ -367,7 +422,7 @@ size_t utf8_to_wchar(const char *in, wchar_t *out, size_t outSize) { state = 1; } else { fprintf(stderr,"%s:ERROR: UTF8 string appears to be corrupted.\n", __FUNCTION__); - state = 99; + state = ERROR_STATE; } } break; case 3: { /* Expecting three continuation bytes. */ @@ -376,35 +431,38 @@ size_t utf8_to_wchar(const char *in, wchar_t *out, size_t outSize) { state = 2; } else { fprintf(stderr,"%s:ERROR: UTF8 string appears to be corrupted.\n", __FUNCTION__); - state = 99; + state = ERROR_STATE; } } break; - default: { /* Error State. */ - out[0] = 0; - return 0; + default: { + state = ERROR_STATE; } break; } in ++; } + if (state != 0) { /* If we didn't finish in state 0, it's an error. 
*/ + out_len = 0; + } + if (out != NULL) out[out_len] = 0; /* NULL termination of string. */ + return out_len; +} - /* If we finished in state 0, then we're good. Just - terminate the string, otherwise we had an error. */ - if (state == 0) { - out[len] = 0; - return len; - } else { - out[0] = 0; - return 0; - } - return len; +size_t utf8_to_wchar_len(const char *in) { + return utf8_to_wchar( in, NULL, (size_t)0); } size_t wchar_to_utf8(const wchar_t *in, char *out, size_t outSize ) { unsigned int codePoint = 0; - size_t len = 0; + size_t out_len = 0; + int state = 0; - while ( *in != 0 ) { + if (in == NULL) { + fprintf(stderr,"%s:ERROR: wchar_t-pointer (in) is NULL. No conversion performed.\n", __FUNCTION__); + return 0; + } + + while ((*in != 0) && (state != ERROR_STATE)) { if (*in >= 0xd800 && *in <= 0xdbff) /* If High-surrogate. */ codePoint = ((*in - 0xd800) << 10) + 0x10000; else { @@ -414,24 +472,33 @@ size_t wchar_to_utf8(const wchar_t *in, char *out, size_t outSize ) { codePoint = *in; } else { fprintf(stderr,"%s:ERROR: Invalid Unicode value.\n", __FUNCTION__); - out[0] = 0; - return 0; + state = ERROR_STATE; } - char temp[4]; - size_t count = ucodepoint_to_utf8(codePoint, &temp); - if (count < (outSize-len)) { - memcpy( &out[len], temp, sizeof(char) * count ); - len += count; - } else { - fprintf(stderr,"%s:ERROR: Insufficient room in char array (out).\n", __FUNCTION__); - out[0] = 0; - return 0; + if (state != ERROR_STATE) { + char utf8_elements[4]; + size_t n_elements = ucodepoint_to_utf8(codePoint, &utf8_elements); + if (out != NULL) { + if ((out_len + n_elements) < outSize) { + memcpy( &out[out_len], utf8_elements, sizeof(char) * n_elements ); + } else { + fprintf(stderr,"%s:ERROR: Insufficient room in char array (out).\n", __FUNCTION__); + state = ERROR_STATE; + } + } + out_len += n_elements; + codePoint = 0; } - codePoint = 0; } in++; } - out[len] = L'\0'; /* NULL termination of string. 
*/ - return len; + if (state != 0) { /* If we didn't finish in state 0, it's an error. */ + out_len = 0; + } + if (out != NULL) out[out_len] = 0; /* NULL termination of string. */ + return out_len; +} + +size_t wchar_to_utf8_len(const wchar_t *in) { + return wchar_to_utf8( in, NULL, (size_t)0); } diff --git a/trick_source/trick_utils/unicode/test/Makefile b/trick_source/trick_utils/unicode/test/Makefile index a517ce5a..26427629 100644 --- a/trick_source/trick_utils/unicode/test/Makefile +++ b/trick_source/trick_utils/unicode/test/Makefile @@ -11,7 +11,7 @@ include ${TRICK_HOME}/share/trick/makefiles/Makefile.common TRICK_CPPFLAGS += -I$(GTEST_HOME)/include -I$(TRICK_HOME)/include -g -Wall -Wextra -DGTEST_HAS_TR1_TUPLE=0 TRICK_LIBS = ${TRICK_LIB_DIR}/libtrick.a -TRICK_EXEC_LINK_LIBS += -L${GTEST_HOME}/lib64 -L${GTEST_HOME}/lib -lgtest -lgtest_main -lpthread +TRICK_EXEC_LINK_LIBS += -L${GTEST_HOME}/lib64 -L${GTEST_HOME}/lib -lgtest -lpthread # Added for Ubuntu... not required for other systems. TRICK_EXEC_LINK_LIBS += -lpthread diff --git a/trick_source/trick_utils/unicode/test/unicode_utils_test.cpp b/trick_source/trick_utils/unicode/test/unicode_utils_test.cpp index a2875e86..8175664a 100644 --- a/trick_source/trick_utils/unicode/test/unicode_utils_test.cpp +++ b/trick_source/trick_utils/unicode/test/unicode_utils_test.cpp @@ -5,6 +5,11 @@ #include #include "trick/unicode_utils.h" +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + const char* ISO_6429_Restore_Default = "\x1b[00m"; const char* ISO_6429_Bold = "\x1b[01m"; const char* ISO_6429_Underline = "\x1b[04m"; @@ -31,6 +36,12 @@ void Error_Message_Expected() { printf("%s\n", ISO_6429_Restore_Default ); } +/* The following are the utf-8 encodings of four unicode characters used in the following tests. 
*/ +// Greek Phi Symbol => U+03d5 => 0xcf 0x95 // see: https://www.compart.com/en/unicode/U+03D5 +// Superscript Latin Small Letter I => U+2071 => 0xe2 0x81 0xb1 // see: https://www.compart.com/en/unicode/U+2071 +// Modifier Letter Small Greek Phi => U+1D60 => 0xe1 0xb5 0xa0 // see: https://www.compart.com/en/unicode/U+1D60 +// Aegean Number Ten => U+10110 => 0xf0 0x90 0x84 0x90 // see: https://www.compart.com/en/unicode/U+10110 + // ------------------------------------------------------- // Test suite for ucodepoint_to_utf32() // ------------------------------------------------------- @@ -134,38 +145,39 @@ TEST(ucodepoint_to_utf8, ascii ) { } // ------------------------------------------------------- -// Test suite for utf8_to_printable_ascii() +// Test suite for escape_to_ascii() // ------------------------------------------------------- -TEST(utf8_to_printable_ascii, null_input ) { +TEST(escape_to_ascii, null_input ) { /* Should generate error message if input character pointer is NULL. */ - char resultant_ascii_s[128]; + char output[128]; char* null_ptr = (char*)0; Error_Message_Expected(); - size_t size = utf8_to_printable_ascii( null_ptr, resultant_ascii_s, sizeof(resultant_ascii_s)); + size_t size = escape_to_ascii( null_ptr, output, sizeof(output)); EXPECT_EQ(0, size); } -TEST(utf8_to_printable_ascii, null_output ) { - /* Should generate error message if output character pointer is NULL. */ +TEST(escape_to_ascii, null_output ) { + /* If output character pointer is NULL, still determine the length. 
*/ char* null_ptr = (char*)0; - const char* input = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)"; - Error_Message_Expected(); - size_t size = utf8_to_printable_ascii( input, null_ptr, size_t(5)); - EXPECT_EQ(0, size); + const char* input = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)\n"; + size_t expected_size = strlen ("e\\u2071\\u1d60 = cos(\\u03d5) + i*sin(\\u03d5)\\n"); + size_t size = escape_to_ascii( input, null_ptr, size_t(5)); + EXPECT_EQ(expected_size, size); } -TEST(utf8_to_printable_ascii, normal_1 ) { - char resultant_ascii_s[128]; - /* utf8_to_printable_ascii() should escape all Unicode and non-printable ASCII characters. */ +TEST(escape_to_ascii, normal_1 ) { + char output[128]; + /* escape_to_ascii() should escape all Unicode and non-printable ASCII characters. */ const char* utf8_s = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)\n"; const char* expected_ascii_s = "e\\u2071\\u1d60 = cos(\\u03d5) + i*sin(\\u03d5)\\n"; - (void) utf8_to_printable_ascii( utf8_s, resultant_ascii_s, sizeof(resultant_ascii_s)); - EXPECT_STREQ(expected_ascii_s, resultant_ascii_s); + size_t size = escape_to_ascii( utf8_s, output, sizeof(output)); + EXPECT_EQ( strlen(expected_ascii_s), size); + EXPECT_STREQ(expected_ascii_s, output); } -TEST(utf8_to_printable_ascii, normal_2 ) { - char resultant_ascii_s[256]; - /* utf8_to_printable_ascii() should escape all Unicode and non-printable ASCII characters. */ +TEST(escape_to_ascii, normal_2 ) { + char output[256]; + /* escape_to_ascii() should escape all Unicode and non-printable ASCII characters. 
*/ const char ascii[128] = { '\x01','\x02','\x03','\x04','\x05','\x06','\x07','\x08','\x09','\x0a','\x0b','\x0c','\x0d','\x0e','\x0f', '\x10','\x11','\x12','\x13','\x14','\x15','\x16','\x17','\x18','\x19','\x1a','\x1b','\x1c','\x1d','\x1e','\x1f', '\x20','\x21','\x22','\x23','\x24','\x25','\x26','\x27','\x28','\x29','\x2a','\x2b','\x2c','\x2d','\x2e','\x2f', @@ -181,133 +193,147 @@ TEST(utf8_to_printable_ascii, normal_2 ) { "\\r\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a\\x1b\\x1c\\x1d\\x1e\\x1f" " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"; - size_t size = utf8_to_printable_ascii( ascii, resultant_ascii_s, sizeof(resultant_ascii_s)); - EXPECT_EQ(209, size); - EXPECT_STREQ(expected_ascii_s, resultant_ascii_s); + size_t size = escape_to_ascii( ascii, output, sizeof(output)); + EXPECT_EQ(strlen(expected_ascii_s), size); + EXPECT_STREQ(expected_ascii_s, output); } -/* The following are the utf-8 encodings of four unicode characters used in the following tests. */ -// Greek Phi Symbol => U+03d5 => 0xcf 0x95 // see: https://www.compart.com/en/unicode/U+03D5 -// Superscript Latin Small Letter I => U+2071 => 0xe2 0x81 0xb1 // see: https://www.compart.com/en/unicode/U+2071 -// Modifier Letter Small Greek Phi => U+1D60 => 0xe1 0xb5 0xa0 // see: https://www.compart.com/en/unicode/U+1D60 -// Aegean Number Ten => U+10110 => 0xf0 0x90 0x84 0x90 // see: https://www.compart.com/en/unicode/U+10110 +TEST(escape_to_ascii, demotest ) { + char output[128]; -TEST(utf8_to_printable_ascii, demotest ) { - char resultant_ascii_s[128]; + /* This test simply demonstrates that the following UTF-8 string (utf8_s), + used in subsequent tests, is a well formed UTF-8 string. 
*/ const char utf8_s[11] = {'P','h','i',' ','=',' ','\xcf','\x95','\0'}; const char* expected_ascii_s = "Phi = \\u03d5"; - (void) utf8_to_printable_ascii( utf8_s, resultant_ascii_s, sizeof(resultant_ascii_s)); - EXPECT_STREQ(expected_ascii_s, resultant_ascii_s); + + size_t size = escape_to_ascii( utf8_s, output, sizeof(output)); + + EXPECT_STREQ(expected_ascii_s, output); + EXPECT_EQ(strlen(expected_ascii_s), size); } -TEST(utf8_to_printable_ascii, detect_corruption_1 ) { - char resultant_ascii_s[128]; - /* The following string is deliberately corrupted with a spurious - continuation character (in corrupted_utf8_s[6]).*/ - const char corrupted_utf8_s[11] = {'P','h','i',' ','=',' ','\x80','\x95','\0'}; +TEST(escape_to_ascii, detect_corruption_1 ) { + char output[128]; + + /* The input string is deliberately corrupted with a spurious + continuation character.*/ + + char utf8_s[11] = {'P','h','i',' ','=',' ','\xcf','\x95','\0'}; + utf8_s[6] = '\x80'; /* Deliberately corrupt the UTF-8 string. */ + Error_Message_Expected(); - size_t size = utf8_to_printable_ascii( corrupted_utf8_s, resultant_ascii_s, sizeof(resultant_ascii_s)); + size_t size = escape_to_ascii( utf8_s, output, sizeof(output)); + EXPECT_EQ(0, size); } -TEST(utf8_to_printable_ascii, detect_corruption_2 ) { - char resultant_ascii_s[128]; +TEST(escape_to_ascii, detect_corruption_2 ) { + char output[128]; + /* The following string is deliberately corrupted: 0xcf is a header for a two-byte sequence, it should be followed by a continuation byte (most significant 2 bits are 10). 0x75 starts with 01 */ - const char corrupted_utf8_s[11] = {'P','h','i',' ','=',' ','\xcf','\x75','\0'}; + + char utf8_s[11] = {'P','h','i',' ','=',' ','\xcf','\x95','\0'}; + utf8_s[7] = '\x75'; /* Deliberately corrupt the UTF-8 string. 
*/ + Error_Message_Expected(); - size_t size = utf8_to_printable_ascii( corrupted_utf8_s, resultant_ascii_s, sizeof(resultant_ascii_s)); + size_t size = escape_to_ascii( utf8_s, output, sizeof(output)); + EXPECT_EQ(0, size); } -TEST(utf8_to_printable_ascii, insufficient_result_array_size ) { - /* The result array must be of sufficient size. Here it is not. */ - char resultant_ascii_s[16]; +TEST(escape_to_ascii, insufficient_result_array_size ) { + char output[16]; + + /* If the output array pointer is not NULL, it must be of sufficient size. Here it is not. */ const char* utf8_s = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)\n"; Error_Message_Expected(); - size_t size = utf8_to_printable_ascii( utf8_s, resultant_ascii_s, sizeof(resultant_ascii_s)); + size_t size = escape_to_ascii( utf8_s, output, sizeof(output)); EXPECT_EQ(0, size); } // ------------------------------------------------------- -// Test suite for ascii_to_utf8() +// Test suite for unescape_to_utf8() // ------------------------------------------------------- -TEST(ascii_to_utf8, null_input ) { +TEST(unescape_to_utf8, null_input ) { /* Should generate error message if input character pointer is NULL. */ - char resultant_ascii_s[128]; + char output[128]; char* null_ptr = (char*)0; Error_Message_Expected(); - size_t size = ascii_to_utf8( null_ptr, resultant_ascii_s, sizeof(resultant_ascii_s)); + size_t size = unescape_to_utf8( null_ptr, output, sizeof(output)); EXPECT_EQ(0, size); } -TEST(ascii_to_utf8, null_output ) { - /* Should generate error message if output character pointer is NULL. */ +TEST(unescape_to_utf8, null_output ) { + /* Should return the length of the string that would have been produced. 
*/ char* null_ptr = (char*)0; - const char* input = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)"; - - Error_Message_Expected(); - size_t size = ascii_to_utf8( input, null_ptr, size_t(5)); - EXPECT_EQ(0, size); + const char* input = "e\\u2071\\u1d60 = cos(\\u03d5) + i*sin(\\u03d5)\\n"; + size_t expected_size = strlen("e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)\n"); + size_t size = unescape_to_utf8( input, null_ptr, size_t(5)); + EXPECT_EQ(expected_size, size); } -TEST(ascii_to_utf8, normal_1) { - /* ascii_to_utf8() should un-escape all escaped ASCII and escaped unicode. - */ - char actual_output[256]; +TEST(unescape_to_utf8, normal_1) { + /* unescape_to_utf8() should un-escape all escaped ASCII and escaped unicode, + producing a utf8 character string. It should also return the length of + that string. */ + char actual_output[128]; const char* input = "e\\u2071\\u1d60 = cos(\\u03d5) + i*sin(\\u03d5)\\n"; const char* expected_output = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)\n"; - size_t size = ascii_to_utf8(input, actual_output, sizeof(actual_output)); - EXPECT_EQ(30, size); + size_t size = unescape_to_utf8(input, actual_output, sizeof(actual_output)); + + EXPECT_EQ( strlen(expected_output), size); EXPECT_STREQ(expected_output, actual_output); } -TEST(ascii_to_utf8, non_ascii_chars) { - char actual_output[256]; - /* The input string should only contain ASCII characters, that is, - each element should have a value < 128. That isn't the case in the - following string. Therefore, an error message should be emitted. +TEST(unescape_to_utf8, non_ascii_chars) { + char actual_output[128]; + /* +??? 
*/ - const char* input = "eⁱᵠ = cos(ϕ) + i*sin(ϕ)"; + const char* input = "eⁱᵠ = cos(ϕ) + i*sin(\\u03d5)\\n"; Error_Message_Expected(); - size_t size = ascii_to_utf8(input, actual_output, sizeof(actual_output)); - EXPECT_EQ(0, size); + size_t size = unescape_to_utf8(input, actual_output, sizeof(actual_output)); + + EXPECT_EQ(30, size); } -TEST(ascii_to_utf8, insufficient_hex_digits_1) { +TEST(unescape_to_utf8, insufficient_hex_digits_1) { /* The \U escape code expects exactly 8 hexidecimal digits to follow. If fewer than 8 are present, then an error message should result. Note: "\U10110" will fail in a C/C++ literal at compile time too, because it is incomplete. It should be "\U00010110". */ - char actual_output[256]; + char actual_output[128]; const char* input = "Aegean Number Ten = \\U10110\n"; Error_Message_Expected(); - size_t size = ascii_to_utf8(input, actual_output, sizeof(actual_output)); + size_t size = unescape_to_utf8(input, actual_output, sizeof(actual_output)); + EXPECT_EQ(0, size); } -TEST(ascii_to_utf8, insufficient_hex_digits_2) { +TEST(unescape_to_utf8, insufficient_hex_digits_2) { /* The \u escape code expects exactly 4 hexidecimal digits to follow. If fewer than 4 are present, then an error message should result. Note: "\u3d5" will fail in a C/C++ literal at compile time too, because it is incomplete. It should be "\u03d5". */ - char actual_output[256]; + char actual_output[128]; const char* input = "Phi = \\u3d5\n"; Error_Message_Expected(); - size_t size = ascii_to_utf8(input, actual_output, sizeof(actual_output)); + size_t size = unescape_to_utf8(input, actual_output, sizeof(actual_output)); + EXPECT_EQ(0, size); } -TEST(ascii_to_utf8, insufficient_result_array_size) { +TEST(unescape_to_utf8, insufficient_result_array_size) { /* The result array must be of sufficient size. If it isn't, then an error message should be emitted. 
*/ @@ -315,13 +341,31 @@ TEST(ascii_to_utf8, insufficient_result_array_size) { const char* input = "e\\u2071\\u1d60 = cos(\\u03d5) + i*sin(\\u03d5)\\n"; Error_Message_Expected(); - size_t size = ascii_to_utf8(input, actual_output, sizeof(actual_output)); + size_t size = unescape_to_utf8(input, actual_output, sizeof(actual_output)); + EXPECT_EQ(0, size); } // ------------------------------------------------------- // Test suite for utf8_to_wchar() // ------------------------------------------------------- +TEST(utf8_to_wchar, null_input ) { + /* Should generate error message if input character pointer is NULL. */ + wchar_t output[128]; + char* null_ptr = (char*)0; + Error_Message_Expected(); + size_t size = utf8_to_wchar( null_ptr, output, sizeof(output)/sizeof(wchar_t)); + EXPECT_EQ(0, size); +} + +TEST(utf8_to_wchar, null_output ) { + /* Should return the length of the string that would have been produced. */ + wchar_t* null_ptr = (wchar_t*)0; + const char* input = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)"; + size_t expected_size = wcslen(L"eⁱᵠ = cos(ϕ) + i*sin(ϕ)"); + size_t size = utf8_to_wchar( input, null_ptr, size_t(0)); + EXPECT_EQ(expected_size, size); +} /* The following three tests demonstrate three different ways to create the same input string. 
*/ @@ -348,7 +392,7 @@ TEST(utf8_to_wchar, test2) { TEST(utf8_to_wchar, test3) { wchar_t resultant_wchar_s[128]; - const char input[30] = {'e','\xe2','\x81','\xb1','\xe1', '\xb5','\xa0',' ','=',' ', + const char input[30] = {'e','\xe2','\x81','\xb1','\xe1','\xb5','\xa0',' ','=',' ', 'c','o','s','(','\xcf','\x95',')',' ','+',' ','i','*','s', 'i','n','(','\xcf','\x95',')','\0'}; const wchar_t* expected_wide_s = L"eⁱᵠ = cos(ϕ) + i*sin(ϕ)"; @@ -369,7 +413,7 @@ TEST(utf8_to_wchar, insufficient_result_array_size) { TEST(utf8_to_wchar, corrupted_input) { wchar_t resultant_wchar_s[128]; - char input[30] = {'e','\xe2','\x81','\xb1','\xe1', '\xb5','\xa0',' ','=',' ', + char input[30] = {'e','\xe2','\x81','\xb1','\xe1','\xb5','\xa0',' ','=',' ', 'c','o','s','(','\xcf','\x95',')',' ','+',' ','i','*','s', 'i','n','(','\xcf','\x95',')','\0'}; @@ -384,8 +428,26 @@ TEST(utf8_to_wchar, corrupted_input) { // ------------------------------------------------------- // Test suite for wchar_to_utf8() // ------------------------------------------------------- -TEST(wchar_to_utf8, test1) { +TEST(wchar_to_utf8, null_input ) { + /* Should generate error message if input character pointer is NULL. The size argument is the element count of the char output array. */ + wchar_t* null_ptr = (wchar_t*)0; + char output[128]; + Error_Message_Expected(); + size_t size = wchar_to_utf8( null_ptr, output, sizeof(output)/sizeof(char)); + EXPECT_EQ(0, size); +} +TEST(wchar_to_utf8, null_output ) { + /* If output is NULL, still generate the length of the array that would have been produced. */ + const wchar_t* input = L"eⁱᵠ = cos(ϕ) + i*sin(ϕ)"; + char* null_ptr = (char*)0; + size_t expected_size = strlen("eⁱᵠ = cos(ϕ) + i*sin(ϕ)"); + size_t size = wchar_to_utf8( input, null_ptr, (size_t)0); + EXPECT_EQ(expected_size, size); +} + +TEST(wchar_to_utf8, test1) { + /* Should convert wchar_t array to a UTF-8 array.
*/ char resultant_utf8_s[128]; const wchar_t* wide_s = L"eⁱᵠ = cos(ϕ) + i*sin(ϕ)"; const char* expected_utf8_s = "eⁱᵠ = cos(ϕ) + i*sin(ϕ)"; @@ -395,6 +457,29 @@ TEST(wchar_to_utf8, test1) { EXPECT_EQ(true, test_result); } +TEST(wchar_to_utf8, test2) { + /* Same test as above, but input is a constrained array. A variant of this is used below.*/ + char resultant_utf8_s[128]; + wchar_t wide_s[32] = { L'e', L'ⁱ', L'ᵠ', L' ', L'=', L' ', L'c', L'o', L's', L'(', L'ϕ', L')', + L' ', L'+', L' ', L'i', L'*', L's', L'i', L'n', L'(', L'ϕ', L')' }; + const char* expected_utf8_s = "eⁱᵠ = cos(ϕ) + i*sin(ϕ)"; + + wchar_to_utf8(wide_s, resultant_utf8_s, sizeof(resultant_utf8_s)/sizeof(char)); + bool test_result = (strcmp(expected_utf8_s, resultant_utf8_s) == 0); + EXPECT_EQ(true, test_result); +} + +TEST(wchar_to_utf8, invalid_unicode) { + /* Should detect an invalid unicode codepoint. 0x110000 is one past U+10FFFF, the largest valid codepoint. NOTE(review): with a 16-bit wchar_t the cast below would truncate the value; confirm this test is only meaningful where wchar_t is 32 bits. */ + char resultant_utf8_s[128]; + wchar_t wide_s[32] = { L'e', L'ⁱ', L'ᵠ', L' ', L'=', L' ', L'c', L'o', L's', L'(', L'ϕ', L')', + L' ', L'+', L' ', L'i', L'*', L's', L'i', L'n', L'(', L'ϕ', L')' }; + wide_s[14] = (wchar_t)0x110000; + Error_Message_Expected(); + size_t size = wchar_to_utf8(wide_s, resultant_utf8_s, sizeof(resultant_utf8_s)/sizeof(char)); + EXPECT_EQ(0, size); +} + TEST(wchar_to_utf8, insufficient_result_array_size) { char resultant_utf8_s[16];