From a29045005c18e7c751694d33daef215638f10e22 Mon Sep 17 00:00:00 2001
From: "John M. Penn"
Date: Thu, 25 Oct 2018 15:14:52 -0500
Subject: [PATCH] Create a suite of unicode utilities.

---
 Review note: the new files below were revised after the original submission,
 so the abbreviated blob ids on their "index" lines are no longer meaningful.

 Makefile                                      |   3 +-
 include/trick/unicode_utils.h                 |  54 ++
 trick_source/trick_utils/unicode/Makefile     |   5 +
 .../trick_utils/unicode/src/unicode_utils.c   | 477 ++++++++++++++++++
 .../trick_utils/unicode/test/Makefile         |  41 ++
 .../unicode/test/unicode_utils_test.cpp       | 431 ++++++++++++++++
 6 files changed, 1010 insertions(+), 1 deletion(-)
 create mode 100644 include/trick/unicode_utils.h
 create mode 100644 trick_source/trick_utils/unicode/Makefile
 create mode 100644 trick_source/trick_utils/unicode/src/unicode_utils.c
 create mode 100644 trick_source/trick_utils/unicode/test/Makefile
 create mode 100644 trick_source/trick_utils/unicode/test/unicode_utils_test.cpp

diff --git a/Makefile b/Makefile
index ba43b81c..6b0192b5 100644
--- a/Makefile
+++ b/Makefile
@@ -96,7 +96,8 @@ UTILS_DIRS := \
 	${TRICK_HOME}/trick_source/trick_utils/comm \
 	${TRICK_HOME}/trick_source/trick_utils/shm \
 	${TRICK_HOME}/trick_source/trick_utils/math \
-	${TRICK_HOME}/trick_source/trick_utils/units
+	${TRICK_HOME}/trick_source/trick_utils/units \
+	${TRICK_HOME}/trick_source/trick_utils/unicode
 UTILS_OBJS := $(addsuffix /object_$(TRICK_HOST_CPU)/*.o ,$(UTILS_DIRS))
 
 # filter out the directories that make their own libraries
diff --git a/include/trick/unicode_utils.h b/include/trick/unicode_utils.h
new file mode 100644
index 00000000..1d966e5d
--- /dev/null
+++ b/include/trick/unicode_utils.h
@@ -0,0 +1,54 @@
+#ifndef UNICODE_UTILS_H
+#define UNICODE_UTILS_H
+#include <stddef.h>
+#include <stdint.h>
+
+/* Maintainer: John M. Penn */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Convert Unicode codepoint to UTF-32. Validates that it's a legal unicode value,
+   that is: not a surrogate, and not larger than 0x10ffff.
+   Returns 1, if successful, 0 otherwise. */
+size_t ucodepoint_to_utf32(unsigned int codePoint, int32_t *out);
+
+/* Convert Unicode codepoint to UTF-16.
+   Returns the number of UTF-16 elements (1..2) necessary to represent the codepoint,
+   or 0 on failure.
+ */
+size_t ucodepoint_to_utf16(unsigned int codePoint, int16_t (*out)[2]);
+
+/* Convert Unicode codepoint to UTF-8.
+   Returns the number of UTF-8 elements (1..4) necessary to represent the codepoint,
+   or 0 on failure.
+ */
+size_t ucodepoint_to_utf8(unsigned int codePoint, char (*out)[4]);
+
+
+
+/* Un-escape C escape sequences, including \u and \U Unicode escape sequences,
+   in an ASCII character array, producing a UTF-8 character array. Return the
+   number of elements in the character string, or 0 on failure.
+*/
+size_t ascii_to_utf8(const char *in, char *out, size_t outSize);
+
+/* Escape ('\' escape codes) all unicode and non-printable ASCII characters
+   in a UTF-8 character string. Return the number of elements in the character
+   string, or 0 on failure.
+*/
+size_t utf8_to_printable_ascii(const char *in, char *out, size_t outSize);
+
+/* Convert a UTF-8 character array to a wchar_t array. Supports 16, and 32 bit wchar_t.
+   Return the number of elements in the wchar_t string, or 0 on failure. */
+size_t utf8_to_wchar(const char *in, wchar_t *out, size_t outSize);
+
+/* Convert wchar_t character array to UTF-8. Return the number of elements in
+   the character (utf-8) string, or 0 on failure. */
+size_t wchar_to_utf8(const wchar_t *in, char *out, size_t outSize);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/trick_source/trick_utils/unicode/Makefile b/trick_source/trick_utils/unicode/Makefile
new file mode 100644
index 00000000..a181cedf
--- /dev/null
+++ b/trick_source/trick_utils/unicode/Makefile
@@ -0,0 +1,5 @@
+
+include ${TRICK_HOME}/share/trick/makefiles/Makefile.common
+include ${TRICK_HOME}/share/trick/makefiles/Makefile.tricklib
+-include Makefile_deps
+
diff --git a/trick_source/trick_utils/unicode/src/unicode_utils.c b/trick_source/trick_utils/unicode/src/unicode_utils.c
new file mode 100644
index 00000000..e2229bef
--- /dev/null
+++ b/trick_source/trick_utils/unicode/src/unicode_utils.c
@@ -0,0 +1,477 @@
+#include "trick/unicode_utils.h"
+
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Maintainer: John M. Penn */
+
+/* Returns non-zero if codePoint lies in the range reserved for UTF-16 surrogates. */
+static int is_surrogate(unsigned int codePoint) {
+    return (codePoint >= 0xd800 && codePoint <= 0xdfff);
+}
+
+/* Decode the UTF-8 sequence that starts at *cursor.
+   On success, store the decoded value in *codePoint, advance *cursor past the
+   sequence, and return 1. Return 0, leaving *cursor unchanged, if the sequence
+   is ill-formed: a continuation byte in isolation, an invalid lead byte, a
+   missing continuation byte (this includes running into the terminating NUL),
+   or an overlong encoding. Range checks (surrogates, values above 0x10ffff)
+   are left to the caller. */
+static int utf8_decode(const char **cursor, unsigned int *codePoint) {
+
+    /* Smallest value that legitimately needs 0, 1, 2 or 3 continuation bytes. */
+    static const unsigned int minValue[4] = { 0x0, 0x80, 0x800, 0x10000 };
+    const unsigned char *bytes = (const unsigned char *)*cursor;
+    unsigned int value;
+    int continuationBytes;
+
+    if (bytes[0] < 0x80) {            /* 0xxxxxxx : ASCII */
+        value = bytes[0];
+        continuationBytes = 0;
+    } else if (bytes[0] < 0xc0) {     /* 10xxxxxx : continuation byte in isolation. */
+        return 0;
+    } else if (bytes[0] < 0xe0) {     /* 110xxxxx : start of a 2-byte sequence. */
+        value = bytes[0] & 0x1f;
+        continuationBytes = 1;
+    } else if (bytes[0] < 0xf0) {     /* 1110xxxx : start of a 3-byte sequence. */
+        value = bytes[0] & 0x0f;
+        continuationBytes = 2;
+    } else if (bytes[0] < 0xf8) {     /* 11110xxx : start of a 4-byte sequence. */
+        value = bytes[0] & 0x07;
+        continuationBytes = 3;
+    } else {                          /* 11111xxx : never valid in UTF-8. */
+        return 0;
+    }
+
+    for (int ii = 1; ii <= continuationBytes; ii++) {
+        if ((bytes[ii] & 0xc0) != 0x80) {   /* Also stops at the terminating NUL. */
+            return 0;
+        }
+        value = (value << 6) | (bytes[ii] & 0x3f);
+    }
+    if (value < minValue[continuationBytes]) {   /* Overlong encoding. */
+        return 0;
+    }
+
+    *codePoint = value;
+    *cursor += continuationBytes + 1;
+    return 1;
+}
+
+/* Convert a codepoint to UTF-32. Surrogates, and values larger than 0x10ffff
+   are rejected. Returns 1 if successful, 0 otherwise. */
+size_t ucodepoint_to_utf32(unsigned int codePoint, int32_t *out) {
+
+    if (out == NULL) {
+        fprintf(stderr,"%s:ERROR: int32_t pointer (out) is NULL. No conversion performed.\n", __func__);
+        return 0;
+    }
+    if (codePoint > 0x10ffff) {
+        fprintf(stderr,"%s:ERROR: Invalid Unicode value (too big): 0x%08x.\n", __func__, codePoint);
+        return 0;
+    }
+    if (is_surrogate(codePoint)) {
+        fprintf(stderr,"%s:ERROR: 0x%08x is reserved for UTF-16, as a surrogate codepoint.\n", __func__, codePoint);
+        return 0;
+    }
+    *out = (int32_t)codePoint;
+    return 1;
+}
+
+/* Convert a codepoint to UTF-16. Returns the number of 16-bit elements written
+   to (*out), that is 1 or 2, or 0 if the codepoint is not valid. */
+size_t ucodepoint_to_utf16(unsigned int codePoint, int16_t (*out)[2]) {
+
+    if (out == NULL) {
+        fprintf(stderr,"%s:ERROR: int16_t array pointer (out) is NULL. No conversion performed.\n", __func__);
+        return 0;
+    }
+    if (codePoint > 0x10ffff) {
+        fprintf(stderr,"%s:ERROR: Invalid Unicode value (too big): 0x%04x.\n", __func__, codePoint);
+        return 0;
+    }
+    if (is_surrogate(codePoint)) {
+        fprintf(stderr,"%s:ERROR: Invalid Unicode value (surrogate): 0x%04x.\n", __func__, codePoint);
+        return 0;
+    }
+    if (codePoint > 0xffff) {
+        /* High-surrogate code points are in the range U+D800–U+DBFF.
+         * Low-surrogate code points are in the range U+DC00–U+DFFF.
+         * A high-surrogate code point followed by a low-surrogate code point form a
+         * surrogate pair in UTF-16 to represent code points greater than U+FFFF.
+         * The pair carries the 20-bit value (codePoint - 0x10000): its upper 10 bits
+         * go in the high-surrogate, and its lower 10 bits go in the low-surrogate.
+         */
+        unsigned int offset = codePoint - 0x10000;
+        (*out)[0] = (int16_t)(0xd800 + (offset >> 10));     /* Create High Surrogate */
+        (*out)[1] = (int16_t)(0xdc00 + (offset & 0x03ff));  /* Create Low Surrogate */
+        return 2;
+    }
+    (*out)[0] = (int16_t)codePoint;
+    return 1;
+}
+
+/* Convert a codepoint to UTF-8. Returns the number of bytes written to (*out),
+   that is 1 .. 4, or 0 if the codepoint is not valid. */
+size_t ucodepoint_to_utf8(unsigned int codePoint, char (*out)[4]) {
+
+    if (out == NULL) {
+        fprintf(stderr,"%s:ERROR: char array pointer (out) is NULL. No conversion performed.\n", __func__);
+        return 0;
+    }
+    if (codePoint > 0x10ffff || is_surrogate(codePoint)) {
+        fprintf(stderr,"%s:ERROR: Invalid Unicode value: 0x%04x.\n", __func__, codePoint);
+        return 0;
+    }
+
+    if (codePoint <= 0x7f) {                                   /* ASCII */
+        (*out)[0] = (char)codePoint;                           /* 0xxxxxxx 0x00..0x7F*/
+        return 1;
+
+    } else if (codePoint <= 0x7ff) {                           /* Two-byte Sequence */
+        (*out)[0] = (char)(0xc0 | ((codePoint >> 6) & 0x1f));  /* 110xxxxx 0xC0..0xDF*/
+        (*out)[1] = (char)(0x80 | (codePoint & 0x3f));         /* 10xxxxxx */
+        return 2;
+
+    } else if (codePoint <= 0xffff) {                          /* Three byte Sequence */
+        (*out)[0] = (char)(0xe0 | ((codePoint >> 12) & 0x0f)); /* 1110xxxx 0xE0..0xEF*/
+        (*out)[1] = (char)(0x80 | ((codePoint >> 6) & 0x3f));  /* 10xxxxxx */
+        (*out)[2] = (char)(0x80 | (codePoint & 0x3f));         /* 10xxxxxx */
+        return 3;
+
+    } else {                                                   /* Four-byte Sequence */
+        (*out)[0] = (char)(0xf0 | ((codePoint >> 18) & 0x07)); /* 11110xxx 0xF0..0xF7*/
+        (*out)[1] = (char)(0x80 | ((codePoint >> 12) & 0x3f)); /* 10xxxxxx */
+        (*out)[2] = (char)(0x80 | ((codePoint >> 6) & 0x3f));  /* 10xxxxxx */
+        (*out)[3] = (char)(0x80 | (codePoint & 0x3f));         /* 10xxxxxx 0x80..0xBF */
+        return 4;
+    }
+}
+
+/* Escape ('\' escape codes) all Unicode and non-printable ASCII characters in
+   the UTF-8 string (in), storing the resulting ASCII string in (out).
+   Returns the length of the result, or 0 (with out set to "") on error. */
+size_t utf8_to_printable_ascii(const char *in, char *out, size_t outSize) {
+
+    size_t len = 0;
+
+    if (out == NULL) {
+        fprintf(stderr,"%s:ERROR: ASCII char pointer (out) is NULL. No conversion performed.\n", __func__);
+        return 0;
+    }
+    if (outSize == 0) {
+        fprintf(stderr,"%s:ERROR: Insufficient room in (out) array.\n", __func__);
+        return 0;
+    }
+    out[0] = 0;
+
+    if (in == NULL) {
+        fprintf(stderr,"%s:ERROR: UTF8 char-pointer (in) is NULL. No conversion performed.\n", __func__);
+        return 0;
+    }
+
+    while (*in != 0) {
+        unsigned int codePoint;
+        char wks[11];   /* Room for the longest escape, "\Uxxxxxxxx", and its terminator. */
+        size_t wksLen;
+
+        if (!utf8_decode(&in, &codePoint)) {
+            fprintf(stderr,"%s:ERROR: UTF8 string (in) appears to be corrupted.\n", __func__);
+            out[0] = 0;
+            return 0;
+        }
+
+        if (codePoint > 0xffff) {
+            snprintf(wks, sizeof(wks), "\\U%08x", codePoint);
+        } else if (codePoint > 0x7f) {
+            snprintf(wks, sizeof(wks), "\\u%04x", codePoint);
+        } else if (codePoint == '\a') {
+            snprintf(wks, sizeof(wks), "\\a");
+        } else if (codePoint == '\b') {
+            snprintf(wks, sizeof(wks), "\\b");
+        } else if (codePoint == '\f') {
+            snprintf(wks, sizeof(wks), "\\f");
+        } else if (codePoint == '\n') {
+            snprintf(wks, sizeof(wks), "\\n");
+        } else if (codePoint == '\r') {
+            snprintf(wks, sizeof(wks), "\\r");
+        } else if (codePoint == '\t') {
+            snprintf(wks, sizeof(wks), "\\t");
+        } else if (codePoint == '\v') {
+            snprintf(wks, sizeof(wks), "\\v");
+        } else if (isprint((int)codePoint)) {
+            snprintf(wks, sizeof(wks), "%c", (int)codePoint);
+        } else {
+            snprintf(wks, sizeof(wks), "\\x%02x", codePoint);
+        }
+
+        /* (len < outSize) always holds here. There must be room for wks, and for the terminator. */
+        wksLen = strlen(wks);
+        if (wksLen >= (outSize - len)) {
+            fprintf(stderr,"%s:ERROR: Insufficient room in (out) array.\n", __func__);
+            out[0] = 0;
+            return 0;
+        }
+        memcpy(&out[len], wks, wksLen + 1);
+        len += wksLen;
+    }
+    return len;
+}
+
+/* Un-escapes ASCII and Unicode escape sequences, and encodes them into UTF-8.
+   Returns the length of the result, or 0 (with out set to "") on error. */
+size_t ascii_to_utf8(const char *in, char *out, size_t outSize) {
+
+    unsigned int codePoint = 0;
+    size_t len = 0;
+    int state = 0;
+    int digitsExpected = 0;
+
+    if (out == NULL) {
+        fprintf(stderr,"%s:ERROR: ASCII char pointer (out) is NULL. No conversion performed.\n", __func__);
+        return 0;
+    }
+    if (outSize == 0) {
+        fprintf(stderr,"%s:ERROR: Insufficient room in char array (out).\n", __func__);
+        return 0;
+    }
+    out[0] = 0;
+
+    if (in == NULL) {
+        fprintf(stderr,"%s:ERROR: UTF8 char-pointer (in) is NULL. No conversion performed.\n", __func__);
+        return 0;
+    }
+
+    while (*in != 0) {
+        unsigned char ch = (unsigned char)*in;
+        char temp[4];       /* UTF-8 bytes produced by this input character, if any. */
+        size_t count = 0;   /* Number of bytes in temp. */
+
+        if (ch > 0x7f) { /* All input characters must be ASCII. */
+            fprintf(stderr,"%s:ERROR: ASCII string (in) contains non-ASCII values.\n", __func__);
+            out[0] = 0;
+            return 0;
+        }
+        /* All escaped characters will be un-escaped. */
+        switch(state) {
+            case 0: { // Normal State
+                if (ch == '\\') {
+                    state = 1;
+                } else {
+                    temp[0] = (char)ch;
+                    count = 1;
+                }
+            } break;
+            case 1: { // Escaped State ( that is: we've found a '\' character.)
+                state = 0;
+                count = 1;
+                switch(ch) {
+                    case '\'':
+                    case '\"':
+                    case '\?':
+                    case '\\': temp[0] = (char)ch; break;
+                    case 'a':  temp[0] = '\a'; break;
+                    case 'b':  temp[0] = '\b'; break;
+                    case 'f':  temp[0] = '\f'; break;
+                    case 'n':  temp[0] = '\n'; break;
+                    case 'r':  temp[0] = '\r'; break;
+                    case 't':  temp[0] = '\t'; break;
+                    case 'v':  temp[0] = '\v'; break;
+                    case 'x':  digitsExpected = 2; state = 2; count = 0; break;
+                    case 'u':  digitsExpected = 4; state = 2; count = 0; break;
+                    case 'U':  digitsExpected = 8; state = 2; count = 0; break;
+                    default : {
+                        fprintf(stderr,"%s:ERROR: Unrecognized escape sequence \\%c in char string (in).\n", __func__, ch);
+                        out[0] = 0;
+                        return 0;
+                    }
+                } // switch ch
+            } break;
+            case 2: { // Escaped Unicode ( that is: we've found '\x', '\u' or '\U'.)
+                unsigned int digit = 0;
+                if (ch >= '0' && ch <= '9') {
+                    digit = ch - (unsigned int)'0';
+                } else if (ch >= 'A' && ch <= 'F') {
+                    digit = ch - (unsigned int)'A' + 10;
+                } else if (ch >= 'a' && ch <= 'f') {
+                    digit = ch - (unsigned int)'a' + 10;
+                } else {
+                    fprintf(stderr,"%s:ERROR: Insufficient hexadecimal digits following"
+                            " \\x, \\u, or \\U escape code in char string (in).\n", __func__);
+                    out[0] = 0;
+                    return 0;
+                }
+                codePoint = codePoint * 16 + digit;
+                digitsExpected -- ;
+                if ( digitsExpected == 0 ) {
+                    count = ucodepoint_to_utf8(codePoint, &temp);
+                    if (count == 0) {   /* Not a valid codepoint. Already reported. */
+                        out[0] = 0;
+                        return 0;
+                    }
+                    codePoint = 0;
+                    state = 0;
+                }
+            } break;
+            default: {
+                out[0] = 0;
+                return 0;
+            } break;
+        }
+
+        if (count > 0) {
+            /* (len < outSize) always holds here. Leave room for the terminator. */
+            if (count >= (outSize - len)) {
+                fprintf(stderr,"%s:ERROR: Insufficient room in char array (out).\n", __func__);
+                out[0] = 0;
+                return 0;
+            }
+            memcpy( &out[len], temp, sizeof(char) * count );
+            len += count;
+        }
+        in ++;
+    }
+
+    if (state != 0) {
+        fprintf(stderr,"%s:ERROR: Incomplete escape sequence at the end of char string (in).\n", __func__);
+        out[0] = 0;
+        return 0;
+    }
+    out[len] = 0; /* NULL termination of string. */
+    return len;
+}
+
+/* Convert the UTF-8 string (in) to a wchar_t string (out) of at most outSize
+   elements, including the terminator. Supports 16, and 32 bit wchar_t.
+   Returns the length of the result, or 0 (with out set to L"") on error. */
+size_t utf8_to_wchar(const char *in, wchar_t *out, size_t outSize) {
+
+    size_t len = 0;
+
+    if (out == NULL) {
+        fprintf(stderr,"%s:ERROR: wchar_t pointer (out) is NULL. No conversion performed.\n", __func__);
+        return 0;
+    }
+    if (outSize == 0) {
+        fprintf(stderr,"%s:ERROR: Insufficient room in wchar_t array (out).\n", __func__);
+        return 0;
+    }
+    out[0] = 0;
+
+    if (in == NULL) {
+        fprintf(stderr,"%s:ERROR: UTF8 char-pointer (in) is NULL. No conversion performed.\n", __func__);
+        return 0;
+    }
+
+    while (*in != 0) {
+        unsigned int codePoint;
+        wchar_t elements[2] = { 0, 0 };
+        size_t count = 0;
+
+        if (!utf8_decode(&in, &codePoint)) {
+            fprintf(stderr,"%s:ERROR: UTF8 string (in) appears to be corrupted.\n", __func__);
+            out[0] = 0;
+            return 0;
+        }
+
+        if (sizeof(wchar_t) == 4) {          // wchar_t is UTF-32
+            int32_t temp = 0;
+            count = ucodepoint_to_utf32(codePoint, &temp);
+            elements[0] = (wchar_t)temp;
+        } else if (sizeof(wchar_t) == 2) {   // wchar_t is UTF-16
+            int16_t temp[2] = { 0, 0 };
+            count = ucodepoint_to_utf16(codePoint, &temp);
+            elements[0] = (wchar_t)(uint16_t)temp[0];
+            elements[1] = (wchar_t)(uint16_t)temp[1];
+        } else {
+            fprintf(stderr,"%s:ERROR: Unsupported wchar_t size.\n", __func__);
+            out[0] = 0;
+            return 0;
+        }
+        if (count == 0) {   /* Not a valid codepoint. Already reported. */
+            out[0] = 0;
+            return 0;
+        }
+
+        /* (len < outSize) always holds here. Leave room for the terminator. */
+        if (count >= (outSize - len)) {
+            fprintf(stderr,"%s:ERROR: Insufficient room in wchar_t array (out).\n", __func__);
+            out[0] = 0;
+            return 0;
+        }
+        memcpy( &out[len], elements, sizeof(wchar_t) * count );
+        len += count;
+    }
+    out[len] = 0;
+    return len;
+}
+
+/* Convert the wchar_t string (in) to a UTF-8 string (out) of at most outSize
+   bytes, including the terminator. UTF-16 surrogate pairs are combined.
+   Returns the length of the result, or 0 (with out set to "") on error. */
+size_t wchar_to_utf8(const wchar_t *in, char *out, size_t outSize ) {
+
+    size_t len = 0;
+
+    if (out == NULL) {
+        fprintf(stderr,"%s:ERROR: char pointer (out) is NULL. No conversion performed.\n", __func__);
+        return 0;
+    }
+    if (outSize == 0) {
+        fprintf(stderr,"%s:ERROR: Insufficient room in char array (out).\n", __func__);
+        return 0;
+    }
+    out[0] = 0;
+
+    if (in == NULL) {
+        fprintf(stderr,"%s:ERROR: wchar_t pointer (in) is NULL. No conversion performed.\n", __func__);
+        return 0;
+    }
+
+    while ( *in != 0 ) {
+        /* wchar_t may be signed. A negative element becomes a huge value here, and is rejected below. */
+        unsigned long element = (unsigned long)*in;
+        unsigned int codePoint;
+        char temp[4];
+        size_t count;
+
+        if (element >= 0xd800 && element <= 0xdbff) {          /* If High-surrogate. */
+            unsigned long low = (unsigned long)in[1];           /* At worst, this is the terminator. */
+            if (low < 0xdc00 || low > 0xdfff) {
+                fprintf(stderr,"%s:ERROR: High-surrogate is not followed by a low-surrogate.\n", __func__);
+                out[0] = 0;
+                return 0;
+            }
+            codePoint = (unsigned int)(0x10000 + ((element - 0xd800) << 10) + (low - 0xdc00));
+            in += 2;
+        } else if (element >= 0xdc00 && element <= 0xdfff) {   /* If Low-surrogate. */
+            fprintf(stderr,"%s:ERROR: Low-surrogate is not preceded by a high-surrogate.\n", __func__);
+            out[0] = 0;
+            return 0;
+        } else if (element <= 0x10ffff) {                      /* Max Unicode Value */
+            codePoint = (unsigned int)element;
+            in ++;
+        } else {
+            fprintf(stderr,"%s:ERROR: Invalid Unicode value.\n", __func__);
+            out[0] = 0;
+            return 0;
+        }
+
+        count = ucodepoint_to_utf8(codePoint, &temp);
+        if (count == 0) {   /* Not a valid codepoint. Already reported. */
+            out[0] = 0;
+            return 0;
+        }
+        /* (len < outSize) always holds here. Leave room for the terminator. */
+        if (count >= (outSize - len)) {
+            fprintf(stderr,"%s:ERROR: Insufficient room in char array (out).\n", __func__);
+            out[0] = 0;
+            return 0;
+        }
+        memcpy( &out[len], temp, sizeof(char) * count );
+        len += count;
+    }
+    out[len] = 0; /* NULL termination of string. */
+    return len;
+}
diff --git a/trick_source/trick_utils/unicode/test/Makefile b/trick_source/trick_utils/unicode/test/Makefile
new file mode 100644
index 00000000..a517ce5a
--- /dev/null
+++ b/trick_source/trick_utils/unicode/test/Makefile
@@ -0,0 +1,41 @@
+
+#SYNOPSIS:
+#
+#   make [all]  - makes everything.
+#   make TARGET - makes the given target.
+#   make clean  - removes all files generated by make.
+
+include ${TRICK_HOME}/share/trick/makefiles/Makefile.common
+
+# Flags passed to the preprocessor.
+TRICK_CPPFLAGS += -I$(GTEST_HOME)/include -I$(TRICK_HOME)/include -g -Wall -Wextra -DGTEST_HAS_TR1_TUPLE=0
+
+TRICK_LIBS = ${TRICK_LIB_DIR}/libtrick.a
+TRICK_EXEC_LINK_LIBS += -L${GTEST_HOME}/lib64 -L${GTEST_HOME}/lib -lgtest -lgtest_main -lpthread
+
+# Added for Ubuntu... not required for other systems.
+TRICK_EXEC_LINK_LIBS += -lpthread
+
+# All tests produced by this Makefile.  Remember to add new tests you
+# created to the list.
+TESTS = unicode_utils_test
+
+OTHER_OBJECTS =
+
+# House-keeping build targets.
+
+all : $(TESTS)
+
+test: $(TESTS)
+	./unicode_utils_test --gtest_output=xml:${TRICK_HOME}/trick_test/Unicode_utils.xml
+
+clean :
+	rm -f $(TESTS) *.o
+	rm -rf io_src xml
+
+unicode_utils_test.o : unicode_utils_test.cpp
+	$(TRICK_CPPC) $(TRICK_CPPFLAGS) -c $<
+
+unicode_utils_test : unicode_utils_test.o
+	$(TRICK_CPPC) $(TRICK_CPPFLAGS) -o $@ $^ $(OTHER_OBJECTS) -L${TRICK_HOME}/lib_${TRICK_HOST_CPU} $(TRICK_LIBS) $(TRICK_EXEC_LINK_LIBS)
+
diff --git a/trick_source/trick_utils/unicode/test/unicode_utils_test.cpp b/trick_source/trick_utils/unicode/test/unicode_utils_test.cpp
new file mode 100644
index 00000000..a2875e86
--- /dev/null
+++ b/trick_source/trick_utils/unicode/test/unicode_utils_test.cpp
@@ -0,0 +1,431 @@
+#include <gtest/gtest.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <wchar.h>
+#include "trick/unicode_utils.h"
+
+const char* ISO_6429_Restore_Default   = "\x1b[00m";
+const char* ISO_6429_Bold              = "\x1b[01m";
+const char* ISO_6429_Underline         = "\x1b[04m";
+const char* ISO_6429_Black_Foreground  = "\x1b[30m";
+const char* ISO_6429_Red_Foreground    = "\x1b[31m";
+const char* ISO_6429_Green_Foreground  = "\x1b[32m";
+const char* ISO_6429_Yellow_Foreground = "\x1b[33m";
+const char* ISO_6429_Blue_Foreground   = "\x1b[34m";
+const char* ISO_6429_Purple_Foreground = "\x1b[35m";
+const char* ISO_6429_Cyan_Foreground   = "\x1b[36m";
+const char* ISO_6429_White_Foreground  = "\x1b[37m";
+const char* ISO_6429_Black_Background  = "\x1b[40m";
+const char* ISO_6429_Red_Background    = "\x1b[41m";
+const char* ISO_6429_Green_Background  = "\x1b[42m";
+const char* ISO_6429_Yellow_Background = "\x1b[43m";
+const char* ISO_6429_Blue_Background   = "\x1b[44m";
+const char* ISO_6429_Purple_Background = "\x1b[45m";
+const char* ISO_6429_Cyan_Background   = "\x1b[46m";
+const char* ISO_6429_White_Background  = "\x1b[47m";
+
+void Error_Message_Expected() {
+    printf("%s%s%s", ISO_6429_Blue_Background, ISO_6429_White_Foreground, ISO_6429_Underline);
+    printf("An error message is expected from this test.");
+    printf("%s\n", ISO_6429_Restore_Default );
+}
+
+// -------------------------------------------------------
+// Test suite for ucodepoint_to_utf32()
+// -------------------------------------------------------
+TEST(ucodepoint_to_utf32, valid ) {
+    /* Aegean Number Ten, U+10110 is a valid code point. */
+    int32_t out;
+    size_t size = ucodepoint_to_utf32(0x10110, &out);
+    EXPECT_EQ(1u, size);
+}
+
+TEST(ucodepoint_to_utf32, invalid ) {
+    /* 0xdead is in the range [d800 .. dfff], and reserved for UTF-16
+       surrogates. They are not valid unicode codepoints. So, if we
+       attempt to convert a surrogate as a codepoint, we should get
+       an error message.
+     */
+    int32_t out;
+    Error_Message_Expected();
+    size_t size = ucodepoint_to_utf32(0xdead, &out);
+    EXPECT_EQ(0u, size);
+}
+
+// -------------------------------------------------------
+// Test suite for ucodepoint_to_utf16()
+// -------------------------------------------------------
+TEST(ucodepoint_to_utf16, two_16bit_element_sequence ) {
+    /* Note that unicode is a 21-bit encoding.
+       Because Aegean Number Ten (U+10110) is larger than can be stored in 16-bits,
+       UTF-16 requires two 16-bit values, called surrogates to encode it.
+       First 0x10000 is subtracted from the codepoint, leaving a 20-bit value (0x00110).
+       High-surrogate = 0xd800 + most significant 10 bits of that value  = 0xd800.
+       The low-surrogate carries the least significant 10 bits of that value.
+       Low-surrogate  = 0xdc00 + least significant 10 bits of that value = 0xdd10.
+     */
+    int16_t out[2];
+    size_t size = ucodepoint_to_utf16(0x10110, &out);
+    EXPECT_EQ(2u, size);
+    EXPECT_EQ((int16_t)0xd800, out[0]);
+    EXPECT_EQ((int16_t)0xdd10, out[1]);
+}
+
+TEST(ucodepoint_to_utf16, one_16bit_element) {
+    int16_t out[2];
+    /* A valid codepoint that can be stored within 16-bits should be
+       equal to its UTF-16 character value. */
+    size_t size = ucodepoint_to_utf16(0x03d5, &out);
+    EXPECT_EQ(1u, size);
+    EXPECT_EQ((int16_t)0x03d5, out[0]);
+}
+
+TEST(ucodepoint_to_utf16, invalid_surrogate ) {
+    /* Input codepoint can not be a surrogate. */
+    int16_t out[2];
+    Error_Message_Expected();
+    size_t size = ucodepoint_to_utf16(0xdead, &out);
+    EXPECT_EQ(0u, size);
+}
+
+TEST(ucodepoint_to_utf16, codepoint_too_big ) {
+    /* Input codepoint can not be > 0x10ffff, which is the largest valid unicode codepoint. */
+    int16_t out[2];
+    Error_Message_Expected();
+    size_t size = ucodepoint_to_utf16(0x110000, &out);
+    EXPECT_EQ(0u, size);
+}
+
+TEST(ucodepoint_to_utf16, largest_codepoint ) {
+    /* U+10FFFF is the largest valid codepoint. Its surrogates are 0xdbff and 0xdfff. */
+    int16_t out[2];
+    size_t size = ucodepoint_to_utf16(0x10ffff, &out);
+    EXPECT_EQ(2u, size);
+    EXPECT_EQ((int16_t)0xdbff, out[0]);
+    EXPECT_EQ((int16_t)0xdfff, out[1]);
+}
+
+// -------------------------------------------------------
+// Test suite for ucodepoint_to_utf8()
+// -------------------------------------------------------
+TEST(ucodepoint_to_utf8, four_8bit_element_sequence ) {
+    char out[4];
+    /* Aegean Number Ten, U+10110 is a valid codepoint that
+       requires four bytes to encode in utf-8. */
+    size_t size = ucodepoint_to_utf8(0x10110, &out);
+    EXPECT_EQ(4u, size);
+}
+
+TEST(ucodepoint_to_utf8, three_8bit_element_sequence ) {
+    char out[4];
+    /* Superscript Latin Small Letter I, U+2071 is a valid
+       codepoint that requires three bytes to encode in utf-8. */
+    size_t size = ucodepoint_to_utf8(0x2071, &out);
+    EXPECT_EQ(3u, size);
+}
+
+TEST(ucodepoint_to_utf8, two_8bit_element_sequence ) {
+    char out[4];
+    /* Greek Phi Symbol, U+03d5 is a valid codepoint that
+       requires two bytes to encode in utf-8. */
+    size_t size = ucodepoint_to_utf8(0x03d5, &out);
+    EXPECT_EQ(2u, size);
+}
+
+TEST(ucodepoint_to_utf8, ascii ) {
+    char out[4];
+    /* Latin Small Letter A, U+0061 is a valid codepoint that
+       requires one byte to encode in utf-8. Below 0x7f, Unicode
+       and ASCII are identical. */
+    size_t size = ucodepoint_to_utf8('a', &out);
+    EXPECT_EQ(1u, size);
+}
+
+// -------------------------------------------------------
+// Test suite for utf8_to_printable_ascii()
+// -------------------------------------------------------
+TEST(utf8_to_printable_ascii, null_input ) {
+    /* Should generate error message if input character pointer is NULL. */
+    char resultant_ascii_s[128];
+    char* null_ptr = (char*)0;
+    Error_Message_Expected();
+    size_t size = utf8_to_printable_ascii( null_ptr, resultant_ascii_s, sizeof(resultant_ascii_s));
+    EXPECT_EQ(0u, size);
+}
+
+TEST(utf8_to_printable_ascii, null_output ) {
+    /* Should generate error message if output character pointer is NULL. */
+    char* null_ptr = (char*)0;
+    const char* input = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)";
+    Error_Message_Expected();
+    size_t size = utf8_to_printable_ascii( input, null_ptr, size_t(5));
+    EXPECT_EQ(0u, size);
+}
+
+TEST(utf8_to_printable_ascii, normal_1 ) {
+    char resultant_ascii_s[128];
+    /* utf8_to_printable_ascii() should escape all Unicode and non-printable ASCII characters. */
+    const char* utf8_s = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)\n";
+    const char* expected_ascii_s = "e\\u2071\\u1d60 = cos(\\u03d5) + i*sin(\\u03d5)\\n";
+    (void) utf8_to_printable_ascii( utf8_s, resultant_ascii_s, sizeof(resultant_ascii_s));
+    EXPECT_STREQ(expected_ascii_s, resultant_ascii_s);
+}
+
+TEST(utf8_to_printable_ascii, normal_2 ) {
+    char resultant_ascii_s[256];
+    /* utf8_to_printable_ascii() should escape all Unicode and non-printable ASCII characters. */
+    const char ascii[128] = {       '\x01','\x02','\x03','\x04','\x05','\x06','\x07','\x08','\x09','\x0a','\x0b','\x0c','\x0d','\x0e','\x0f',
+                             '\x10','\x11','\x12','\x13','\x14','\x15','\x16','\x17','\x18','\x19','\x1a','\x1b','\x1c','\x1d','\x1e','\x1f',
+                             '\x20','\x21','\x22','\x23','\x24','\x25','\x26','\x27','\x28','\x29','\x2a','\x2b','\x2c','\x2d','\x2e','\x2f',
+                             '\x30','\x31','\x32','\x33','\x34','\x35','\x36','\x37','\x38','\x39','\x3a','\x3b','\x3c','\x3d','\x3e','\x3f',
+                             '\x40','\x41','\x42','\x43','\x44','\x45','\x46','\x47','\x48','\x49','\x4a','\x4b','\x4c','\x4d','\x4e','\x4f',
+                             '\x50','\x51','\x52','\x53','\x54','\x55','\x56','\x57','\x58','\x59','\x5a','\x5b','\x5c','\x5d','\x5e','\x5f',
+                             '\x60','\x61','\x62','\x63','\x64','\x65','\x66','\x67','\x68','\x69','\x6a','\x6b','\x6c','\x6d','\x6e','\x6f',
+                             '\x70','\x71','\x72','\x73','\x74','\x75','\x76','\x77','\x78','\x79','\x7a','\x7b','\x7c','\x7d','\x7e','\x7f',
+                             '\x00'
+                            };
+
+    const char* expected_ascii_s = "\\x01\\x02\\x03\\x04\\x05\\x06\\a\\b\\t\\n\\v\\f"
+        "\\r\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a\\x1b\\x1c\\x1d\\x1e\\x1f"
+        " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f";
+
+    size_t size = utf8_to_printable_ascii( ascii, resultant_ascii_s, sizeof(resultant_ascii_s));
+    EXPECT_EQ(209u, size);
+    EXPECT_STREQ(expected_ascii_s, resultant_ascii_s);
+}
+
+/* The following are the utf-8 encodings of four unicode characters used in the following tests. */
+// Greek Phi Symbol                 => U+03d5  => 0xcf 0x95           // see: https://www.compart.com/en/unicode/U+03D5
+// Superscript Latin Small Letter I => U+2071  => 0xe2 0x81 0xb1      // see: https://www.compart.com/en/unicode/U+2071
+// Modifier Letter Small Greek Phi  => U+1D60  => 0xe1 0xb5 0xa0      // see: https://www.compart.com/en/unicode/U+1D60
+// Aegean Number Ten                => U+10110 => 0xf0 0x90 0x84 0x90 // see: https://www.compart.com/en/unicode/U+10110
+
+TEST(utf8_to_printable_ascii, demotest ) {
+    char resultant_ascii_s[128];
+
+    const char utf8_s[11] = {'P','h','i',' ','=',' ','\xcf','\x95','\0'};
+    const char* expected_ascii_s = "Phi = \\u03d5";
+    (void) utf8_to_printable_ascii( utf8_s, resultant_ascii_s, sizeof(resultant_ascii_s));
+    EXPECT_STREQ(expected_ascii_s, resultant_ascii_s);
+}
+
+TEST(utf8_to_printable_ascii, detect_corruption_1 ) {
+    char resultant_ascii_s[128];
+    /* The following string is deliberately corrupted with a spurious
+       continuation character (in corrupted_utf8_s[6]).*/
+    const char corrupted_utf8_s[11] = {'P','h','i',' ','=',' ','\x80','\x95','\0'};
+    Error_Message_Expected();
+    size_t size = utf8_to_printable_ascii( corrupted_utf8_s, resultant_ascii_s, sizeof(resultant_ascii_s));
+    EXPECT_EQ(0u, size);
+}
+
+TEST(utf8_to_printable_ascii, detect_corruption_2 ) {
+    char resultant_ascii_s[128];
+    /* The following string is deliberately corrupted: 0xcf is a header
+       for a two-byte sequence, it should be followed by a continuation
+       byte (most significant 2 bits are 10). 0x75 starts with 01 */
+    const char corrupted_utf8_s[11] = {'P','h','i',' ','=',' ','\xcf','\x75','\0'};
+    Error_Message_Expected();
+    size_t size = utf8_to_printable_ascii( corrupted_utf8_s, resultant_ascii_s, sizeof(resultant_ascii_s));
+    EXPECT_EQ(0u, size);
+}
+
+TEST(utf8_to_printable_ascii, insufficient_result_array_size ) {
+    /* The result array must be of sufficient size. Here it is not. */
+    char resultant_ascii_s[16];
+    const char* utf8_s = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)\n";
+    Error_Message_Expected();
+    size_t size = utf8_to_printable_ascii( utf8_s, resultant_ascii_s, sizeof(resultant_ascii_s));
+    EXPECT_EQ(0u, size);
+}
+
+// -------------------------------------------------------
+// Test suite for ascii_to_utf8()
+// -------------------------------------------------------
+
+TEST(ascii_to_utf8, null_input ) {
+    /* Should generate error message if input character pointer is NULL. */
+    char resultant_ascii_s[128];
+    char* null_ptr = (char*)0;
+    Error_Message_Expected();
+    size_t size = ascii_to_utf8( null_ptr, resultant_ascii_s, sizeof(resultant_ascii_s));
+    EXPECT_EQ(0u, size);
+}
+
+TEST(ascii_to_utf8, null_output ) {
+    /* Should generate error message if output character pointer is NULL. */
+    char* null_ptr = (char*)0;
+    const char* input = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)";
+
+    Error_Message_Expected();
+    size_t size = ascii_to_utf8( input, null_ptr, size_t(5));
+    EXPECT_EQ(0u, size);
+}
+
+TEST(ascii_to_utf8, normal_1) {
+    /* ascii_to_utf8() should un-escape all escaped ASCII and escaped unicode.
+     */
+    char actual_output[256];
+    const char* input = "e\\u2071\\u1d60 = cos(\\u03d5) + i*sin(\\u03d5)\\n";
+    const char* expected_output = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)\n";
+
+    size_t size = ascii_to_utf8(input, actual_output, sizeof(actual_output));
+    EXPECT_EQ(30u, size);
+    EXPECT_STREQ(expected_output, actual_output);
+}
+
+TEST(ascii_to_utf8, vertical_tab) {
+    /* "\v" should un-escape to a vertical tab (0x0b), not to a backspace (0x08). */
+    char actual_output[8];
+    size_t size = ascii_to_utf8("\\v", actual_output, sizeof(actual_output));
+    EXPECT_EQ(1u, size);
+    EXPECT_EQ('\v', actual_output[0]);
+}
+
+TEST(ascii_to_utf8, non_ascii_chars) {
+    char actual_output[256];
+    /* The input string should only contain ASCII characters, that is,
+       each element should have a value < 128. That isn't the case in the
+       following string. Therefore, an error message should be emitted.
+     */
+    const char* input = "eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
+
+    Error_Message_Expected();
+    size_t size = ascii_to_utf8(input, actual_output, sizeof(actual_output));
+    EXPECT_EQ(0u, size);
+}
+
+TEST(ascii_to_utf8, insufficient_hex_digits_1) {
+    /* The \U escape code expects exactly 8 hexadecimal digits to follow.
+       If fewer than 8 are present, then an error message should result.
+       Note: "\U10110" will fail in a C/C++ literal at compile time too,
+       because it is incomplete. It should be "\U00010110".
+     */
+    char actual_output[256];
+    const char* input = "Aegean Number Ten = \\U10110\n";
+
+    Error_Message_Expected();
+    size_t size = ascii_to_utf8(input, actual_output, sizeof(actual_output));
+    EXPECT_EQ(0u, size);
+}
+
+TEST(ascii_to_utf8, insufficient_hex_digits_2) {
+    /* The \u escape code expects exactly 4 hexadecimal digits to follow.
+       If fewer than 4 are present, then an error message should result.
+       Note: "\u3d5" will fail in a C/C++ literal at compile time too,
+       because it is incomplete. It should be "\u03d5".
+     */
+    char actual_output[256];
+    const char* input = "Phi = \\u3d5\n";
+
+    Error_Message_Expected();
+    size_t size = ascii_to_utf8(input, actual_output, sizeof(actual_output));
+    EXPECT_EQ(0u, size);
+}
+
+TEST(ascii_to_utf8, incomplete_escape_at_end) {
+    /* An escape sequence that is cut short by the end of the string is an error. */
+    char actual_output[32];
+    Error_Message_Expected();
+    size_t size = ascii_to_utf8("Phi = \\u03d", actual_output, sizeof(actual_output));
+    EXPECT_EQ(0u, size);
+}
+
+TEST(ascii_to_utf8, insufficient_result_array_size) {
+    /* The result array must be of sufficient size. If it isn't, then an error
+       message should be emitted.
+     */
+    char actual_output[16];
+    const char* input = "e\\u2071\\u1d60 = cos(\\u03d5) + i*sin(\\u03d5)\\n";
+
+    Error_Message_Expected();
+    size_t size = ascii_to_utf8(input, actual_output, sizeof(actual_output));
+    EXPECT_EQ(0u, size);
+}
+
+// -------------------------------------------------------
+// Test suite for utf8_to_wchar()
+// -------------------------------------------------------
+
+/* The following three tests demonstrate three different ways to
+   create the same input string. */
+
+TEST(utf8_to_wchar, test1) {
+    wchar_t resultant_wchar_s[128];
+    const char* input = "eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
+    const wchar_t* expected_wide_s = L"eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
+
+    (void) utf8_to_wchar(input, resultant_wchar_s, sizeof(resultant_wchar_s)/sizeof(wchar_t));
+    bool test_result = (wcscmp(expected_wide_s, resultant_wchar_s) == 0);
+    EXPECT_EQ(true, test_result);
+}
+
+TEST(utf8_to_wchar, test2) {
+    wchar_t resultant_wchar_s[128];
+    const char* input = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)";
+    const wchar_t* expected_wide_s = L"eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
+
+    (void) utf8_to_wchar(input, resultant_wchar_s, sizeof(resultant_wchar_s)/sizeof(wchar_t));
+    bool test_result = (wcscmp(expected_wide_s, resultant_wchar_s) == 0);
+    EXPECT_EQ(true, test_result);
+}
+
+TEST(utf8_to_wchar, test3) {
+    wchar_t resultant_wchar_s[128];
+    const char input[30] = {'e','\xe2','\x81','\xb1','\xe1', '\xb5','\xa0',' ','=',' ',
+                            'c','o','s','(','\xcf','\x95',')',' ','+',' ','i','*','s',
+                            'i','n','(','\xcf','\x95',')','\0'};
+    const wchar_t* expected_wide_s = L"eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
+
+    (void) utf8_to_wchar(input, resultant_wchar_s, sizeof(resultant_wchar_s)/sizeof(wchar_t));
+    bool test_result = (wcscmp(expected_wide_s, resultant_wchar_s) == 0);
+    EXPECT_EQ(true, test_result);
+}
+
+TEST(utf8_to_wchar, insufficient_result_array_size) {
+    wchar_t resultant_wchar_s[16];
+    const char* input = "eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
+
+    Error_Message_Expected();
+    size_t size = utf8_to_wchar(input, resultant_wchar_s, sizeof(resultant_wchar_s)/sizeof(wchar_t));
+    EXPECT_EQ(0u, size);
+}
+
+TEST(utf8_to_wchar, corrupted_input) {
+    wchar_t resultant_wchar_s[128];
+    char input[30] = {'e','\xe2','\x81','\xb1','\xe1', '\xb5','\xa0',' ','=',' ',
+                      'c','o','s','(','\xcf','\x95',')',' ','+',' ','i','*','s',
+                      'i','n','(','\xcf','\x95',')','\0'};
+
+    /* Deliberately corrupt input by changing input[2] to not being a continuation byte. */
+    input[2] = 0x70;
+
+    Error_Message_Expected();
+    size_t size = utf8_to_wchar(input, resultant_wchar_s, sizeof(resultant_wchar_s)/sizeof(wchar_t));
+    EXPECT_EQ(0u, size);
+}
+
+// -------------------------------------------------------
+// Test suite for wchar_to_utf8()
+// -------------------------------------------------------
+TEST(wchar_to_utf8, test1) {
+
+    char resultant_utf8_s[128];
+    const wchar_t* wide_s = L"eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
+    const char* expected_utf8_s = "eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
+
+    wchar_to_utf8(wide_s, resultant_utf8_s, sizeof(resultant_utf8_s)/sizeof(char));
+    bool test_result = (strcmp(expected_utf8_s, resultant_utf8_s) == 0);
+    EXPECT_EQ(true, test_result);
+}
+
+TEST(wchar_to_utf8, insufficient_result_array_size) {
+
+    char resultant_utf8_s[16];
+    const wchar_t* wide_s = L"eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
+
+    Error_Message_Expected();
+    size_t size = wchar_to_utf8(wide_s, resultant_utf8_s, sizeof(resultant_utf8_s)/sizeof(char));
+    EXPECT_EQ(0u, size);
+}