From a29045005c18e7c751694d33daef215638f10e22 Mon Sep 17 00:00:00 2001
From: "John M. Penn" <john.m.penn@nasa.gov>
Date: Thu, 25 Oct 2018 15:14:52 -0500
Subject: [PATCH] Create a suite of unicode utilities.

---
 Makefile                                      |   3 +-
 include/trick/unicode_utils.h                 |  51 ++
 trick_source/trick_utils/unicode/Makefile     |   5 +
 .../trick_utils/unicode/src/unicode_utils.c   | 437 ++++++++++++++++++
 .../trick_utils/unicode/test/Makefile         |  41 ++
 .../unicode/test/unicode_utils_test.cpp       | 406 ++++++++++++++++
 6 files changed, 942 insertions(+), 1 deletion(-)
 create mode 100644 include/trick/unicode_utils.h
 create mode 100644 trick_source/trick_utils/unicode/Makefile
 create mode 100644 trick_source/trick_utils/unicode/src/unicode_utils.c
 create mode 100644 trick_source/trick_utils/unicode/test/Makefile
 create mode 100644 trick_source/trick_utils/unicode/test/unicode_utils_test.cpp

diff --git a/Makefile b/Makefile
index ba43b81c..6b0192b5 100644
--- a/Makefile
+++ b/Makefile
@@ -96,7 +96,8 @@ UTILS_DIRS := \
 	${TRICK_HOME}/trick_source/trick_utils/comm \
 	${TRICK_HOME}/trick_source/trick_utils/shm \
 	${TRICK_HOME}/trick_source/trick_utils/math \
-	${TRICK_HOME}/trick_source/trick_utils/units
+	${TRICK_HOME}/trick_source/trick_utils/units \
+	${TRICK_HOME}/trick_source/trick_utils/unicode
 UTILS_OBJS := $(addsuffix /object_$(TRICK_HOST_CPU)/*.o ,$(UTILS_DIRS))
 
 # filter out the directories that make their own libraries
diff --git a/include/trick/unicode_utils.h b/include/trick/unicode_utils.h
new file mode 100644
index 00000000..1d966e5d
--- /dev/null
+++ b/include/trick/unicode_utils.h
@@ -0,0 +1,51 @@
+#ifndef UNICODE_UTILS_H
+#define UNICODE_UTILS_H
+#include <stddef.h>  /* size_t, wchar_t */
+#include <stdint.h>  /* int16_t, int32_t */
+
+/* Maintainer: John M. Penn */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Convert Unicode codepoint to UTF-32. Validates that it's a legal unicode value.
+   Returns 1, if successful, 0 otherwise. */
+size_t ucodepoint_to_utf32(unsigned int codePoint, int32_t *out);
+
+/* Convert Unicode codepoint to UTF-16.
+   Returns the number of UTF-16 elements (1..2) necessary to represent the codepoint,
+   or 0 on failure.
+ */
+size_t ucodepoint_to_utf16(unsigned int codePoint, int16_t (*out)[2]);
+
+/* Convert Unicode codepoint to UTF-8.
+   Returns the number of UTF-8 elements (1..4) necessary to represent the codepoint,
+   or 0 on failure.
+ */
+size_t ucodepoint_to_utf8(unsigned int codePoint, char (*out)[4]);
+
+/* In the string conversions below, outSize is the capacity of (out), terminator included. */
+
+/* Un-escape C escape sequences, including \u and \U Unicode escape sequences,
+   in an ASCII character array, producing a NUL-terminated UTF-8 character array.
+   Return the number of elements in the character string, or 0 on error.
+*/
+size_t ascii_to_utf8(const char *in, char *out, size_t outSize);
+
+/* Escape ('\' escape codes) all Unicode and non-printable ASCII characters in a
+   UTF-8 character string. Return the number of elements in the character string,
+   or 0 on error. */
+size_t utf8_to_printable_ascii(const char *in, char *out, size_t outSize);
+
+/* Convert a UTF-8 character array to a wchar_t array. Supports 16- and 32-bit wchar_t.
+   Return the number of elements in the wchar_t string, or 0 on error. */
+size_t utf8_to_wchar(const char *in, wchar_t *out, size_t outSize);
+
+/* Convert a wchar_t character array to UTF-8. Return the number of elements in
+   the character (UTF-8) string, or 0 on error. */
+size_t wchar_to_utf8(const wchar_t *in, char *out, size_t outSize);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/trick_source/trick_utils/unicode/Makefile b/trick_source/trick_utils/unicode/Makefile
new file mode 100644
index 00000000..a181cedf
--- /dev/null
+++ b/trick_source/trick_utils/unicode/Makefile
@@ -0,0 +1,5 @@
+
+include ${TRICK_HOME}/share/trick/makefiles/Makefile.common
+include ${TRICK_HOME}/share/trick/makefiles/Makefile.tricklib
+-include Makefile_deps
+
diff --git a/trick_source/trick_utils/unicode/src/unicode_utils.c b/trick_source/trick_utils/unicode/src/unicode_utils.c
new file mode 100644
index 00000000..e2229bef
--- /dev/null
+++ b/trick_source/trick_utils/unicode/src/unicode_utils.c
@@ -0,0 +1,437 @@
+#include <string.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <wchar.h>
+#include <ctype.h>
+#include <stdint.h>
+#include "trick/unicode_utils.h"
+
+/* Maintainer: John M. Penn */
+
+size_t ucodepoint_to_utf32(unsigned int codePoint, int32_t *out) {
+    int isSurrogate = (codePoint >= 0xd800 && codePoint <= 0xdfff); /* Reserved for UTF-16. */
+    if (isSurrogate || codePoint > 0x10ffff) { /* Not a legal code point: report it and fail. */
+        fprintf(stderr, isSurrogate ? "%s:ERROR: 0x%08x is reserved for UTF-16, as a surrogate codepoint.\n"
+                                    : "%s:ERROR: Invalid Unicode value (too big): 0x%04x.\n", __FUNCTION__, codePoint);
+        return 0;                 /* *out is left untouched. */
+    }
+    *out = (int32_t)codePoint;    /* UTF-32 stores the code point value itself. */
+    return 1;
+}
+
+size_t ucodepoint_to_utf16(unsigned int codePoint, int16_t (*out)[2]) {
+    /* Writes 1 or 2 UTF-16 elements to *out and returns how many; returns 0 on error. */
+    if (codePoint > 0x10ffff) {
+        fprintf(stderr,"%s:ERROR: Invalid Unicode value (too big): 0x%04x.\n", __FUNCTION__, codePoint);
+        return 0;
+    } else if (codePoint > 0xffff) {
+        /* High surrogates are U+D800–U+DBFF; low surrogates are U+DC00–U+DFFF.
+         * A surrogate pair carries the 20-bit offset (codePoint - 0x10000): the high
+         * surrogate holds its upper 10 bits, the low surrogate its lower 10 bits.
+         */
+        unsigned int offset = codePoint - 0x10000;
+        (*out)[0] = (int16_t)(0xd800 + (offset >> 10));    /* Create High Surrogate */
+        (*out)[1] = (int16_t)(0xdc00 + (offset & 0x03ff)); /* Create Low Surrogate */
+        return 2;
+    } else if (codePoint < 0xd800 || codePoint >= 0xe000) { /* Not Surrogate */
+        (*out)[0] = (int16_t)(codePoint);
+        return 1;
+    } else {
+        fprintf(stderr,"%s:ERROR: Invalid Unicode value (surrogate): 0x%04x.\n", __FUNCTION__, codePoint);
+    }
+    return 0;
+}
+
+size_t ucodepoint_to_utf8(unsigned int codePoint, char (*out)[4]) {
+    /* Writes the UTF-8 encoding of codePoint to *out (no terminator) and returns its length,
+       1..4. Returns 0, with a message on stderr, if codePoint is not a legal Unicode value. */
+    if (codePoint > 0x10ffff || (codePoint >= 0xd800 && codePoint <= 0xdfff)) {
+        fprintf(stderr,"%s:ERROR: Invalid Unicode value: 0x%04x.\n", __FUNCTION__, codePoint);
+        return 0;
+    } else if (codePoint <= 0x7f) { /* ASCII */
+        (*out)[0] = (char)codePoint;                           /* 0xxxxxxx 0x00..0x7F*/
+        return 1;
+    } else if (codePoint <= 0x7ff) { /* Two-byte Sequence */
+        (*out)[0] = (char)(0xc0 | ((codePoint >> 6) & 0x1f));  /* 110xxxxx 0xC0..0xDF*/
+        (*out)[1] = (char)(0x80 | (codePoint & 0x3f));         /* 10xxxxxx */
+        return 2;
+    } else if (codePoint <= 0xffff) { /* Three byte Sequence */
+        (*out)[0] = (char)(0xe0 | ((codePoint >> 12) & 0x0f)); /* 1110xxxx 0xE0..0xEF*/
+        (*out)[1] = (char)(0x80 | ((codePoint >> 6)  & 0x3f)); /* 10xxxxxx */
+        (*out)[2] = (char)(0x80 | (codePoint & 0x3f));         /* 10xxxxxx */
+        return 3;
+    }
+    /* Four-byte Sequence */
+    (*out)[0] = (char)(0xf0 | ((codePoint >> 18) & 0x07));     /* 11110xxx 0xF0..0xF7*/
+    (*out)[1] = (char)(0x80 | ((codePoint >> 12) & 0x3f));     /* 10xxxxxx */
+    (*out)[2] = (char)(0x80 | ((codePoint >> 6)  & 0x3f));     /* 10xxxxxx */
+    (*out)[3] = (char)(0x80 | (codePoint & 0x3f));             /* 10xxxxxx 0x80..0xBF */
+    return 4;
+}
+
+size_t utf8_to_printable_ascii(const char *in, char *out, size_t outSize) {
+    /* Copies the UTF-8 string (in) to (out) as printable ASCII:
+     *   - printable ASCII characters are copied unchanged;
+     *   - the characters with a named C escape (a b f n r t v) are written as that escape;
+     *   - any other ASCII character is written as a two-digit hexadecimal (x) escape;
+     *   - each multi-byte sequence is decoded and written as a four-digit (u) escape,
+     *     or as an eight-digit (U) escape when the code point is above 0xffff.
+     *
+     * outSize is the capacity of (out) in chars, terminating NUL included. A result
+     * of exactly outSize-1 characters is accepted.
+     *
+     * Returns the length of the result. On any error (NULL argument, malformed or
+     * truncated UTF-8, result does not fit) returns 0 and, unless (out) is NULL or
+     * outSize is 0, leaves (out) as an empty string.
+     * Every error except a truncated final sequence also prints a message on stderr.
+     */
+    int pending = 0;            /* Continuation bytes still owed by the current sequence. */
+    unsigned int codePoint = 0; /* Code point being assembled from a multi-byte sequence. */
+    size_t len = 0;             /* Length of (out) so far; avoids re-scanning it with strlen. */
+    char wks[11];               /* One escaped piece; the longest is 10 characters plus NUL. */
+
+    if (out == NULL) {
+        fprintf(stderr,"%s:ERROR: ASCII char pointer (out) is NULL. No conversion performed.\n", __FUNCTION__);
+        return 0;
+    }
+    if (outSize == 0) { /* No room for a terminator, and (outSize - len) below must not wrap. */
+        fprintf(stderr,"%s:ERROR: Insufficient room in (out) array.\n", __FUNCTION__);
+        return 0;
+    }
+    out[0] = 0;
+
+    if (in == NULL) {
+        fprintf(stderr,"%s:ERROR: UTF8 char-pointer (in) is NULL. No conversion performed.\n", __FUNCTION__);
+        return 0;
+    }
+
+    /* Each input byte either extends a pending multi-byte sequence (continue) or
+       completes one piece of output in wks, which is appended at the bottom of the loop. */
+    for ( ; *in != 0; in++) {
+        unsigned char ch = (unsigned char)*in;
+        size_t pieceLen;
+
+        if (pending > 0) {
+            /* Inside a multi-byte sequence: only a continuation byte (10xxxxxx) is legal. */
+            if ((ch & 0xc0) != 0x80) {
+                fprintf(stderr,"%s:ERROR: UTF8 string appears to be corrupted.\n", __FUNCTION__);
+                out[0] = 0;
+                return 0;
+            }
+            codePoint = (codePoint << 6) | (ch & 0x3f); /* Extract low 6 bits */
+            pending--;
+            if (pending > 0) {
+                continue;
+            }
+            /* The sequence is complete: escape the code point. */
+            if (codePoint <= 0xffff) {
+                snprintf(wks, sizeof(wks), "\\u%04x", codePoint);
+            } else {
+                snprintf(wks, sizeof(wks), "\\U%08x", codePoint);
+            }
+        } else if (ch >= 0xf0) {   /* Start of a 4-byte sequence. */
+            codePoint = ch & 0x07; /* Extract low 3 bits */
+            pending = 3;
+            continue;
+        } else if (ch >= 0xe0) {   /* Start of a 3-byte sequence. */
+            codePoint = ch & 0x0f; /* Extract low 4 bits */
+            pending = 2;
+            continue;
+        } else if (ch >= 0xc0) {   /* Start of a 2-byte sequence. */
+            codePoint = ch & 0x1f; /* Extract low 5 bits */
+            pending = 1;
+            continue;
+        } else if (ch >= 0x80) {   /* We should never find a continuation byte in isolation. */
+            fprintf(stderr,"%s:ERROR: UTF8 string (in) appears to be corrupted.\n", __FUNCTION__);
+            out[0] = 0;
+            return 0;
+        } else {                   /* ASCII */
+            /* Characters with a named C escape come first. */
+            switch (ch) {
+                case '\a': strcpy(wks, "\\a"); break;
+                case '\b': strcpy(wks, "\\b"); break;
+                case '\f': strcpy(wks, "\\f"); break;
+                case '\n': strcpy(wks, "\\n"); break;
+                case '\r': strcpy(wks, "\\r"); break;
+                case '\t': strcpy(wks, "\\t"); break;
+                case '\v': strcpy(wks, "\\v"); break;
+                default:
+                    /* Everything else: itself when printable, a hexadecimal escape otherwise. */
+                    if (isprint(ch)) {
+                        wks[0] = (char)ch;
+                        wks[1] = 0;
+                    } else {
+                        snprintf(wks, sizeof(wks), "\\x%02x", ch);
+                    }
+                    break;
+            }
+        }
+
+        /* Append the piece only if it and the terminating NUL both fit. */
+        pieceLen = strlen(wks);
+        if (pieceLen >= outSize - len) {
+            fprintf(stderr,"%s:ERROR: Insufficient room in (out) array.\n", __FUNCTION__);
+            out[0] = 0;
+            return 0;
+        }
+        memcpy(&out[len], wks, pieceLen + 1);
+        len += pieceLen;
+    }
+
+    /* Ending with continuation bytes still owed means (in) stops in the middle of
+       a multi-byte sequence; like the other errors, that yields an empty result. */
+    if (pending != 0) {
+        out[0] = 0;
+        return 0;
+    }
+
+    return len;
+}
+
+/* Un-escapes ASCII and Unicode escape sequences, and encodes them into UTF-8.
+ * Recognized: \' \" \? \\ \a \b \f \n \r \t \v, and \xhh, \uhhhh, \Uhhhhhhhh with exactly
+ * 2, 4 and 8 hexadecimal digits. (in) must be pure ASCII. outSize is the capacity of (out),
+ * terminating NUL included. Returns the number of chars stored before the terminator.
+ * On any error (NULL argument, non-ASCII input, unknown or unfinished escape, result does
+ * not fit) a message goes to stderr, (out) becomes the empty string and 0 is returned. */
+size_t ascii_to_utf8(const char *in, char *out, size_t outSize) {
+    unsigned int codePoint = 0;
+    size_t len = 0;
+    int state = 0;
+    int digitsExpected = 0;
+
+    if (out == NULL || outSize == 0) { /* Without room for a terminator nothing can be stored. */
+        fprintf(stderr,"%s:ERROR: char pointer (out) is NULL or outSize is 0. No conversion performed.\n", __FUNCTION__);
+        return 0;
+    }
+    out[0] = 0;
+    if (in == NULL) {
+        fprintf(stderr,"%s:ERROR: UTF8 char-pointer (in) is NULL. No conversion performed.\n", __FUNCTION__);
+        return 0;
+    }
+    while (*in != 0) {
+        unsigned char ch = *in;
+        if (ch > 0x7f) { /* All input characters must be ASCII. */
+            fprintf(stderr,"%s:ERROR: ASCII string (in) contains non-ASCII values.\n", __FUNCTION__);
+            out[0] = 0;
+            return 0;
+        }
+        /* Any further input yields more output or an error, so stop once only the terminator fits. */
+        if (len + 1 >= outSize) {
+            fprintf(stderr,"%s:ERROR: Insufficient room in char array (out).\n", __FUNCTION__);
+            out[0] = 0;
+            return 0;
+        }
+        switch(state) {
+            case 0: { // Normal State
+                if (ch =='\\') {
+                    state = 1;
+                } else {
+                    out[len++] = ch;
+                }
+            } break;
+            case 1: { // Escaped State ( that is: we've found a '\' character.)
+                switch(ch) {
+                    case '\'': case '\"': case '\?':
+                    case '\\': { out[len++] = ch;   state = 0; } break;
+                    case 'a': { out[len++] = '\a';  state = 0; } break;
+                    case 'b': { out[len++] = '\b';  state = 0; } break;
+                    case 'f': { out[len++] = '\f';  state = 0; } break;
+                    case 'n': { out[len++] = '\n';  state = 0; } break;
+                    case 'r': { out[len++] = '\r';  state = 0; } break;
+                    case 't': { out[len++] = '\t';  state = 0; } break;
+                    case 'v': { out[len++] = '\v';  state = 0; } break;
+                    case 'x': { digitsExpected = 2; state = 2; } break;
+                    case 'u': { digitsExpected = 4; state = 2; } break;
+                    case 'U': { digitsExpected = 8; state = 2; } break;
+                    default : { /* An unknown escape is an error rather than being silently dropped. */
+                        fprintf(stderr,"%s:ERROR: Unknown escape sequence \\%c in char string (in).\n", __FUNCTION__, ch);
+                        out[0] = 0;
+                        return 0;
+                    }
+                } // switch ch
+            } break;
+            default: { // State 2, Escaped Unicode ( that is: we've found '\x', '\u' or '\U'.)
+                 if (!isxdigit(ch)) {
+                     fprintf(stderr,"%s:ERROR: Insufficient hexidecimal digits following"
+                                    " \\x, \\u, or \\U escape code in char string (in).\n", __FUNCTION__);
+                     out[0] = 0;
+                     return 0;
+                 }
+                 /* ch is an ASCII hex digit here, so (ch | 0x20) folds 'A'..'F' onto 'a'..'f'. */
+                 codePoint = codePoint * 16 + (isdigit(ch) ? ch - '0' : (ch | 0x20) - 'a' + 10);
+                 digitsExpected -- ;
+                 if ( digitsExpected == 0 ) {
+                    char temp[4];
+                    size_t count = ucodepoint_to_utf8(codePoint, &temp);
+                    if (count == 0 || count >= (outSize-len)) { /* Refused code point, or no room. */
+                        fprintf(stderr, count ? "%s:ERROR: Insufficient room in char array (out).\n"
+                                : "%s:ERROR: Invalid Unicode value in escape code in char string (in).\n", __FUNCTION__);
+                        out[0] = 0;
+                        return 0;
+                    }
+                    memcpy( &out[len], temp, sizeof(char) * count );
+                    len += count;
+                    state = 0;
+                    codePoint = 0;
+                 }
+            } break;
+        }
+        in ++;
+    }
+    if (state != 0) { /* The input ended inside an escape sequence. */
+        fprintf(stderr,"%s:ERROR: Incomplete escape sequence at end of char string (in).\n", __FUNCTION__);
+        out[0] = 0;
+        return 0;
+    }
+    out[len] = 0; /* NULL termination of string. */
+    return len;
+}
+
+size_t utf8_to_wchar(const char *in, wchar_t *out, size_t outSize) {
+    /* Decodes the UTF-8 string (in) and stores it in (out) as UTF-32 when wchar_t is
+     * 4 bytes wide, or as UTF-16 (surrogate pairs above U+FFFF) when it is 2 bytes wide.
+     * outSize is the capacity of (out) in wchar_t elements, terminating 0 included.
+     *
+     * Returns the number of elements stored before the terminating 0. On any error
+     * (NULL argument, malformed or truncated UTF-8, code point refused by
+     * ucodepoint_to_utf32() / ucodepoint_to_utf16(), result does not fit) returns 0
+     * and, unless (out) is NULL or outSize is 0, leaves (out) as an empty string.
+     *
+     * Every error except a truncated final sequence also prints a message on stderr.
+     */
+    unsigned int codePoint = 0;
+    size_t len = 0;
+    int pending = 0; /* Continuation bytes still owed by the current sequence. */
+
+    if (out == NULL || outSize == 0) {
+        fprintf(stderr,"%s:ERROR: wchar_t pointer (out) is NULL or outSize is 0. No conversion performed.\n", __FUNCTION__);
+        return 0;
+    }
+    out[0] = 0;
+
+    if (in == NULL) {
+        fprintf(stderr,"%s:ERROR: UTF8 char-pointer (in) is NULL. No conversion performed.\n", __FUNCTION__);
+        return 0;
+    }
+
+    /* Each byte either extends a pending multi-byte sequence (continue) or completes
+       a code point, which is then encoded and stored at the bottom of the loop. */
+    for ( ; *in != 0; in++) {
+        unsigned char ch = (unsigned char)*in;
+        wchar_t units[2] = {0, 0}; /* Encoded form of one code point. */
+        size_t count = 0;          /* Number of elements of units[] in use. */
+
+        if (pending > 0) {
+            /* Inside a multi-byte sequence: only a continuation byte (10xxxxxx) is legal. */
+            if ((ch & 0xc0) != 0x80) {
+                fprintf(stderr,"%s:ERROR: UTF8 string appears to be corrupted.\n", __FUNCTION__);
+                out[0] = 0;
+                return 0;
+            }
+            codePoint = (codePoint << 6) | (ch & 0x3f); // Extract lower 6 bits
+            pending--;
+            if (pending > 0) {
+                continue;
+            }
+        } else if (ch >= 0xf0) {   // Start of a 4-byte sequence.
+            codePoint = ch & 0x07; // Extract low 3 bits
+            pending = 3;
+            continue;
+        } else if (ch >= 0xe0) {   // Start of a 3-byte sequence.
+            codePoint = ch & 0x0f; // Extract low 4 bits
+            pending = 2;
+            continue;
+        } else if (ch >= 0xc0) {   // Start of a 2-byte sequence.
+            codePoint = ch & 0x1f; // Extract low 5 bits
+            pending = 1;
+            continue;
+        } else if (ch >= 0x80) {   // We should never find a continuation byte in isolation.
+            fprintf(stderr,"%s:ERROR: UTF8 string (in) appears to be corrupted.\n", __FUNCTION__);
+            out[0] = 0;
+            return 0;
+        } else {
+            codePoint = ch;        // ASCII
+        }
+
+        /* codePoint is complete: encode it for this platform's wchar_t. */
+        if (sizeof(wchar_t) == 4) { // wchar_t is UTF-32
+            int32_t temp = 0;
+            count = ucodepoint_to_utf32(codePoint, &temp);
+            units[0] = (wchar_t)temp;
+        } else if (sizeof(wchar_t) == 2) { // wchar_t is UTF-16
+            int16_t temp[2] = {0, 0};
+            count = ucodepoint_to_utf16(codePoint, &temp);
+            /* Converting int16_t to a 16-bit wchar_t keeps the bit pattern. */
+            units[0] = (wchar_t)temp[0];
+            units[1] = (wchar_t)temp[1];
+        } else {
+            fprintf(stderr,"%s:ERROR: Unsupported wchar_t size.\n", __FUNCTION__);
+            out[0] = 0;
+            return 0;
+        }
+
+        /* A count of 0 means the converter refused the code point (and reported why).
+           Both the UTF-32 and the UTF-16 path treat that as fatal. */
+        if (count == 0) {
+            out[0] = 0;
+            return 0;
+        }
+
+        /* Store only if the elements and the terminating 0 both fit. */
+        if (count >= (outSize-len)) {
+            fprintf(stderr,"%s:ERROR: Insufficient room in wchar_t array (out).\n", __FUNCTION__);
+            out[0] = 0;
+            return 0;
+        }
+        memcpy( &out[len], units, sizeof(wchar_t) * count );
+        len += count;
+    }
+
+    /* Ending with continuation bytes still owed means (in) stops in the middle
+       of a multi-byte sequence. */
+    if (pending != 0) {
+        out[0] = 0;
+        return 0;
+    }
+
+    out[len] = 0;
+    return len;
+}
+
+size_t wchar_to_utf8(const wchar_t *in, char *out, size_t outSize ) {
+    /* Encodes (in) as UTF-8 in (out), joining UTF-16 surrogate pairs, and returns the length. Errors
+       (NULL pointer, illegal value, unpaired surrogate, (out) too small) print a message and return 0. */
+    unsigned int high = 0; /* High surrogate still waiting for its low half, else 0. */
+    size_t len = 0;
+    if (in == NULL || out == NULL || outSize == 0) {
+        fprintf(stderr,"%s:ERROR: NULL pointer or outSize of 0. No conversion performed.\n", __FUNCTION__);
+        return 0;
+    }
+    for ( ; *in != 0 || high != 0; in++) { /* Extra pass: a high half right before the terminator. */
+        unsigned long unit = (unsigned long)*in; /* A negative wchar_t becomes huge: rejected below. */
+        int isHigh = (unit >= 0xd800 && unit <= 0xdbff);
+        int isLow  = (unit >= 0xdc00 && unit <= 0xdfff);
+        if (unit > 0x10ffff || (high != 0) != isLow) { /* A low half must follow a high half, and only then. */
+            fprintf(stderr,"%s:ERROR: Invalid Unicode value.\n", __FUNCTION__);
+            out[0] = 0;
+            return 0;
+        } else if (isHigh) {
+            high = (unsigned int)unit;
+            continue;
+        }
+        unsigned int codePoint = isLow ? 0x10000 + ((high - 0xd800) << 10) + (unsigned int)(unit - 0xdc00) : (unsigned int)unit;
+        high = 0;
+        char temp[4];
+        size_t count = ucodepoint_to_utf8(codePoint, &temp);
+        if (count >= (outSize-len)) {
+            fprintf(stderr,"%s:ERROR: Insufficient room in char array (out).\n", __FUNCTION__);
+            out[0] = 0;
+            return 0;
+        }
+        memcpy( &out[len], temp, sizeof(char) * count );
+        len += count;
+    }
+    out[len] = 0; /* NULL termination of string. */
+    return len;
+}
diff --git a/trick_source/trick_utils/unicode/test/Makefile b/trick_source/trick_utils/unicode/test/Makefile
new file mode 100644
index 00000000..a517ce5a
--- /dev/null
+++ b/trick_source/trick_utils/unicode/test/Makefile
@@ -0,0 +1,41 @@
+
+#SYNOPSIS:
+#
+#   make [all]  - makes everything.
+#   make TARGET - makes the given target.
+#   make clean  - removes all files generated by make.
+
+include ${TRICK_HOME}/share/trick/makefiles/Makefile.common
+
+# Flags passed to the preprocessor.
+TRICK_CPPFLAGS += -I$(GTEST_HOME)/include -I$(TRICK_HOME)/include -g -Wall -Wextra -DGTEST_HAS_TR1_TUPLE=0
+
+TRICK_LIBS = ${TRICK_LIB_DIR}/libtrick.a
+TRICK_EXEC_LINK_LIBS += -L${GTEST_HOME}/lib64 -L${GTEST_HOME}/lib -lgtest -lgtest_main -lpthread
+
+# Added for Ubuntu... not required for other systems.
+TRICK_EXEC_LINK_LIBS += -lpthread
+
+# All tests produced by this Makefile.  Remember to add new tests you
+# created to the list.
+TESTS = unicode_utils_test
+
+OTHER_OBJECTS =
+
+# House-keeping build targets.
+
+all : $(TESTS)
+
+test: $(TESTS)
+	./unicode_utils_test --gtest_output=xml:${TRICK_HOME}/trick_test/Unicode_utils.xml
+
+clean :
+	rm -f $(TESTS) *.o
+	rm -rf io_src xml
+
+unicode_utils_test.o : unicode_utils_test.cpp
+	$(TRICK_CPPC) $(TRICK_CPPFLAGS) -c $<
+
+unicode_utils_test : unicode_utils_test.o
+	$(TRICK_CPPC) $(TRICK_CPPFLAGS) -o $@ $^ $(OTHER_OBJECTS) -L${TRICK_HOME}/lib_${TRICK_HOST_CPU} $(TRICK_LIBS) $(TRICK_EXEC_LINK_LIBS)
+
diff --git a/trick_source/trick_utils/unicode/test/unicode_utils_test.cpp b/trick_source/trick_utils/unicode/test/unicode_utils_test.cpp
new file mode 100644
index 00000000..a2875e86
--- /dev/null
+++ b/trick_source/trick_utils/unicode/test/unicode_utils_test.cpp
@@ -0,0 +1,406 @@
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+#include <wchar.h>
+#include <gtest/gtest.h>
+#include "trick/unicode_utils.h"
+
+const char* ISO_6429_Restore_Default   = "\x1b[00m"; // Each string is ESC '[' <number> 'm'.
+const char* ISO_6429_Bold              = "\x1b[01m";
+const char* ISO_6429_Underline         = "\x1b[04m";
+const char* ISO_6429_Black_Foreground  = "\x1b[30m"; // Foreground colors use numbers 30..37.
+const char* ISO_6429_Red_Foreground    = "\x1b[31m";
+const char* ISO_6429_Green_Foreground  = "\x1b[32m";
+const char* ISO_6429_Yellow_Foreground = "\x1b[33m";
+const char* ISO_6429_Blue_Foreground   = "\x1b[34m";
+const char* ISO_6429_Purple_Foreground = "\x1b[35m";
+const char* ISO_6429_Cyan_Foreground   = "\x1b[36m";
+const char* ISO_6429_White_Foreground  = "\x1b[37m";
+const char* ISO_6429_Black_Background  = "\x1b[40m"; // Background colors use numbers 40..47.
+const char* ISO_6429_Red_Background    = "\x1b[41m";
+const char* ISO_6429_Green_Background  = "\x1b[42m";
+const char* ISO_6429_Yellow_Background = "\x1b[43m";
+const char* ISO_6429_Blue_Background   = "\x1b[44m";
+const char* ISO_6429_Purple_Background = "\x1b[45m";
+const char* ISO_6429_Cyan_Background   = "\x1b[46m";
+const char* ISO_6429_White_Background  = "\x1b[47m";
+
+void Error_Message_Expected() {
+    // Announce, underlined in white on blue, that the error output which follows is intended.
+    printf("%s%s%s%s%s\n", ISO_6429_Blue_Background, ISO_6429_White_Foreground, ISO_6429_Underline,
+           "An error message is expected from this test.", ISO_6429_Restore_Default );
+}
+
+// -------------------------------------------------------
+// Test suite for ucodepoint_to_utf32()
+// -------------------------------------------------------
+TEST(ucodepoint_to_utf32, valid ) {
+    /* Aegean Number Ten, U+10110 is a valid code point; UTF-32 stores its value unchanged. */
+    int32_t out = 0;
+    EXPECT_EQ((size_t)1, ucodepoint_to_utf32(0x10110, &out));
+    EXPECT_EQ(0x10110, out);
+}
+
+TEST(ucodepoint_to_utf32, invalid ) {
+    /* The values in the range [d800 .. dfff] are reserved for UTF-16
+       surrogates and are not valid unicode codepoints. 0xdead lies in
+       that range, so converting it as a codepoint must fail, and we
+       should get an error message.
+     */
+    Error_Message_Expected();
+    int32_t unused;
+    const size_t result = ucodepoint_to_utf32(0xdead, &unused);
+    EXPECT_EQ(0, result);
+}
+
+// -------------------------------------------------------
+// Test suite for ucodepoint_to_utf16()
+// -------------------------------------------------------
+TEST(ucodepoint_to_utf16, two_16bit_element_sequence ) {
+    /* Note that unicode is a 21-bit encoding.
+       Because Aegean Number Ten (U+10110) is larger than can be stored in 16-bits,
+       UTF-16 requires two 16-bit values, called surrogates, to encode it.
+       The surrogates carry the 20-bit offset (codepoint - 0x10000), here 0x00110.
+       High-surrogate = 0xd800 + most significant 10 bits of the offset  = 0xd800.
+       Low-surrogate  = 0xdc00 + least significant 10 bits of the offset = 0xdd10.
+       This is the pairing that wchar_to_utf8() undoes when it decodes a surrogate pair.
+    */
+    int16_t out[2];
+    size_t size = ucodepoint_to_utf16(0x10110, &out);
+    EXPECT_EQ(2, size);
+    EXPECT_EQ((int16_t)0xd800, out[0]);
+    EXPECT_EQ((int16_t)0xdd10, out[1]);
+}
+
+TEST(ucodepoint_to_utf16, one_16bit_element) {
+    /* 0x03d5 is a valid codepoint that fits within 16 bits, so its UTF-16
+       form is a single element equal to the codepoint value itself. */
+    int16_t units[2];
+    const size_t count = ucodepoint_to_utf16(0x03d5, &units);
+    EXPECT_EQ(1, count);
+    EXPECT_EQ((int16_t)0x03d5, units[0]);
+}
+
+TEST(ucodepoint_to_utf16, invalid_surrogate ) {
+    /* 0xdead is a surrogate value, which is not a legal input codepoint. */
+    Error_Message_Expected();
+    int16_t units[2];
+    const size_t count = ucodepoint_to_utf16(0xdead, &units);
+    EXPECT_EQ(0, count);
+}
+
+TEST(ucodepoint_to_utf16, codepoint_too_big ) {
+    /* 0x10ffff is the largest valid unicode codepoint, so 0x110000 must be rejected. */
+    Error_Message_Expected();
+    int16_t units[2];
+    const size_t count = ucodepoint_to_utf16(0x110000, &units);
+    EXPECT_EQ(0, count);
+}
+
+// -------------------------------------------------------
+// Test suite for ucodepoint_to_utf8()
+// -------------------------------------------------------
+TEST(ucodepoint_to_utf8, four_8bit_element_sequence ) {
+    char out[4];
+    /* Aegean Number Ten, U+10110 requires four bytes in utf-8: 0xf0 0x90 0x84 0x90. */
+    size_t size = ucodepoint_to_utf8(0x10110, &out);
+    EXPECT_EQ(4, size);
+    EXPECT_EQ(0, memcmp(out, "\xf0\x90\x84\x90", 4));
+}
+
+TEST(ucodepoint_to_utf8, three_8bit_element_sequence ) {
+    char out[4];
+    /* Superscript Latin Small Letter I, U+2071 requires three bytes in utf-8: 0xe2 0x81 0xb1. */
+    size_t size = ucodepoint_to_utf8(0x2071, &out);
+    EXPECT_EQ(3, size);
+    EXPECT_EQ(0, memcmp(out, "\xe2\x81\xb1", 3));
+}
+
+TEST(ucodepoint_to_utf8, two_8bit_element_sequence ) {
+    char out[4];
+    /* Greek Phi Symbol, U+03d5 requires two bytes in utf-8: 0xcf 0x95. */
+    size_t size = ucodepoint_to_utf8(0x03d5, &out);
+    EXPECT_EQ(2, size);
+    EXPECT_EQ(0, memcmp(out, "\xcf\x95", 2));
+}
+
+TEST(ucodepoint_to_utf8, ascii ) {
+    char out[4];
+    /* Latin Small Letter A, U+0061 requires one byte in utf-8. Below 0x7f,
+       Unicode and ASCII are identical, so that byte is 'a' itself. */
+    size_t size = ucodepoint_to_utf8('a', &out);
+    EXPECT_EQ(1, size);
+    EXPECT_EQ('a', out[0]);
+}
+
+// -------------------------------------------------------
+// Test suite for utf8_to_printable_ascii()
+// -------------------------------------------------------
+TEST(utf8_to_printable_ascii, null_input ) {
+    /* A NULL input pointer must be reported with an error message and a result of 0. */
+    const char* missing_input = NULL;
+    char escaped[128];
+    Error_Message_Expected();
+    const size_t length = utf8_to_printable_ascii( missing_input, escaped, sizeof(escaped));
+    EXPECT_EQ(0, length);
+}
+
+TEST(utf8_to_printable_ascii, null_output ) {
+    /* A NULL output pointer must be reported with an error message and a result of 0. */
+    const char* utf8_s = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)";
+    char* missing_output = NULL;
+    Error_Message_Expected();
+    const size_t length = utf8_to_printable_ascii( utf8_s, missing_output, size_t(5));
+    EXPECT_EQ(0, length);
+}
+
+TEST(utf8_to_printable_ascii, normal_1  ) {
+    /* Unicode characters must come out as Unicode escapes and the newline as a C escape. */
+    const char* utf8_s = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)\n";
+    const char* expected = "e\\u2071\\u1d60 = cos(\\u03d5) + i*sin(\\u03d5)\\n";
+    char escaped[128];
+    (void) utf8_to_printable_ascii( utf8_s, escaped, sizeof(escaped));
+    EXPECT_STREQ(expected, escaped);
+}
+
+TEST(utf8_to_printable_ascii, normal_2  ) {
+    /* utf8_to_printable_ascii() should escape all Unicode and non-printable ASCII characters. */
+
+    /* The input is every ASCII value from 0x01 through 0x7f in ascending order,
+       followed by the terminating NUL. */
+    char every_ascii[128];
+    for (int code = 0x01; code <= 0x7f; code++) {
+        every_ascii[code - 1] = (char)code;
+    }
+    every_ascii[127] = '\0';
+
+    /* Control characters with a named C escape use it, the other control characters
+       and DEL (0x7f) become hexadecimal escapes, and printable characters are kept. */
+    const char* expected = "\\x01\\x02\\x03\\x04\\x05\\x06\\a\\b\\t\\n\\v\\f"
+                           "\\r\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a\\x1b\\x1c\\x1d\\x1e\\x1f"
+                           " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f";
+
+    char escaped[256];
+    const size_t length = utf8_to_printable_ascii( every_ascii, escaped, sizeof(escaped));
+    EXPECT_EQ(209, length);
+    EXPECT_STREQ(expected, escaped);
+}
+
+/* The following are the UTF-8 encodings of four Unicode characters used in the tests below. */
+// Greek Phi Symbol => U+03d5 => 0xcf 0x95                      // see: https://www.compart.com/en/unicode/U+03D5
+// Superscript Latin Small Letter I => U+2071 => 0xe2 0x81 0xb1 // see: https://www.compart.com/en/unicode/U+2071
+// Modifier Letter Small Greek Phi  => U+1D60 => 0xe1 0xb5 0xa0 // see: https://www.compart.com/en/unicode/U+1D60
+// Aegean Number Ten => U+10110 => 0xf0 0x90 0x84 0x90          // see: https://www.compart.com/en/unicode/U+10110
+
+TEST(utf8_to_printable_ascii, demotest) {
+    /* "Phi = " followed by the two-byte UTF-8 encoding of U+03d5 (0xcf 0x95). */
+    const char phi_utf8[11] = "Phi = \xcf\x95";
+    const char* expected = "Phi = \\u03d5";
+    char escaped[128];
+    (void) utf8_to_printable_ascii(phi_utf8, escaped, sizeof(escaped));
+    EXPECT_STREQ(expected, escaped);
+}
+
+TEST(utf8_to_printable_ascii, detect_corruption_1) {
+    /* Element 6 (0x80) is a spurious continuation byte with no header byte
+       in front of it, so the input is deliberately corrupted. */
+    const char stray_continuation[11] = "Phi = \x80\x95";
+    char escaped[128];
+    Error_Message_Expected();
+    const size_t returned = utf8_to_printable_ascii(stray_continuation, escaped, sizeof(escaped));
+    EXPECT_EQ(0, returned);
+}
+
+TEST(utf8_to_printable_ascii, detect_corruption_2) {
+    /* 0xcf announces a two-byte sequence, so the next byte must be a
+       continuation byte (top two bits 10). 0x75 has top bits 01, which
+       makes this input deliberately malformed. */
+    const char bad_second_byte[11] = "Phi = \xcf\x75";
+    char escaped[128];
+    Error_Message_Expected();
+    const size_t returned = utf8_to_printable_ascii(bad_second_byte, escaped, sizeof(escaped));
+    EXPECT_EQ(0, returned);
+}
+
+TEST(utf8_to_printable_ascii, insufficient_result_array_size) {
+    /* The 16-byte destination is deliberately too small for the escaped output. */
+    const char* euler_utf8 = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)\n";
+    char too_small[16];
+    Error_Message_Expected();
+    const size_t returned = utf8_to_printable_ascii(euler_utf8, too_small, sizeof(too_small));
+    EXPECT_EQ(0, returned);
+}
+
+// -------------------------------------------------------
+// Test suite for ascii_to_utf8()
+// -------------------------------------------------------
+
+TEST(ascii_to_utf8, null_input) {
+    /* A NULL input pointer should produce an error message and a return of 0. */
+    char utf8_out[128];
+    char* no_input = static_cast<char*>(0);
+    Error_Message_Expected();
+    const size_t returned = ascii_to_utf8(no_input, utf8_out, sizeof(utf8_out));
+    EXPECT_EQ(0, returned);
+}
+
+TEST(ascii_to_utf8, null_output) {
+    /* A NULL output pointer should produce an error message and a return of 0. */
+    const char* euler = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)";
+    char* no_output = static_cast<char*>(0);
+
+    Error_Message_Expected();
+    const size_t returned = ascii_to_utf8(euler, no_output, static_cast<size_t>(5));
+    EXPECT_EQ(0, returned);
+}
+
+TEST(ascii_to_utf8, normal_1) {
+    /* Escaped Unicode and escaped ASCII sequences in the input should be
+       un-escaped by ascii_to_utf8(); the expected result is 30 bytes long. */
+    const char* escaped = "e\\u2071\\u1d60 = cos(\\u03d5) + i*sin(\\u03d5)\\n";
+    const char* expected_utf8 = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)\n";
+    char utf8_out[256];
+
+    const size_t returned = ascii_to_utf8(escaped, utf8_out, sizeof(utf8_out));
+    EXPECT_EQ(30, returned);
+    EXPECT_STREQ(expected_utf8, utf8_out);
+}
+
+TEST(ascii_to_utf8, non_ascii_chars) {
+    /* The input to ascii_to_utf8() should contain only ASCII, i.e. every
+       element should have a value < 128. This literal holds raw multi-byte
+       characters instead, so an error message should be emitted.
+     */
+    const char* raw_utf8 = "eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
+    char utf8_out[256];
+
+    Error_Message_Expected();
+    const size_t returned = ascii_to_utf8(raw_utf8, utf8_out, sizeof(utf8_out));
+    EXPECT_EQ(0, returned);
+}
+
+TEST(ascii_to_utf8, insufficient_hex_digits_1) {
+    /* The \U escape code expects exactly 8 hexadecimal digits. Only 5 are
+       supplied here, so an error message should result.
+       Note: "\U10110" is also rejected in a C/C++ literal at compile time,
+       because it is incomplete; the complete form is "\U00010110".
+     */
+    const char* short_escape = "Aegean Number Ten = \\U10110\n";
+    char utf8_out[256];
+
+    Error_Message_Expected();
+    const size_t returned = ascii_to_utf8(short_escape, utf8_out, sizeof(utf8_out));
+    EXPECT_EQ(0, returned);
+}
+
+TEST(ascii_to_utf8, insufficient_hex_digits_2) {
+    /* The \u escape code expects exactly 4 hexadecimal digits. Only 3 are
+       supplied here, so an error message should result.
+       Note: "\u3d5" is also rejected in a C/C++ literal at compile time,
+       because it is incomplete; the complete form is "\u03d5".
+     */
+    const char* short_escape = "Phi = \\u3d5\n";
+    char utf8_out[256];
+
+    Error_Message_Expected();
+    const size_t returned = ascii_to_utf8(short_escape, utf8_out, sizeof(utf8_out));
+    EXPECT_EQ(0, returned);
+}
+
+TEST(ascii_to_utf8, insufficient_result_array_size) {
+    /* The destination is only 16 bytes, which is deliberately too small
+       for the un-escaped result; an error message should be emitted.
+     */
+    const char* escaped = "e\\u2071\\u1d60 = cos(\\u03d5) + i*sin(\\u03d5)\\n";
+    char too_small[16];
+
+    Error_Message_Expected();
+    const size_t returned = ascii_to_utf8(escaped, too_small, sizeof(too_small));
+    EXPECT_EQ(0, returned);
+}
+
+// -------------------------------------------------------
+// Test suite for utf8_to_wchar()
+// -------------------------------------------------------
+
+/* The following three tests demonstrate three different ways to
+   create the same input string. */
+
+TEST(utf8_to_wchar, test1) {
+    /* Input variant 1: raw UTF-8 characters typed directly into the literal. */
+    wchar_t resultant_wchar_s[128] = {0}; /* zeroed so wcscmp() stays defined if conversion fails */
+    const char* input = "eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
+    const wchar_t* expected_wide_s = L"eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
+    (void) utf8_to_wchar(input, resultant_wchar_s, sizeof(resultant_wchar_s)/sizeof(wchar_t));
+    /* Check the converted output; comparing the expectation with itself can never fail. */
+    EXPECT_EQ(0, wcscmp(expected_wide_s, resultant_wchar_s));
+}
+
+TEST(utf8_to_wchar, test2) {
+    /* Input variant 2: the same text spelled with universal-character-name escapes. */
+    wchar_t resultant_wchar_s[128] = {0}; /* zeroed so wcscmp() stays defined if conversion fails */
+    const char* input = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)";
+    const wchar_t* expected_wide_s = L"eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
+    (void) utf8_to_wchar(input, resultant_wchar_s, sizeof(resultant_wchar_s)/sizeof(wchar_t));
+    /* Check the converted output; comparing the expectation with itself can never fail. */
+    EXPECT_EQ(0, wcscmp(expected_wide_s, resultant_wchar_s));
+}
+
+TEST(utf8_to_wchar, test3) {
+    /* Input variant 3: the same text given as an explicit array of UTF-8 bytes. */
+    wchar_t resultant_wchar_s[128] = {0}; /* zeroed so wcscmp() stays defined if conversion fails */
+    const char input[30] = {'e','\xe2','\x81','\xb1','\xe1', '\xb5','\xa0',' ','=',' ',
+                            'c','o','s','(','\xcf','\x95',')',' ','+',' ','i','*','s',
+                            'i','n','(','\xcf','\x95',')','\0'};
+    const wchar_t* expected_wide_s = L"eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
+    (void) utf8_to_wchar(input, resultant_wchar_s, sizeof(resultant_wchar_s)/sizeof(wchar_t));
+    /* Check the converted output; comparing the expectation with itself can never fail. */
+    EXPECT_EQ(0, wcscmp(expected_wide_s, resultant_wchar_s));
+}
+
+TEST(utf8_to_wchar, insufficient_result_array_size) {
+    /* The 16-element destination is deliberately too small for the 23-character input. */
+    const char* euler_utf8 = "eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
+    wchar_t too_small[16];
+    Error_Message_Expected();
+    const size_t returned = utf8_to_wchar(euler_utf8, too_small, sizeof(too_small)/sizeof(too_small[0]));
+    EXPECT_EQ(0, returned);
+}
+
+TEST(utf8_to_wchar, corrupted_input) {
+    wchar_t wide_out[128];
+    char utf8_bytes[30] = "e\xe2\x81\xb1\xe1\xb5\xa0 = "
+                          "cos(\xcf\x95) + "
+                          "i*sin(\xcf\x95)";
+
+    /* Deliberately corrupt the input: overwrite element 2, the middle byte
+       of the first three-byte sequence, with 0x70 (not a continuation byte). */
+    utf8_bytes[2] = 0x70;
+    Error_Message_Expected();
+    const size_t returned = utf8_to_wchar(utf8_bytes, wide_out, sizeof(wide_out)/sizeof(wide_out[0]));
+    EXPECT_EQ(0, returned);
+}
+
+// -------------------------------------------------------
+// Test suite for wchar_to_utf8()
+// -------------------------------------------------------
+TEST(wchar_to_utf8, test1) {
+    /* Convert a wide string and compare the resulting bytes with the same
+       text written as a narrow string literal. */
+    const wchar_t* euler_wide = L"eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
+    const char* expected_utf8 = "eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
+    char utf8_out[128];
+
+    wchar_to_utf8(euler_wide, utf8_out, sizeof(utf8_out));
+    EXPECT_EQ(true, strcmp(expected_utf8, utf8_out) == 0);
+}
+
+TEST(wchar_to_utf8, insufficient_result_array_size) {
+    /* The 16-byte destination is deliberately too small for the converted
+       text, so an error message is expected and the call should return 0. */
+    const wchar_t* euler_wide = L"eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
+    char too_small[16];
+    Error_Message_Expected();
+    const size_t returned = wchar_to_utf8(euler_wide, too_small, sizeof(too_small));
+    EXPECT_EQ(0, returned);
+}