mirror of
https://github.com/nasa/trick.git
synced 2025-04-15 15:06:44 +00:00
Return output length even if output array is NULL. Ref #708
This commit is contained in:
parent
ca99386026
commit
5333773fbd
@ -2,14 +2,15 @@
|
||||
#define UNITCODE_UTILS_H
|
||||
#include <stddef.h>
|
||||
|
||||
/* Maintainer: John M. Penn */
|
||||
/* Author: John M. Penn */
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Convert Unicode codepoint to UTF-32. Validates that it's a legal unicode value.
|
||||
Returns 1, if successful, 0 otherwise. */
|
||||
Returns 1, if successful, 0 otherwise.
|
||||
*/
|
||||
size_t ucodepoint_to_utf32(unsigned int codePoint, int32_t *out);
|
||||
|
||||
/* Convert Unicode codepoint to UTF-16.
|
||||
@ -24,27 +25,46 @@ size_t ucodepoint_to_utf16(unsigned int codePoint, int16_t (*out)[2]);
|
||||
*/
|
||||
size_t ucodepoint_to_utf8(unsigned int codePoint, char (*out)[4]);
|
||||
|
||||
|
||||
|
||||
/* Un-escape C escape sequences, including \u and \U Unicode escape sequences,
|
||||
in an ASCII character array, producing a UTF-8 character array. Return the
|
||||
number of elements in the character string.
|
||||
*/
|
||||
size_t ascii_to_utf8(const char *in, char *out, size_t outSize);
|
||||
|
||||
/* Escape ('\' escape codes) all unicode and non-printable ASCII characters
|
||||
in a UTF-8 character string. Return the number of elements in the character string.
|
||||
in a UTF-8 character string to an all-ASCII representation.
|
||||
Returns the number of elements in the character string, or 0 on failure.
|
||||
*/
|
||||
size_t escape_to_ascii(const char *in, char *out, size_t outSize);
|
||||
|
||||
/* Return the length of the array that would be produced if (in) were converted,
|
||||
or 0 on failure.
|
||||
*/
|
||||
size_t escape_to_ascii_len(const char *in);
|
||||
|
||||
/* Un-escape C-language escape sequences, including \u and \U Unicode escape sequences,
|
||||
in an ASCII character array, producing a UTF-8 character array.
|
||||
Returns the number of elements in the character string, or 0 on failure.
|
||||
*/
|
||||
size_t utf8_to_printable_ascii(const char *in, char *out, size_t outSize);
|
||||
size_t unescape_to_utf8(const char *in, char *out, size_t outSize);
|
||||
|
||||
/* Return the length of the array that would be produced if (in) were converted,
|
||||
or 0 on failure.
|
||||
*/
|
||||
size_t unescape_to_utf8_len(const char *in);
|
||||
|
||||
/* Convert a UTF-8 character array to a wchar_t array. Supports 16-bit and 32-bit wchar_t.
|
||||
Return the number of elements in the wchar_t string. */
|
||||
Returns the number of elements in the wchar_t string, or 0 on failure. */
|
||||
size_t utf8_to_wchar(const char *in, wchar_t *out, size_t outSize);
|
||||
|
||||
/* Convert wchar_t character array to UTF-8. Return the number of elements in
|
||||
the character (utf-8) string.*/
|
||||
/* Return the length of the array that would be produced if (in) were converted,
|
||||
or 0 on failure.
|
||||
*/
|
||||
size_t utf8_to_wchar_len(const char *in);
|
||||
|
||||
/* Convert wchar_t character array to UTF-8.
|
||||
Returns the number of elements in the character (utf-8) string,
|
||||
or 0 on failure.
|
||||
*/
|
||||
size_t wchar_to_utf8(const wchar_t *in, char *out, size_t outSize);
|
||||
|
||||
/* Return the length of the array that would be produced if (in) were converted, or 0 on failure. */
|
||||
size_t wchar_to_utf8_len(const wchar_t *in);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -6,7 +6,9 @@
|
||||
#include <stdint.h>
|
||||
#include "trick/unicode_utils.h"
|
||||
|
||||
/* Maintainer: John M. Penn */
|
||||
/* Author: John M. Penn */
|
||||
|
||||
#define ERROR_STATE 99
|
||||
|
||||
size_t ucodepoint_to_utf32(unsigned int codePoint, int32_t *out) {
|
||||
|
||||
@ -69,24 +71,20 @@ size_t ucodepoint_to_utf8(unsigned int codePoint, char (*out)[4]) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t utf8_to_printable_ascii(const char *in, char *out, size_t outSize) {
|
||||
|
||||
int state = 0;
|
||||
size_t escape_to_ascii(const char *in, char *out, size_t outSize) {
|
||||
unsigned int codePoint;
|
||||
char wks[11];
|
||||
|
||||
if (out == NULL) {
|
||||
fprintf(stderr,"%s:ERROR: ASCII char pointer (out) is NULL. No conversion performed.\n", __FUNCTION__);
|
||||
return 0;
|
||||
}
|
||||
out[0] = 0;
|
||||
size_t out_len = 0;
|
||||
int state = 0;
|
||||
char ascii_elements[11];
|
||||
|
||||
if (in == NULL) {
|
||||
fprintf(stderr,"%s:ERROR: UTF8 char-pointer (in) is NULL. No conversion performed.\n", __FUNCTION__);
|
||||
return 0;
|
||||
}
|
||||
|
||||
while (*in != 0) {
|
||||
if (out != NULL) out[out_len] = 0;
|
||||
|
||||
while ((*in != 0) && (state != ERROR_STATE)) {
|
||||
unsigned char ch = *in;
|
||||
switch (state) {
|
||||
case 0: {
|
||||
@ -101,55 +99,61 @@ size_t utf8_to_printable_ascii(const char *in, char *out, size_t outSize) {
|
||||
state = 1;
|
||||
} else if (ch >= 0x80) { // We should never find a continuation byte in isolation.
|
||||
fprintf(stderr,"%s:ERROR: UTF8 string (in) appears to be corrupted.\n", __FUNCTION__);
|
||||
state = 99;
|
||||
state = ERROR_STATE;
|
||||
} else { // ASCII
|
||||
if (ch == '\a') {
|
||||
sprintf(wks,"\\a");
|
||||
sprintf(ascii_elements, "\\a");
|
||||
} else if (ch == '\b') {
|
||||
sprintf(wks,"\\b");
|
||||
sprintf(ascii_elements, "\\b");
|
||||
} else if (ch == '\f') {
|
||||
sprintf(wks,"\\f");
|
||||
sprintf(ascii_elements, "\\f");
|
||||
} else if (ch == '\n') {
|
||||
sprintf(wks,"\\n");
|
||||
sprintf(ascii_elements, "\\n");
|
||||
} else if (ch == '\r') {
|
||||
sprintf(wks,"\\r");
|
||||
sprintf(ascii_elements, "\\r");
|
||||
} else if (ch == '\t') {
|
||||
sprintf(wks,"\\t");
|
||||
sprintf(ascii_elements, "\\t");
|
||||
} else if (ch == '\v') {
|
||||
sprintf(wks,"\\v");
|
||||
sprintf(ascii_elements, "\\v");
|
||||
} else if (isprint(ch)) {
|
||||
sprintf(wks,"%c",ch);
|
||||
sprintf(ascii_elements, "%c",ch);
|
||||
} else {
|
||||
sprintf(wks,"\\x%02x",ch);
|
||||
sprintf(ascii_elements, "\\x%02x",ch);
|
||||
}
|
||||
if ((strlen(out)+strlen(wks)) < outSize-1) {
|
||||
strcat(out, wks);
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Insufficient room in (out) array.\n", __FUNCTION__);
|
||||
state = 99;
|
||||
size_t n_elements = strlen(ascii_elements);
|
||||
if (out != NULL) {
|
||||
if ((out_len + n_elements) < outSize) {
|
||||
strcat(out, ascii_elements);
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Insufficient room in (out) array.\n", __FUNCTION__);
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
}
|
||||
out_len += n_elements;
|
||||
}
|
||||
} break;
|
||||
case 1: { // Expecting one continuation byte.
|
||||
if ((ch & 0xc0) == 0x80) { // If the next char is a continuation byte ..
|
||||
codePoint = (codePoint << 6) | (ch & 0x3f); // Extract low 6 bits
|
||||
state = 0;
|
||||
|
||||
if (codePoint <= 0xffff) {
|
||||
sprintf(wks,"\\u%04x", codePoint);
|
||||
sprintf(ascii_elements, "\\u%04x", codePoint);
|
||||
} else {
|
||||
sprintf(wks,"\\U%08x", codePoint);
|
||||
sprintf(ascii_elements, "\\U%08x", codePoint);
|
||||
}
|
||||
if ((strlen(out)+strlen(wks)) < outSize-1) {
|
||||
strcat(out, wks);
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Insufficient room in (out) array.\n", __FUNCTION__);
|
||||
state = 99;
|
||||
size_t n_elements = strlen(ascii_elements);
|
||||
if (out != NULL) {
|
||||
if ((out_len + n_elements) < outSize) {
|
||||
strcat(out, ascii_elements);
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Insufficient room in (out) array.\n", __FUNCTION__);
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
}
|
||||
|
||||
out_len += n_elements;
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: UTF8 string appears to be corrupted.\n", __FUNCTION__);
|
||||
state = 99;
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
} break;
|
||||
case 2: { /* Expecting two continuation bytes. */
|
||||
@ -158,7 +162,7 @@ size_t utf8_to_printable_ascii(const char *in, char *out, size_t outSize) {
|
||||
state = 1;
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: UTF8 string appears to be corrupted.\n", __FUNCTION__);
|
||||
state = 99;
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
} break;
|
||||
case 3: { /* Expecting three continuation bytes. */
|
||||
@ -167,60 +171,60 @@ size_t utf8_to_printable_ascii(const char *in, char *out, size_t outSize) {
|
||||
state = 2;
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: UTF8 string appears to be corrupted.\n", __FUNCTION__);
|
||||
state = 99;
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
} break;
|
||||
default: {
|
||||
out[0] = 0;
|
||||
return 0;
|
||||
state = ERROR_STATE;
|
||||
} break;
|
||||
}
|
||||
in ++;
|
||||
}
|
||||
/* If we didn't finish in state 0, then we had an error. */
|
||||
if (state != 0) {
|
||||
out_len = 0;
|
||||
}
|
||||
if (out != NULL) out[out_len] = 0; /* NULL termination of string. */
|
||||
return out_len;
|
||||
}
|
||||
|
||||
/* If we finished in state 0, then we're good. Just
|
||||
terminate the string, otherwise we had an error. */
|
||||
if (state == 0) {
|
||||
return strlen(out);
|
||||
} else {
|
||||
out[0] = 0;
|
||||
return 0;
|
||||
}
|
||||
/* Length-only form of escape_to_ascii(): runs the same conversion on (in)
   with a NULL output array and an output size of 0, so nothing is written.
   Returns whatever escape_to_ascii() returns for that call, i.e. the number
   of elements the escaped string would contain, or 0 on failure. */
size_t escape_to_ascii_len(const char *in) {
    const size_t escaped_len = escape_to_ascii(in, NULL, (size_t)0);
    return escaped_len;
}
|
||||
|
||||
/* Un-escapes ASCII and Unicode escape sequences, and encodes them into UTF-8. */
|
||||
size_t ascii_to_utf8(const char *in, char *out, size_t outSize) {
|
||||
size_t unescape_to_utf8(const char *in, char *out, size_t outSize) {
|
||||
|
||||
unsigned int codePoint = 0;
|
||||
size_t len = 0;
|
||||
size_t out_len = 0;
|
||||
int state = 0;
|
||||
int digitsExpected = 0;
|
||||
|
||||
if (out == NULL) {
|
||||
fprintf(stderr,"%s:ERROR: ASCII char pointer (out) is NULL. No conversion performed.\n", __FUNCTION__);
|
||||
return 0;
|
||||
}
|
||||
out[0] = 0;
|
||||
|
||||
if (in == NULL) {
|
||||
fprintf(stderr,"%s:ERROR: UTF8 char-pointer (in) is NULL. No conversion performed.\n", __FUNCTION__);
|
||||
fprintf(stderr,"%s:ERROR: char-pointer (in) is NULL. No conversion performed.\n", __FUNCTION__);
|
||||
return 0;
|
||||
}
|
||||
|
||||
while (*in != 0) {
|
||||
if (out != NULL) out[out_len] = 0;
|
||||
|
||||
while ((*in != 0) && (state != ERROR_STATE )) {
|
||||
unsigned char ch = *in;
|
||||
if (ch > 0x7f) { /* All input characters must be ASCII. */
|
||||
fprintf(stderr,"%s:ERROR: ASCII string (in) contains non-ASCII values.\n", __FUNCTION__);
|
||||
out[0] = 0;
|
||||
return 0;
|
||||
}
|
||||
/* All escaped characters will be un-escaped. */
|
||||
switch(state) {
|
||||
case 0: { // Normal State
|
||||
if (ch =='\\') {
|
||||
if (ch >= 0xf0) { // Start of a 4-byte UTF-8 sequence.
|
||||
if (out != NULL) out[out_len] = ch; out_len++; state = 3;
|
||||
} else if (ch >= 0xe0) { // Start of a 3-byte UTF-8 sequence.
|
||||
if (out != NULL) out[out_len] = ch; out_len++; state = 4;
|
||||
} else if (ch >= 0xc0) { // Start of a 2-byte UTF-8 sequence.
|
||||
if (out != NULL) out[out_len] = ch; out_len++; state = 5;
|
||||
} else if (ch >= 0x80) { // We should never find a UTF-8 continuation byte in isolation.
|
||||
fprintf(stderr,"%s:ERROR: Input string (in) appears to be corrupted.\n", __FUNCTION__);
|
||||
state = ERROR_STATE;
|
||||
} else if (ch =='\\') {
|
||||
state = 1;
|
||||
} else {
|
||||
out[len++] = ch;
|
||||
if (out != NULL) out[out_len] = ch;
|
||||
out_len++;
|
||||
}
|
||||
} break;
|
||||
case 1: { // Escaped State ( that is: we've found a '\' character.)
|
||||
@ -228,26 +232,24 @@ size_t ascii_to_utf8(const char *in, char *out, size_t outSize) {
|
||||
case '\'':
|
||||
case '\"':
|
||||
case '\?':
|
||||
case '\\': {
|
||||
out[len++] = ch; state = 0;
|
||||
} break;
|
||||
|
||||
case 'a': { out[len++] = '\a'; state = 0; } break;
|
||||
case 'b': { out[len++] = '\b'; state = 0; } break;
|
||||
case 'f': { out[len++] = '\f'; state = 0; } break;
|
||||
case 'n': { out[len++] = '\n'; state = 0; } break;
|
||||
case 'r': { out[len++] = '\r'; state = 0; } break;
|
||||
case 't': { out[len++] = '\t'; state = 0; } break;
|
||||
case 'v': { out[len++] = '\b'; state = 0; } break;
|
||||
case '\\': { if (out != NULL) out[out_len] = ch; out_len++; state = 0; } break;
|
||||
case 'a': { if (out != NULL) out[out_len] = '\a'; out_len++; state = 0; } break;
|
||||
case 'b': { if (out != NULL) out[out_len] = '\b'; out_len++; state = 0; } break;
|
||||
case 'f': { if (out != NULL) out[out_len] = '\f'; out_len++; state = 0; } break;
|
||||
case 'n': { if (out != NULL) out[out_len] = '\n'; out_len++; state = 0; } break;
|
||||
case 'r': { if (out != NULL) out[out_len] = '\r'; out_len++; state = 0; } break;
|
||||
case 't': { if (out != NULL) out[out_len] = '\t'; out_len++; state = 0; } break;
|
||||
case 'v': { if (out != NULL) out[out_len] = '\v'; out_len++; state = 0; } break; /* "\v" is vertical tab; '\b' (backspace) is already handled by case 'b'. */
|
||||
case 'x': { digitsExpected = 2; state = 2; } break;
|
||||
case 'u': { digitsExpected = 4; state = 2; } break;
|
||||
case 'U': { digitsExpected = 8; state = 2; } break;
|
||||
default : {
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
} // switch ch
|
||||
} break;
|
||||
case 2: { // Escaped Unicode ( that is: we've found '\x', '\u' or '\U'.)
|
||||
int digit = 0;
|
||||
int digit = -1;
|
||||
if (ch >= '0' && ch <= '9') {
|
||||
digit = ch - (int)'0';
|
||||
} else if (ch >= 'A' && ch <= 'F') {
|
||||
@ -257,108 +259,161 @@ size_t ascii_to_utf8(const char *in, char *out, size_t outSize) {
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Insufficient hexidecimal digits following"
|
||||
" \\x, \\u, or \\U escape code in char string (in).\n", __FUNCTION__);
|
||||
out[0] = 0;
|
||||
return 0;
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
codePoint = codePoint * 16 + digit;
|
||||
digitsExpected -- ;
|
||||
if ( digitsExpected == 0 ) {
|
||||
char temp[4];
|
||||
size_t count = ucodepoint_to_utf8(codePoint, &temp);
|
||||
if (count < (outSize-len)) {
|
||||
memcpy( &out[len], temp, sizeof(char) * count );
|
||||
len += count;
|
||||
if (digit >= 0) {
|
||||
codePoint = codePoint * 16 + digit;
|
||||
digitsExpected -- ;
|
||||
if ( digitsExpected == 0 ) {
|
||||
char utf8_bytes[4];
|
||||
size_t n_elements = ucodepoint_to_utf8(codePoint, &utf8_bytes);
|
||||
state = 0;
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Insufficient room in char array (out).\n", __FUNCTION__);
|
||||
out[0] = 0;
|
||||
return 0;
|
||||
}
|
||||
codePoint = 0;
|
||||
if (out != NULL) {
|
||||
if (out_len + n_elements < outSize) {
|
||||
memcpy( &out[out_len], utf8_bytes, sizeof(char) * n_elements );
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Insufficient room in char array (out).\n", __FUNCTION__);
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
}
|
||||
out_len += n_elements;
|
||||
codePoint = 0;
|
||||
}
|
||||
}
|
||||
} break;
|
||||
|
||||
case 3: { /* Expecting 3 UTF-8 continuation bytes. */
|
||||
if ((ch & 0xc0) == 0x80) {
|
||||
if (out != NULL) out[out_len] = ch; out_len++; state = 4;
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Input (in) appears to be corrupted.\n", __FUNCTION__);
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
} break;
|
||||
|
||||
case 4: { /* Expecting 2 UTF-8 continuation bytes. */
|
||||
if ((ch & 0xc0) == 0x80) {
|
||||
if (out != NULL) out[out_len] = ch; out_len++; state = 5;
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Input (in) appears to be corrupted.\n", __FUNCTION__);
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
} break;
|
||||
|
||||
case 5: { /* Expecting 1 UTF-8 continuation byte. */
|
||||
if ((ch & 0xc0) == 0x80) {
|
||||
if (out != NULL) out[out_len] = ch; out_len++; state = 0;
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Input (in) appears to be corrupted.\n", __FUNCTION__);
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
} break;
|
||||
|
||||
default: {
|
||||
out[0] = 0;
|
||||
return 0;
|
||||
state = ERROR_STATE;
|
||||
} break;
|
||||
}
|
||||
in ++;
|
||||
}
|
||||
out[len] = 0; /* NULL termination of string. */
|
||||
return len;
|
||||
if (state != 0) { /* If we didn't finish in state 0, then we had an error. */
|
||||
out_len = 0;
|
||||
}
|
||||
if (out != NULL) out[out_len] = 0; /* NULL termination of string. */
|
||||
return out_len;
|
||||
}
|
||||
|
||||
/* Length-only form of unescape_to_utf8(): runs the same conversion on (in)
   with a NULL output array and an output size of 0, so nothing is written.
   Returns whatever unescape_to_utf8() returns for that call, i.e. the number
   of elements the un-escaped UTF-8 string would contain, or 0 on failure. */
size_t unescape_to_utf8_len(const char *in) {
    const size_t unescaped_len = unescape_to_utf8(in, NULL, (size_t)0);
    return unescaped_len;
}
|
||||
|
||||
size_t utf8_to_wchar(const char *in, wchar_t *out, size_t outSize) {
|
||||
|
||||
unsigned int codePoint = 0;
|
||||
size_t len = 0;
|
||||
size_t out_len = 0;
|
||||
int state = 0;
|
||||
|
||||
while (*in != 0) {
|
||||
if (in == NULL) {
|
||||
fprintf(stderr,"%s:ERROR: UTF8 char-pointer (in) is NULL. No conversion performed.\n", __FUNCTION__);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (out != NULL) out[out_len] = 0;
|
||||
|
||||
while ((*in != 0) && (state != ERROR_STATE)) {
|
||||
unsigned char ch = *in;
|
||||
switch (state) {
|
||||
case 0: {
|
||||
if (ch >= 0xf0) { // Start of a 4-byte sequence.
|
||||
if (ch >= 0xf0) { // Start of a 4-byte UTF-8 sequence.
|
||||
codePoint = ch & 0x07; // Extract low 3 bits
|
||||
state = 3;
|
||||
} else if (ch >= 0xe0) { // Start of a 3-byte sequence.
|
||||
} else if (ch >= 0xe0) { // Start of a 3-byte UTF-8 sequence.
|
||||
codePoint = ch & 0x0f; // Extract low 4 bits
|
||||
state = 2;
|
||||
} else if (ch >= 0xc0) { // Start of a 2-byte sequence.
|
||||
} else if (ch >= 0xc0) { // Start of a 2-byte UTF-8 sequence.
|
||||
codePoint = ch & 0x1f; // Extract low 5 bits
|
||||
state = 1;
|
||||
} else if (ch >= 0x80) { // We should never find a continuation byte in isolation.
|
||||
} else if (ch >= 0x80) { // We should never find a UTF-8 continuation byte in isolation.
|
||||
fprintf(stderr,"%s:ERROR: UTF8 string (in) appears to be corrupted.\n", __FUNCTION__);
|
||||
state = 99;
|
||||
state = ERROR_STATE;
|
||||
} else {
|
||||
codePoint = ch; // ASCII
|
||||
if ((outSize-len) > 1) {
|
||||
out[len++] = (wchar_t)codePoint;
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Insufficient room in wchar_t array (out).\n", __FUNCTION__);
|
||||
state = 99;
|
||||
if (out != NULL) {
|
||||
if ((out_len + 1) < outSize) {
|
||||
out[out_len] = (wchar_t)codePoint;
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Insufficient room in wchar_t array (out).\n", __FUNCTION__);
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
}
|
||||
out_len++;
|
||||
}
|
||||
} break;
|
||||
case 1: { // Expecting one continuation byte.
|
||||
if ((ch & 0xc0) == 0x80) { // If the next char is a continuation byte ..
|
||||
case 1: { /* Expecting one continuation byte. */
|
||||
if ((ch & 0xc0) == 0x80) {
|
||||
codePoint = (codePoint << 6) | (ch & 0x3f); // Extract lower 6 bits
|
||||
state = 0;
|
||||
|
||||
if (sizeof(wchar_t) == 4) { // wchar_t is UTF-32
|
||||
int32_t temp;
|
||||
if ( ucodepoint_to_utf32(codePoint, &temp) > 0) {
|
||||
if ((outSize-len) > 1) {
|
||||
out[len++] = (wchar_t)temp;
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Insufficient room in wchar_t array (out).\n", __FUNCTION__);
|
||||
state = 99;
|
||||
int32_t utf32_element;
|
||||
if ( ucodepoint_to_utf32(codePoint, &utf32_element) > 0) {
|
||||
if (out != NULL) {
|
||||
if ((out_len + 1) < outSize) {
|
||||
out[out_len] = (wchar_t)utf32_element;
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Insufficient room in wchar_t array (out).\n", __FUNCTION__);
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
}
|
||||
out_len++;
|
||||
} else {
|
||||
state = 99;
|
||||
/* In this case, ucodepoint_to_utf32() will already have produced an error message. */
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
|
||||
} else if (sizeof(wchar_t) == 2) { // wchar_t is UTF-16
|
||||
int16_t temp[2];
|
||||
size_t count;
|
||||
if (( count = ucodepoint_to_utf16(codePoint, &temp)) > 0) {
|
||||
if (count < (outSize-len)) {
|
||||
memcpy( &out[len], temp, sizeof(int16_t) * count );
|
||||
len += count;
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Insufficient room in wchar_t array (out).\n", __FUNCTION__);
|
||||
state = 99;
|
||||
int16_t utf16_elements[2];
|
||||
size_t n_elements;
|
||||
if (( n_elements = ucodepoint_to_utf16(codePoint, &utf16_elements)) > 0) {
|
||||
if (out != NULL) {
|
||||
if ((out_len + n_elements) < outSize) {
|
||||
memcpy( &out[out_len], utf16_elements, sizeof(int16_t) * n_elements);
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Insufficient room in wchar_t array (out).\n", __FUNCTION__);
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
}
|
||||
out_len += n_elements;
|
||||
} else {
|
||||
/* In this case, ucodepoint_to_utf16() will already have produced an error message. */
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Unsupported wchar_t size.\n", __FUNCTION__);
|
||||
state = 99;
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: UTF8 string appears to be corrupted.\n", __FUNCTION__);
|
||||
state = 99;
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
} break;
|
||||
case 2: { /* Expecting two continuation bytes. */
|
||||
@ -367,7 +422,7 @@ size_t utf8_to_wchar(const char *in, wchar_t *out, size_t outSize) {
|
||||
state = 1;
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: UTF8 string appears to be corrupted.\n", __FUNCTION__);
|
||||
state = 99;
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
} break;
|
||||
case 3: { /* Expecting three continuation bytes. */
|
||||
@ -376,35 +431,38 @@ size_t utf8_to_wchar(const char *in, wchar_t *out, size_t outSize) {
|
||||
state = 2;
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: UTF8 string appears to be corrupted.\n", __FUNCTION__);
|
||||
state = 99;
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
} break;
|
||||
default: { /* Error State. */
|
||||
out[0] = 0;
|
||||
return 0;
|
||||
default: {
|
||||
state = ERROR_STATE;
|
||||
} break;
|
||||
}
|
||||
in ++;
|
||||
}
|
||||
if (state != 0) { /* If we didn't finish in state 0, it's an error. */
|
||||
out_len = 0;
|
||||
}
|
||||
if (out != NULL) out[out_len] = 0; /* NULL termination of string. */
|
||||
return out_len;
|
||||
}
|
||||
|
||||
/* If we finished in state 0, then we're good. Just
|
||||
terminate the string, otherwise we had an error. */
|
||||
if (state == 0) {
|
||||
out[len] = 0;
|
||||
return len;
|
||||
} else {
|
||||
out[0] = 0;
|
||||
return 0;
|
||||
}
|
||||
return len;
|
||||
/* Length-only form of utf8_to_wchar(): runs the same conversion on (in)
   with a NULL output array and an output size of 0, so nothing is written.
   Returns whatever utf8_to_wchar() returns for that call, i.e. the number
   of wchar_t elements the converted string would contain, or 0 on failure. */
size_t utf8_to_wchar_len(const char *in) {
    const size_t wide_len = utf8_to_wchar(in, NULL, (size_t)0);
    return wide_len;
}
|
||||
|
||||
size_t wchar_to_utf8(const wchar_t *in, char *out, size_t outSize ) {
|
||||
|
||||
unsigned int codePoint = 0;
|
||||
size_t len = 0;
|
||||
size_t out_len = 0;
|
||||
int state = 0;
|
||||
|
||||
while ( *in != 0 ) {
|
||||
if (in == NULL) {
|
||||
fprintf(stderr,"%s:ERROR: wchar_t-pointer (in) is NULL. No conversion performed.\n", __FUNCTION__);
|
||||
return 0;
|
||||
}
|
||||
|
||||
while ((*in != 0) && (state != ERROR_STATE)) {
|
||||
if (*in >= 0xd800 && *in <= 0xdbff) /* If High-surrogate. */
|
||||
codePoint = ((*in - 0xd800) << 10) + 0x10000;
|
||||
else {
|
||||
@ -414,24 +472,33 @@ size_t wchar_to_utf8(const wchar_t *in, char *out, size_t outSize ) {
|
||||
codePoint = *in;
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Invalid Unicode value.\n", __FUNCTION__);
|
||||
out[0] = 0;
|
||||
return 0;
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
|
||||
char temp[4];
|
||||
size_t count = ucodepoint_to_utf8(codePoint, &temp);
|
||||
if (count < (outSize-len)) {
|
||||
memcpy( &out[len], temp, sizeof(char) * count );
|
||||
len += count;
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Insufficient room in char array (out).\n", __FUNCTION__);
|
||||
out[0] = 0;
|
||||
return 0;
|
||||
if (state != ERROR_STATE) {
|
||||
char utf8_elements[4];
|
||||
size_t n_elements = ucodepoint_to_utf8(codePoint, &utf8_elements);
|
||||
if (out != NULL) {
|
||||
if ((out_len + n_elements) < outSize) {
|
||||
memcpy( &out[out_len], utf8_elements, sizeof(char) * n_elements );
|
||||
} else {
|
||||
fprintf(stderr,"%s:ERROR: Insufficient room in char array (out).\n", __FUNCTION__);
|
||||
state = ERROR_STATE;
|
||||
}
|
||||
}
|
||||
out_len += n_elements;
|
||||
codePoint = 0;
|
||||
}
|
||||
codePoint = 0;
|
||||
}
|
||||
in++;
|
||||
}
|
||||
out[len] = L'\0'; /* NULL termination of string. */
|
||||
return len;
|
||||
if (state != 0) { /* If we didn't finish in state 0, it's an error. */
|
||||
out_len = 0;
|
||||
}
|
||||
if (out != NULL) out[out_len] = 0; /* NULL termination of string. */
|
||||
return out_len;
|
||||
}
|
||||
|
||||
/* Length-only form of wchar_to_utf8(): runs the same conversion on (in)
   with a NULL output array and an output size of 0, so nothing is written.
   Returns whatever wchar_to_utf8() returns for that call, i.e. the number
   of elements the UTF-8 string would contain, or 0 on failure. */
size_t wchar_to_utf8_len(const wchar_t *in) {
    const size_t utf8_len = wchar_to_utf8(in, NULL, (size_t)0);
    return utf8_len;
}
|
||||
|
@ -11,7 +11,7 @@ include ${TRICK_HOME}/share/trick/makefiles/Makefile.common
|
||||
TRICK_CPPFLAGS += -I$(GTEST_HOME)/include -I$(TRICK_HOME)/include -g -Wall -Wextra -DGTEST_HAS_TR1_TUPLE=0
|
||||
|
||||
TRICK_LIBS = ${TRICK_LIB_DIR}/libtrick.a
|
||||
TRICK_EXEC_LINK_LIBS += -L${GTEST_HOME}/lib64 -L${GTEST_HOME}/lib -lgtest -lgtest_main -lpthread
|
||||
TRICK_EXEC_LINK_LIBS += -L${GTEST_HOME}/lib64 -L${GTEST_HOME}/lib -lgtest -lpthread
|
||||
|
||||
# Added for Ubuntu... not required for other systems.
|
||||
TRICK_EXEC_LINK_LIBS += -lpthread
|
||||
|
@ -5,6 +5,11 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include "trick/unicode_utils.h"
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
}
|
||||
|
||||
const char* ISO_6429_Restore_Default = "\x1b[00m";
|
||||
const char* ISO_6429_Bold = "\x1b[01m";
|
||||
const char* ISO_6429_Underline = "\x1b[04m";
|
||||
@ -31,6 +36,12 @@ void Error_Message_Expected() {
|
||||
printf("%s\n", ISO_6429_Restore_Default );
|
||||
}
|
||||
|
||||
/* The following are the UTF-8 encodings of the four Unicode characters used in the tests below. */
|
||||
// Greek Phi Symbol => U+03d5 => 0xcf 0x95 // see: https://www.compart.com/en/unicode/U+03D5
|
||||
// Superscript Latin Small Letter I => U+2071 => 0xe2 0x81 0xb1 // see: https://www.compart.com/en/unicode/U+2071
|
||||
// Modifier Letter Small Greek Phi => U+1D60 => 0xe1 0xb5 0xa0 // see: https://www.compart.com/en/unicode/U+1D60
|
||||
// Aegean Number Ten => U+10110 => 0xf0 0x90 0x84 0x90 // see: https://www.compart.com/en/unicode/U+10110
|
||||
|
||||
// -------------------------------------------------------
|
||||
// Test suite for ucodepoint_to_utf32()
|
||||
// -------------------------------------------------------
|
||||
@ -134,38 +145,39 @@ TEST(ucodepoint_to_utf8, ascii ) {
|
||||
}
|
||||
|
||||
// -------------------------------------------------------
|
||||
// Test suite for utf8_to_printable_ascii()
|
||||
// Test suite for escape_to_ascii()
|
||||
// -------------------------------------------------------
|
||||
TEST(utf8_to_printable_ascii, null_input ) {
|
||||
TEST(escape_to_ascii, null_input ) {
|
||||
/* Should generate error message if input character pointer is NULL. */
|
||||
char resultant_ascii_s[128];
|
||||
char output[128];
|
||||
char* null_ptr = (char*)0;
|
||||
Error_Message_Expected();
|
||||
size_t size = utf8_to_printable_ascii( null_ptr, resultant_ascii_s, sizeof(resultant_ascii_s));
|
||||
size_t size = escape_to_ascii( null_ptr, output, sizeof(output));
|
||||
EXPECT_EQ(0, size);
|
||||
}
|
||||
|
||||
TEST(utf8_to_printable_ascii, null_output ) {
|
||||
/* Should generate error message if output character pointer is NULL. */
|
||||
TEST(escape_to_ascii, null_output ) {
|
||||
/* If output character pointer is NULL, still determine the length. */
|
||||
char* null_ptr = (char*)0;
|
||||
const char* input = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)";
|
||||
Error_Message_Expected();
|
||||
size_t size = utf8_to_printable_ascii( input, null_ptr, size_t(5));
|
||||
EXPECT_EQ(0, size);
|
||||
const char* input = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)\n";
|
||||
size_t expected_size = strlen ("e\\u2071\\u1d60 = cos(\\u03d5) + i*sin(\\u03d5)\\n");
|
||||
size_t size = escape_to_ascii( input, null_ptr, size_t(5));
|
||||
EXPECT_EQ(expected_size, size);
|
||||
}
|
||||
|
||||
TEST(utf8_to_printable_ascii, normal_1 ) {
|
||||
char resultant_ascii_s[128];
|
||||
/* utf8_to_printable_ascii() should escape all Unicode and non-printable ASCII characters. */
|
||||
TEST(escape_to_ascii, normal_1 ) {
|
||||
char output[128];
|
||||
/* escape_to_ascii() should escape all Unicode and non-printable ASCII characters. */
|
||||
const char* utf8_s = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)\n";
|
||||
const char* expected_ascii_s = "e\\u2071\\u1d60 = cos(\\u03d5) + i*sin(\\u03d5)\\n";
|
||||
(void) utf8_to_printable_ascii( utf8_s, resultant_ascii_s, sizeof(resultant_ascii_s));
|
||||
EXPECT_STREQ(expected_ascii_s, resultant_ascii_s);
|
||||
size_t size = escape_to_ascii( utf8_s, output, sizeof(output));
|
||||
EXPECT_EQ( strlen(expected_ascii_s), size);
|
||||
EXPECT_STREQ(expected_ascii_s, output);
|
||||
}
|
||||
|
||||
TEST(utf8_to_printable_ascii, normal_2 ) {
|
||||
char resultant_ascii_s[256];
|
||||
/* utf8_to_printable_ascii() should escape all Unicode and non-printable ASCII characters. */
|
||||
TEST(escape_to_ascii, normal_2 ) {
|
||||
char output[256];
|
||||
/* escape_to_ascii() should escape all Unicode and non-printable ASCII characters. */
|
||||
const char ascii[128] = { '\x01','\x02','\x03','\x04','\x05','\x06','\x07','\x08','\x09','\x0a','\x0b','\x0c','\x0d','\x0e','\x0f',
|
||||
'\x10','\x11','\x12','\x13','\x14','\x15','\x16','\x17','\x18','\x19','\x1a','\x1b','\x1c','\x1d','\x1e','\x1f',
|
||||
'\x20','\x21','\x22','\x23','\x24','\x25','\x26','\x27','\x28','\x29','\x2a','\x2b','\x2c','\x2d','\x2e','\x2f',
|
||||
@ -181,133 +193,147 @@ TEST(utf8_to_printable_ascii, normal_2 ) {
|
||||
"\\r\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a\\x1b\\x1c\\x1d\\x1e\\x1f"
|
||||
" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f";
|
||||
|
||||
size_t size = utf8_to_printable_ascii( ascii, resultant_ascii_s, sizeof(resultant_ascii_s));
|
||||
EXPECT_EQ(209, size);
|
||||
EXPECT_STREQ(expected_ascii_s, resultant_ascii_s);
|
||||
size_t size = escape_to_ascii( ascii, output, sizeof(output));
|
||||
EXPECT_EQ(strlen(expected_ascii_s), size);
|
||||
EXPECT_STREQ(expected_ascii_s, output);
|
||||
}
|
||||
|
||||
/* The following are the utf-8 encodings of four unicode characters used in the following tests. */
|
||||
// Greek Phi Symbol => U+03d5 => 0xcf 0x95 // see: https://www.compart.com/en/unicode/U+03D5
|
||||
// Superscript Latin Small Letter I => U+2071 => 0xe2 0x81 0xb1 // see: https://www.compart.com/en/unicode/U+2071
|
||||
// Modifier Letter Small Greek Phi => U+1D60 => 0xe1 0xb5 0xa0 // see: https://www.compart.com/en/unicode/U+1D60
|
||||
// Aegean Number Ten => U+10110 => 0xf0 0x90 0x84 0x90 // see: https://www.compart.com/en/unicode/U+10110
|
||||
TEST(escape_to_ascii, demotest ) {
|
||||
char output[128];
|
||||
|
||||
TEST(utf8_to_printable_ascii, demotest ) {
|
||||
char resultant_ascii_s[128];
|
||||
/* This test simply demonstrates that the following UTF-8 string (utf8_s),
|
||||
used in subsequent tests, is a well formed UTF-8 string. */
|
||||
|
||||
const char utf8_s[11] = {'P','h','i',' ','=',' ','\xcf','\x95','\0'};
|
||||
const char* expected_ascii_s = "Phi = \\u03d5";
|
||||
(void) utf8_to_printable_ascii( utf8_s, resultant_ascii_s, sizeof(resultant_ascii_s));
|
||||
EXPECT_STREQ(expected_ascii_s, resultant_ascii_s);
|
||||
|
||||
size_t size = escape_to_ascii( utf8_s, output, sizeof(output));
|
||||
|
||||
EXPECT_STREQ(expected_ascii_s, output);
|
||||
EXPECT_EQ(strlen(expected_ascii_s), size);
|
||||
}
|
||||
|
||||
TEST(utf8_to_printable_ascii, detect_corruption_1 ) {
|
||||
char resultant_ascii_s[128];
|
||||
/* The following string is deliberately corrupted with a spurious
|
||||
continuation character (in corrupted_utf8_s[6]).*/
|
||||
const char corrupted_utf8_s[11] = {'P','h','i',' ','=',' ','\x80','\x95','\0'};
|
||||
TEST(escape_to_ascii, detect_corruption_1 ) {
|
||||
char output[128];
|
||||
|
||||
/* The input string is deliberately corrupted with a spurious
|
||||
continuation character.*/
|
||||
|
||||
char utf8_s[11] = {'P','h','i',' ','=',' ','\xcf','\x95','\0'};
|
||||
utf8_s[6] = '\x80'; /* Deliberately corrupt the UTF-8 string. */
|
||||
|
||||
Error_Message_Expected();
|
||||
size_t size = utf8_to_printable_ascii( corrupted_utf8_s, resultant_ascii_s, sizeof(resultant_ascii_s));
|
||||
size_t size = escape_to_ascii( utf8_s, output, sizeof(output));
|
||||
|
||||
EXPECT_EQ(0, size);
|
||||
}
|
||||
|
||||
TEST(utf8_to_printable_ascii, detect_corruption_2 ) {
|
||||
char resultant_ascii_s[128];
|
||||
TEST(escape_to_ascii, detect_corruption_2 ) {
|
||||
char output[128];
|
||||
|
||||
/* The following string is deliberately corrupted: 0xcf is a header
|
||||
for a two-byte sequence, it should be followed by a continuation
|
||||
byte (most significant 2 bits are 10). 0x75 starts with 01 */
|
||||
const char corrupted_utf8_s[11] = {'P','h','i',' ','=',' ','\xcf','\x75','\0'};
|
||||
|
||||
char utf8_s[11] = {'P','h','i',' ','=',' ','\xcf','\x95','\0'};
|
||||
utf8_s[7] = '\x75'; /* Deliberately corrupt the UTF-8 string. */
|
||||
|
||||
Error_Message_Expected();
|
||||
size_t size = utf8_to_printable_ascii( corrupted_utf8_s, resultant_ascii_s, sizeof(resultant_ascii_s));
|
||||
size_t size = escape_to_ascii( utf8_s, output, sizeof(output));
|
||||
|
||||
EXPECT_EQ(0, size);
|
||||
}
|
||||
|
||||
TEST(utf8_to_printable_ascii, insufficient_result_array_size ) {
|
||||
/* The result array must be of sufficient size. Here it is not. */
|
||||
char resultant_ascii_s[16];
|
||||
TEST(escape_to_ascii, insufficient_result_array_size ) {
|
||||
char output[16];
|
||||
|
||||
/* If the output array pointer is not NULL, it must be of sufficient size. Here it is not. */
|
||||
const char* utf8_s = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)\n";
|
||||
Error_Message_Expected();
|
||||
size_t size = utf8_to_printable_ascii( utf8_s, resultant_ascii_s, sizeof(resultant_ascii_s));
|
||||
size_t size = escape_to_ascii( utf8_s, output, sizeof(output));
|
||||
EXPECT_EQ(0, size);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------
|
||||
// Test suite for ascii_to_utf8()
|
||||
// Test suite for unescape_to_utf8()
|
||||
// -------------------------------------------------------
|
||||
|
||||
TEST(ascii_to_utf8, null_input ) {
|
||||
TEST(unescape_to_utf8, null_input ) {
|
||||
/* Should generate error message if input character pointer is NULL. */
|
||||
char resultant_ascii_s[128];
|
||||
char output[128];
|
||||
char* null_ptr = (char*)0;
|
||||
Error_Message_Expected();
|
||||
size_t size = ascii_to_utf8( null_ptr, resultant_ascii_s, sizeof(resultant_ascii_s));
|
||||
size_t size = unescape_to_utf8( null_ptr, output, sizeof(output));
|
||||
EXPECT_EQ(0, size);
|
||||
}
|
||||
|
||||
TEST(ascii_to_utf8, null_output ) {
|
||||
/* Should generate error message if output character pointer is NULL. */
|
||||
TEST(unescape_to_utf8, null_output ) {
|
||||
/* Should return the length of the string that would have been produced. */
|
||||
char* null_ptr = (char*)0;
|
||||
const char* input = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)";
|
||||
|
||||
Error_Message_Expected();
|
||||
size_t size = ascii_to_utf8( input, null_ptr, size_t(5));
|
||||
EXPECT_EQ(0, size);
|
||||
const char* input = "e\\u2071\\u1d60 = cos(\\u03d5) + i*sin(\\u03d5)\\n";
|
||||
size_t expected_size = strlen("e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)\n");
|
||||
size_t size = unescape_to_utf8( input, null_ptr, size_t(5));
|
||||
EXPECT_EQ(expected_size, size);
|
||||
}
|
||||
|
||||
TEST(ascii_to_utf8, normal_1) {
|
||||
/* ascii_to_utf8() should un-escape all escaped ASCII and escaped unicode.
|
||||
*/
|
||||
char actual_output[256];
|
||||
TEST(unescape_to_utf8, normal_1) {
|
||||
/* unescape_to_utf8() should un-escape all escaped ASCII and escaped unicode,
|
||||
producing a utf8 character string. It should also return the length of
|
||||
that string. */
|
||||
char actual_output[128];
|
||||
const char* input = "e\\u2071\\u1d60 = cos(\\u03d5) + i*sin(\\u03d5)\\n";
|
||||
const char* expected_output = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)\n";
|
||||
|
||||
size_t size = ascii_to_utf8(input, actual_output, sizeof(actual_output));
|
||||
EXPECT_EQ(30, size);
|
||||
size_t size = unescape_to_utf8(input, actual_output, sizeof(actual_output));
|
||||
|
||||
EXPECT_EQ( strlen(expected_output), size);
|
||||
EXPECT_STREQ(expected_output, actual_output);
|
||||
}
|
||||
|
||||
TEST(ascii_to_utf8, non_ascii_chars) {
|
||||
char actual_output[256];
|
||||
/* The input string should only contain ASCII characters, that is,
|
||||
each element should have a value < 128. That isn't the case in the
|
||||
following string. Therefore, an error message should be emitted.
|
||||
TEST(unescape_to_utf8, non_ascii_chars) {
|
||||
char actual_output[128];
|
||||
/*
|
||||
???
|
||||
*/
|
||||
const char* input = "eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
|
||||
const char* input = "eⁱᵠ = cos(ϕ) + i*sin(\\u03d5)\\n";
|
||||
|
||||
Error_Message_Expected();
|
||||
size_t size = ascii_to_utf8(input, actual_output, sizeof(actual_output));
|
||||
EXPECT_EQ(0, size);
|
||||
size_t size = unescape_to_utf8(input, actual_output, sizeof(actual_output));
|
||||
|
||||
EXPECT_EQ(30, size);
|
||||
}
|
||||
|
||||
TEST(ascii_to_utf8, insufficient_hex_digits_1) {
|
||||
TEST(unescape_to_utf8, insufficient_hex_digits_1) {
|
||||
/* The \U escape code expects exactly 8 hexadecimal digits to follow.
|
||||
If fewer than 8 are present, then an error message should result.
|
||||
Note: "\U10110" will fail in a C/C++ literal at compile time too,
|
||||
because it is incomplete. It should be "\U00010110".
|
||||
*/
|
||||
char actual_output[256];
|
||||
char actual_output[128];
|
||||
const char* input = "Aegean Number Ten = \\U10110\n";
|
||||
|
||||
Error_Message_Expected();
|
||||
size_t size = ascii_to_utf8(input, actual_output, sizeof(actual_output));
|
||||
size_t size = unescape_to_utf8(input, actual_output, sizeof(actual_output));
|
||||
|
||||
EXPECT_EQ(0, size);
|
||||
}
|
||||
|
||||
TEST(ascii_to_utf8, insufficient_hex_digits_2) {
|
||||
TEST(unescape_to_utf8, insufficient_hex_digits_2) {
|
||||
/* The \u escape code expects exactly 4 hexadecimal digits to follow.
|
||||
If fewer than 4 are present, then an error message should result.
|
||||
Note: "\u3d5" will fail in a C/C++ literal at compile time too,
|
||||
because it is incomplete. It should be "\u03d5".
|
||||
*/
|
||||
char actual_output[256];
|
||||
char actual_output[128];
|
||||
const char* input = "Phi = \\u3d5\n";
|
||||
|
||||
Error_Message_Expected();
|
||||
size_t size = ascii_to_utf8(input, actual_output, sizeof(actual_output));
|
||||
size_t size = unescape_to_utf8(input, actual_output, sizeof(actual_output));
|
||||
|
||||
EXPECT_EQ(0, size);
|
||||
}
|
||||
|
||||
TEST(ascii_to_utf8, insufficient_result_array_size) {
|
||||
TEST(unescape_to_utf8, insufficient_result_array_size) {
|
||||
/* The result array must be of sufficient size. If it isn't, then an error
|
||||
message should be emitted.
|
||||
*/
|
||||
@ -315,13 +341,31 @@ TEST(ascii_to_utf8, insufficient_result_array_size) {
|
||||
const char* input = "e\\u2071\\u1d60 = cos(\\u03d5) + i*sin(\\u03d5)\\n";
|
||||
|
||||
Error_Message_Expected();
|
||||
size_t size = ascii_to_utf8(input, actual_output, sizeof(actual_output));
|
||||
size_t size = unescape_to_utf8(input, actual_output, sizeof(actual_output));
|
||||
|
||||
EXPECT_EQ(0, size);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------
|
||||
// Test suite for utf8_to_wchar()
|
||||
// -------------------------------------------------------
|
||||
TEST(utf8_to_wchar, null_input ) {
    /* A NULL input pointer is an error condition: utf8_to_wchar() is
       expected to emit an error message and report a length of 0. */
    char* no_input = (char*)0;
    wchar_t wide_buf[128];
    const size_t wide_buf_len = sizeof(wide_buf) / sizeof(wide_buf[0]);

    /* Announce the upcoming error message before triggering it. */
    Error_Message_Expected();
    size_t result_len = utf8_to_wchar(no_input, wide_buf, wide_buf_len);

    EXPECT_EQ(0, result_len);
}
|
||||
|
||||
TEST(utf8_to_wchar, null_output ) {
    /* With a NULL output pointer, utf8_to_wchar() should still return the
       number of wide characters that the conversion would have produced. */
    const char* utf8_in = "e\u2071\u1d60 = cos(\u03d5) + i*sin(\u03d5)";
    const size_t wanted_len = wcslen(L"eⁱᵠ = cos(ϕ) + i*sin(ϕ)");
    wchar_t* no_output = (wchar_t*)0;

    size_t actual_len = utf8_to_wchar(utf8_in, no_output, size_t(0));

    EXPECT_EQ(wanted_len, actual_len);
}
|
||||
|
||||
/* The following three tests demonstrate three different ways to
|
||||
create the same input string. */
|
||||
@ -348,7 +392,7 @@ TEST(utf8_to_wchar, test2) {
|
||||
|
||||
TEST(utf8_to_wchar, test3) {
|
||||
wchar_t resultant_wchar_s[128];
|
||||
const char input[30] = {'e','\xe2','\x81','\xb1','\xe1', '\xb5','\xa0',' ','=',' ',
|
||||
const char input[30] = {'e','\xe2','\x81','\xb1','\xe1','\xb5','\xa0',' ','=',' ',
|
||||
'c','o','s','(','\xcf','\x95',')',' ','+',' ','i','*','s',
|
||||
'i','n','(','\xcf','\x95',')','\0'};
|
||||
const wchar_t* expected_wide_s = L"eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
|
||||
@ -369,7 +413,7 @@ TEST(utf8_to_wchar, insufficient_result_array_size) {
|
||||
|
||||
TEST(utf8_to_wchar, corrupted_input) {
|
||||
wchar_t resultant_wchar_s[128];
|
||||
char input[30] = {'e','\xe2','\x81','\xb1','\xe1', '\xb5','\xa0',' ','=',' ',
|
||||
char input[30] = {'e','\xe2','\x81','\xb1','\xe1','\xb5','\xa0',' ','=',' ',
|
||||
'c','o','s','(','\xcf','\x95',')',' ','+',' ','i','*','s',
|
||||
'i','n','(','\xcf','\x95',')','\0'};
|
||||
|
||||
@ -384,8 +428,26 @@ TEST(utf8_to_wchar, corrupted_input) {
|
||||
// -------------------------------------------------------
|
||||
// Test suite for wchar_to_utf8()
|
||||
// -------------------------------------------------------
|
||||
TEST(wchar_to_utf8, test1) {
|
||||
TEST(wchar_to_utf8, null_input ) {
    /* Should generate error message if input character pointer is NULL,
       and return 0. */
    wchar_t* null_ptr = (wchar_t*)0;
    char output[128];

    Error_Message_Expected();
    /* output is an array of char, so its capacity in elements is
       sizeof(output)/sizeof(char); dividing by sizeof(wchar_t) would
       under-report the buffer size. */
    size_t size = wchar_to_utf8( null_ptr, output, sizeof(output)/sizeof(char));

    EXPECT_EQ(0, size);
}
|
||||
|
||||
TEST(wchar_to_utf8, null_output ) {
    /* A NULL output pointer must not prevent wchar_to_utf8() from reporting
       the length of the UTF-8 array that would have been produced. */
    const size_t wanted_len = strlen("eⁱᵠ = cos(ϕ) + i*sin(ϕ)");
    const wchar_t* wide_in = L"eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
    char* no_output = (char*)0;

    size_t actual_len = wchar_to_utf8(wide_in, no_output, (size_t)0);

    EXPECT_EQ(wanted_len, actual_len);
}
|
||||
|
||||
TEST(wchar_to_utf8, test1) {
|
||||
/* Should convert wchar_t array to a UTF-8 array. */
|
||||
char resultant_utf8_s[128];
|
||||
const wchar_t* wide_s = L"eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
|
||||
const char* expected_utf8_s = "eⁱᵠ = cos(ϕ) + i*sin(ϕ)";
|
||||
@ -395,6 +457,29 @@ TEST(wchar_to_utf8, test1) {
|
||||
EXPECT_EQ(true, test_result);
|
||||
}
|
||||
|
||||
TEST(wchar_to_utf8, test2) {
    /* Same test as above, but input is a constrained array. A variant of this is used below. */
    char resultant_utf8_s[128];
    /* Only 23 of the 32 elements are initialized explicitly; the remainder
       are zero-initialized, which NUL-terminates the wide string. */
    wchar_t wide_s[32] = { L'e', L'ⁱ', L'ᵠ', L' ', L'=', L' ', L'c', L'o', L's', L'(', L'ϕ', L')',
                           L' ', L'+', L' ', L'i', L'*', L's', L'i', L'n', L'(', L'ϕ', L')' };
    const char* expected_utf8_s = "eⁱᵠ = cos(ϕ) + i*sin(ϕ)";

    wchar_to_utf8(wide_s, resultant_utf8_s, sizeof(resultant_utf8_s)/sizeof(char));

    /* EXPECT_STREQ compares the C-string contents and, unlike
       EXPECT_EQ(true, strcmp(...) == 0), prints both strings on failure. */
    EXPECT_STREQ(expected_utf8_s, resultant_utf8_s);
}
|
||||
|
||||
TEST(wchar_to_utf8, invalid_unicode) {
    /* An invalid Unicode code point planted in the input should be detected:
       wchar_to_utf8() is expected to emit an error message and return 0. */
    wchar_t bad_wide_s[32] = { L'e', L'ⁱ', L'ᵠ', L' ', L'=', L' ', L'c', L'o', L's', L'(', L'ϕ', L')',
                               L' ', L'+', L' ', L'i', L'*', L's', L'i', L'n', L'(', L'ϕ', L')' };
    /* Overwrite one element with 0x110000, the invalid value under test. */
    bad_wide_s[14] = (wchar_t)0x110000;
    char utf8_buf[128];

    Error_Message_Expected();
    size_t result_len = wchar_to_utf8(bad_wide_s, utf8_buf, sizeof(utf8_buf)/sizeof(char));

    EXPECT_EQ(0, result_len);
}
|
||||
|
||||
TEST(wchar_to_utf8, insufficient_result_array_size) {
|
||||
|
||||
char resultant_utf8_s[16];
|
||||
|
Loading…
x
Reference in New Issue
Block a user