genode/repos/os/include/util/utf8.h

/*
 * \brief  Unicode codepoint type and UTF-8 decoder
 * \author Norman Feske
 * \date   2018-03-15
 */

/*
 * Copyright (C) 2018-2019 Genode Labs GmbH
 *
 * This file is part of the Genode OS framework, which is distributed
 * under the terms of the GNU Affero General Public License version 3.
 */

#ifndef _INCLUDE__OS__UTIL__UTF8_H_
#define _INCLUDE__OS__UTIL__UTF8_H_

#include <base/output.h>
#include <base/stdint.h>

namespace Genode {
	struct Codepoint;
	class  Utf8_ptr;
}


struct Genode::Codepoint
{
	/* marker value denoting the absence of a usable codepoint */
	static constexpr uint32_t INVALID = 0xfffe;

	uint32_t value;

	/**
	 * Return true unless the codepoint carries the 'INVALID' marker
	 */
	bool valid() const { return value != INVALID; }

	/**
	 * Write the codepoint as UTF-8 byte sequence to 'output'
	 *
	 * Values of 0x110000 and above produce no output.
	 */
	void print(Output &output) const
	{
		/*
		 * Emit one byte that consists of 'n' bits of the value taken 'at'
		 * the given bit position, combined with the 'marker' prefix bits
		 */
		auto emit = [&] (unsigned at, unsigned n, unsigned marker) {
			unsigned const mask = (1u << n) - 1;
			output.out_char((char)(((value >> at) & mask) | marker));
		};

		/* tail bytes carry 6 value bits behind the '10' prefix */
		auto emit_tail = [&] (unsigned at) { emit(at, 6, 0x80); };

		/* up to 7 bits, single byte without prefix */
		if (value < 0x80) {
			emit(0, 7, 0);
			return;
		}

		/* up to 11 bits, lead byte '110xxxxx' plus one tail byte */
		if (value < 0x800) {
			emit(6, 5, 0xc0);
			emit_tail(0);
			return;
		}

		/* up to 16 bits, lead byte '1110xxxx' plus two tail bytes */
		if (value < 0x10000) {
			emit(12, 4, 0xe0);
			emit_tail(6);
			emit_tail(0);
			return;
		}

		/* below 0x110000, lead byte '11110xxx' plus three tail bytes */
		if (value < 0x110000) {
			emit(18, 3, 0xf0);
			emit_tail(12);
			emit_tail(6);
			emit_tail(0);
		}
	}
};


/**
 * Wrapper around a 'char const' pointer that is able to iterate over UTF-8
 * characters
 *
 * Note that this class is not a smart pointer. It is suffixed with '_ptr' to
 * highlight the fact that it stores a pointer while being copyable. Hence,
 * objects of this type must be handled with the same caution as pointers.
 */
class Genode::Utf8_ptr
{
	private:

		/*
		 * Start of the current UTF-8 sequence, or nullptr
		 *
		 * The pointer itself is deliberately not const-qualified. The
		 * assignment operator has to retarget it, and writing to a
		 * const-declared member through a 'const_cast' is undefined
		 * behavior. The pointed-to bytes remain read-only.
		 */
		uint8_t const *_utf8;

		/**
		 * Return true if byte is a tail character of an UTF-8 sequence
		 */
		static bool _tail_char(uint8_t c) { return (c & 0xc0) == 0x80; }

		/**
		 * Return expected number of bytes following the 'c1' start of an
		 * UTF-8 sequence
		 *
		 * The result is 0 for 7-bit characters, for bytes that have bit 6
		 * cleared (tail characters), and for bytes that have all of the
		 * bits 7..3 set.
		 */
		static unsigned _tail_length(uint8_t c1)
		{
			if (c1 < 128)
				return 0;

			/* bit 7 is known to be set, count the next set bits */
			for (unsigned i = 0; i < 4; i++)
				if ((c1 & (1 << (6 - i))) == 0)
					return i;

			return 0;
		}

		/**
		 * Consume trailing bytes of UTF-8 sequence of length 'n'
		 *
		 * \param c1    character bits of the initial UTF-8 byte
		 * \param utf8  first byte following the initial byte
		 * \param n     number of tail bytes to consume
		 *
		 * \return  decoded codepoint, or 'Codepoint::INVALID' if the string
		 *          ends early, a byte is not a tail character, the encoding
		 *          is overlong, or the value is an illegal codepoint
		 */
		static Codepoint _decode_tail(uint32_t c1, uint8_t const *utf8, unsigned n)
		{
			uint32_t value = c1;

			for (unsigned i = 0; i < n; i++, utf8++) {

				/* detect premature end of string or end of UTF-8 sequence */
				uint8_t const c = *utf8;
				if (!c || !_tail_char(c))
					return Codepoint { Codepoint::INVALID };

				/* each tail byte contributes its lower 6 bits */
				value = (value << 6) | (c & 0x3f);
			}

			/* reject overlong sequences */
			bool const overlong = ((n > 0 && value <    0x80)
			                    || (n > 1 && value <   0x800)
			                    || (n > 2 && value < 0x10000));

			/* conflict with UTF-16 surrogate halves or reserved codepoints */
			bool const illegal = (n > 1) && ((value >= 0xd800 && value <= 0xdfff)
			                              || (value >= 0xfdd0 && value <= 0xfdef)
			                              || (value == 0xfffe)
			                              || (value >  0x10ffff));

			bool const valid = !overlong && !illegal;

			return Codepoint { valid ? value : Codepoint::INVALID };
		}

		/**
		 * Return true if the pointer is null or refers to the null termination
		 */
		bool _end() const { return !_utf8 || !*_utf8; }

		/**
		 * Scan for the null termination of '_utf8'
		 *
		 * \param max  maximum number of bytes to scan
		 * \return     number of present bytes, up to 'max'
		 */
		unsigned _bytes_present(unsigned max) const
		{
			for (unsigned i = 0; i < max; i++)
				if (!_utf8[i])
					return i;

			return max;
		}

	public:

		/**
		 * Constructor
		 *
		 * \param utf8  null-terminated buffer containing UTF-8-encoded text
		 */
		Utf8_ptr(char const *utf8) : _utf8((uint8_t const *)utf8) { }

		Utf8_ptr(Utf8_ptr const &other) : _utf8(other._utf8) { }

		/**
		 * Let the pointer refer to the same position as 'other'
		 */
		Utf8_ptr &operator = (Utf8_ptr const &other)
		{
			_utf8 = other._utf8;
			return *this;
		}

		/**
		 * Return next UTF-8 character
		 *
		 * At the end of the string, the returned object wraps a null
		 * pointer. Otherwise, the initial byte and its expected tail bytes
		 * are skipped, stopping early at the first byte that is no tail
		 * character.
		 */
		Utf8_ptr const next() const
		{
			if (_end()) return Utf8_ptr(nullptr);

			unsigned        const tail_length = _tail_length(_utf8[0]);
			uint8_t const * const tail        = _utf8 + 1;

			for (unsigned i = 0; i < tail_length; i++)
				if (!_tail_char(tail[i]))
					return Utf8_ptr((char const *)tail + i);

			return Utf8_ptr((char const *)tail + tail_length);
		}

		/**
		 * Return true if string contains a complete UTF-8 sequence
		 *
		 * This method solely checks for a premature truncation of the string.
		 * It does not check the validity of the UTF-8 sequence. The success of
		 * 'complete' method is a precondition for the correct operation of the
		 * 'next' or 'codepoint' methods. A complete sequence may still yield
		 * an invalid 'Codepoint'.
		 */
		bool complete() const
		{
			if (_end()) return false;

			unsigned const expected_length = _tail_length(_utf8[0]) + 1;

			return expected_length == _bytes_present(expected_length);
		}

		/**
		 * Return character as Unicode codepoint
		 *
		 * A null pointer, as returned by 'next' at the end of the string,
		 * yields 'Codepoint::INVALID' instead of being dereferenced.
		 */
		Codepoint codepoint() const
		{
			if (!_utf8)
				return Codepoint { Codepoint::INVALID };

			uint8_t const *s = _utf8;
			uint8_t const c1 = *s++;

			/* dispatch on the prefix bits of the initial byte */
			if ((c1 & 0x80) == 0)    return Codepoint { c1 };
			if ((c1 & 0xe0) == 0xc0) return _decode_tail(c1 & 0x1f, s, 1);
			if ((c1 & 0xf0) == 0xe0) return _decode_tail(c1 & 0x0f, s, 2);
			if ((c1 & 0xf8) == 0xf0) return _decode_tail(c1 & 0x07, s, 3);

			/* stray tail character or unsupported initial byte */
			return Codepoint { Codepoint::INVALID };
		}

		/**
		 * Return length of UTF-8 sequence in bytes
		 *
		 * The result is 0 at the end of the string. It is smaller than the
		 * expected sequence length if the string is truncated.
		 */
		unsigned length() const
		{
			return _end() ? 0 : _bytes_present(1 + _tail_length(_utf8[0]));
		}
};

#endif /* _INCLUDE__OS__UTIL__UTF8_H_ */
os: util/utf8.h for UTF-8 string handling This patch adds a simple UTF-8 decoder at 'os/include/util/utf8.h' along with a test at 'os/run/utf8.run'. Fixes #2717, related to issue #2716 2018-03-15 13:12:04 +00:00			`/*`
			`* \brief Unicode codepoint type and UTF-8 decoder`
			`* \author Norman Feske`
			`* \date 2018-03-15`
			`*/`

			`/*`
Refactor terminal for intrinsic Unicode support Refactor the graphical terminal server to internally represent characters as 16-bit codepoints and handle the duplex terminal stream as UTF-8. - Make the Codepoint class printable to the Output interface - Decode data received at the Terminal session from UTF-8 to a 16-bit character - Pass 16-bit characters through terminal decoder and char-cell arrays - Send Unicode through terminal session in a burst of UTF-8 bytes Fix #3148 2019-02-08 00:26:21 +00:00			`* Copyright (C) 2018-2019 Genode Labs GmbH`
os: util/utf8.h for UTF-8 string handling This patch adds a simple UTF-8 decoder at 'os/include/util/utf8.h' along with a test at 'os/run/utf8.run'. Fixes #2717, related to issue #2716 2018-03-15 13:12:04 +00:00			`*`
			`* This file is part of the Genode OS framework, which is distributed`
			`* under the terms of the GNU Affero General Public License version 3.`
			`*/`

			`#ifndef _INCLUDE__OS__UTIL__UTF8_H_`
			`#define _INCLUDE__OS__UTIL__UTF8_H_`

Refactor terminal for intrinsic Unicode support Refactor the graphical terminal server to internally represent characters as 16-bit codepoints and handle the duplex terminal stream as UTF-8. - Make the Codepoint class printable to the Output interface - Decode data received at the Terminal session from UTF-8 to a 16-bit character - Pass 16-bit characters through terminal decoder and char-cell arrays - Send Unicode through terminal session in a burst of UTF-8 bytes Fix #3148 2019-02-08 00:26:21 +00:00			`#include <base/output.h>`
os: util/utf8.h for UTF-8 string handling This patch adds a simple UTF-8 decoder at 'os/include/util/utf8.h' along with a test at 'os/run/utf8.run'. Fixes #2717, related to issue #2716 2018-03-15 13:12:04 +00:00			`#include <base/stdint.h>`

			`namespace Genode {`
			`struct Codepoint;`
			`class Utf8_ptr;`
			`}`


			`struct Genode::Codepoint`
			`{`
utf8: non-character U+fffe as invalid codepoint Unicode non-characters [1] are guaranteed to never be used for a character. The formerly used U+fffd however is a valid character - the replacement character [2] correctly displayed by Qt5 as <?>. [1] https://en.wikipedia.org/wiki/Universal_Character_Set_characters#Non-characters [2] https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character Issue #3483 2019-08-23 10:08:58 +00:00			`static constexpr uint32_t INVALID = 0xfffe;`
os: util/utf8.h for UTF-8 string handling This patch adds a simple UTF-8 decoder at 'os/include/util/utf8.h' along with a test at 'os/run/utf8.run'. Fixes #2717, related to issue #2716 2018-03-15 13:12:04 +00:00
			`uint32_t value;`

			`bool valid() const { return value != INVALID; }`
Refactor terminal for intrinsic Unicode support Refactor the graphical terminal server to internally represent characters as 16-bit codepoints and handle the duplex terminal stream as UTF-8. - Make the Codepoint class printable to the Output interface - Decode data received at the Terminal session from UTF-8 to a 16-bit character - Pass 16-bit characters through terminal decoder and char-cell arrays - Send Unicode through terminal session in a burst of UTF-8 bytes Fix #3148 2019-02-08 00:26:21 +00:00
			`void print(Output &output) const`
			`{`
			`/* extract 'n' bits 'at' bit position of value */`
			`auto bits = [&] (unsigned at, unsigned n) {`
			`return (char)((value >> at) & ((1 << n) - 1)); };`

			`if (value < 1<<7) {`
			`output.out_char(bits( 0, 7));`
			`} else`
			`if (value < 1<<11) {`
os: avoid implicit conversions Issue #23 2021-12-02 10:23:38 +00:00			`output.out_char(bits( 6, 5) \| (char)0xc0);`
			`output.out_char(bits( 0, 6) \| (char)0x80);`
Refactor terminal for intrinsic Unicode support Refactor the graphical terminal server to internally represent characters as 16-bit codepoints and handle the duplex terminal stream as UTF-8. - Make the Codepoint class printable to the Output interface - Decode data received at the Terminal session from UTF-8 to a 16-bit character - Pass 16-bit characters through terminal decoder and char-cell arrays - Send Unicode through terminal session in a burst of UTF-8 bytes Fix #3148 2019-02-08 00:26:21 +00:00			`} else`
			`if (value < 1<<16) {`
os: avoid implicit conversions Issue #23 2021-12-02 10:23:38 +00:00			`output.out_char(bits(12, 4) \| (char)0xe0);`
			`output.out_char(bits( 6, 6) \| (char)0x80);`
			`output.out_char(bits( 0, 6) \| (char)0x80);`
Refactor terminal for intrinsic Unicode support Refactor the graphical terminal server to internally represent characters as 16-bit codepoints and handle the duplex terminal stream as UTF-8. - Make the Codepoint class printable to the Output interface - Decode data received at the Terminal session from UTF-8 to a 16-bit character - Pass 16-bit characters through terminal decoder and char-cell arrays - Send Unicode through terminal session in a burst of UTF-8 bytes Fix #3148 2019-02-08 00:26:21 +00:00			`} else`
			`if (value < 0x11<<16) {`
os: avoid implicit conversions Issue #23 2021-12-02 10:23:38 +00:00			`output.out_char(bits(18, 3) \| (char)0xf0);`
			`output.out_char(bits(12, 6) \| (char)0x80);`
			`output.out_char(bits( 6, 6) \| (char)0x80);`
			`output.out_char(bits( 0, 6) \| (char)0x80);`
Refactor terminal for intrinsic Unicode support Refactor the graphical terminal server to internally represent characters as 16-bit codepoints and handle the duplex terminal stream as UTF-8. - Make the Codepoint class printable to the Output interface - Decode data received at the Terminal session from UTF-8 to a 16-bit character - Pass 16-bit characters through terminal decoder and char-cell arrays - Send Unicode through terminal session in a burst of UTF-8 bytes Fix #3148 2019-02-08 00:26:21 +00:00			`}`
			`}`
os: util/utf8.h for UTF-8 string handling This patch adds a simple UTF-8 decoder at 'os/include/util/utf8.h' along with a test at 'os/run/utf8.run'. Fixes #2717, related to issue #2716 2018-03-15 13:12:04 +00:00			`};`


			`/**`
			`* Wrapper around a 'char const' pointer that is able to iterate over UTF-8`
			`* characters`
			`*`
			`* Note that this class is not a smart pointer. It is suffixed with '_ptr' to`
			`* highlight the fact that it stores a pointer while being copyable. Hence,`
			`* objects of this type must be handled with the same caution as pointers.`
			`*/`
			`class Genode::Utf8_ptr`
			`{`
			`private:`

			`uint8_t const * const _utf8;`

			`/**`
			`* Return true if byte is a tail character of an UTF-8 sequence`
			`*/`
			`static bool _tail_char(uint8_t c) { return (c & 0xc0) == 0x80; }`

			`/**`
			`* Return expected number of bytes following the 'c1' start of an`
			`* UTF-8 sequence`
			`*/`
			`static unsigned _tail_length(uint8_t c1)`
			`{`
			`if (c1 < 128)`
			`return 0;`

			`/* bit 7 is known to be set, count the next set bits */`
			`for (unsigned i = 0; i < 4; i++)`
			`if ((c1 & (1 << (6 - i))) == 0)`
			`return i;`

			`return 0;`
			`}`

			`/**`
			`* Consume trailing bytes of UTF-8 sequence of length 'n'`
			`*`
			`* \param c1 character bits of the initial UTF-8 byte`
			`*/`
			`static Codepoint _decode_tail(uint32_t c1, uint8_t const *utf8, unsigned n)`
			`{`
			`uint32_t value = c1;`

			`for (unsigned i = 0; i < n; i++, utf8++) {`

			`/* detect premature end of string or end of UTF-8 sequence */`
			`uint8_t const c = *utf8;`
			`if (!c \|\| !_tail_char(c))`
			`return Codepoint { Codepoint::INVALID };`

			`value = (value << 6) \| (c & 0x3f);`
			`}`

			`/* reject overlong sequences */`
			`bool const overlong = ((n > 0 && value < 0x80)`
			`\|\| (n > 1 && value < 0x800)`
			`\|\| (n > 2 && value < 0x10000));`

			`/* conflict with UTF-16 surrogate halves or reserved codepoints */`
			`bool const illegal = (n > 1) && ((value >= 0xd800 && value <= 0xdfff)`
			`\|\| (value >= 0xfdd0 && value <= 0xfdef)`
			`\|\| (value == 0xfffe)`
			`\|\| (value > 0x10ffff));`

			`bool const valid = !overlong && !illegal;`

			`return Codepoint { valid ? value : Codepoint::INVALID };`
			`}`

			`bool _end() const { return !_utf8 \|\| !*_utf8; }`

			`/**`
			`* Scan for the null termination of '_utf8'`
			`*`
			`* \param max maximum number of bytes to scan`
			`* \return number of present bytes, up to 'max'`
			`*/`
			`unsigned _bytes_present(unsigned max) const`
			`{`
			`for (unsigned i = 0; i < max; i++)`
			`if (!_utf8[i])`
			`return i;`

			`return max;`
			`}`

			`public:`

			`/**`
			`* Constructor`
			`*`
			`* \param utf8 null-terminated buffer containing UTF-8-encoded text`
			`*/`
			`Utf8_ptr(char const utf8) : _utf8((uint8_t const )utf8) { }`

			`Utf8_ptr(Utf8_ptr const &other) : _utf8(other._utf8) { }`

			`Utf8_ptr &operator = (Utf8_ptr const &other)`
			`{`
			`const_cast<uint8_t const *&>(_utf8) = other._utf8;`
			`return *this;`
			`}`

			`/**`
			`* Return next UTF-8 character`
			`*/`
			`Utf8_ptr const next() const`
			`{`
			`if (_end()) return Utf8_ptr(nullptr);`

			`unsigned const tail_length = _tail_length(_utf8[0]);`
			`uint8_t const * const tail = _utf8 + 1;`

			`for (unsigned i = 0; i < tail_length; i++)`
			`if (!_tail_char(tail[i]))`
			`return Utf8_ptr((char const *)tail + i);`

			`return Utf8_ptr((char const *)tail + tail_length);`
			`}`

			`/**`
			`* Return true if string contains a complete UTF-8 sequence`
			`*`
			`* This method solely checks for a premature truncation of the string.`
			`* It does not check the validity of the UTF-8 sequence. The success of`
			`* 'complete' method is a precondition for the correct operation of the`
			`* 'next' or 'codepoint' methods. A complete sequence may still yield`
			`* an invalid 'Codepoint'.`
			`*/`
			`bool complete() const`
			`{`
			`if (_end()) return false;`

			`unsigned const expected_length = _tail_length(_utf8[0]) + 1;`

			`return expected_length == _bytes_present(expected_length);`
			`}`

			`/**`
			`* Return character as Unicode codepoint`
			`*/`
			`Codepoint codepoint() const`
			`{`
			`uint8_t const *s = _utf8;`
			`uint8_t const c1 = *s++;`

			`if ((c1 & 0x80) == 0) return Codepoint { c1 };`
			`if ((c1 & 0xe0) == 0xc0) return _decode_tail(c1 & 0x1f, s, 1);`
			`if ((c1 & 0xf0) == 0xe0) return _decode_tail(c1 & 0x0f, s, 2);`
			`if ((c1 & 0xf8) == 0xf0) return _decode_tail(c1 & 0x07, s, 3);`

			`return Codepoint { Codepoint::INVALID };`
			`}`

			`/**`
			`* Return length of UTF-8 sequence in bytes`
			`*/`
			`unsigned length() const`
			`{`
			`return _end() ? 0 : _bytes_present(1 + _tail_length(_utf8[0]));`
			`}`
			`};`

			`#endif /* _INCLUDE__OS__UTIL__UTF8_H_ */`