genode/repos/base/include/util/token.h

/*
 * \brief  Tokenizer support
 * \author Norman Feske
 * \date   2006-05-19
 */

/*
 * Copyright (C) 2006-2017 Genode Labs GmbH
 *
 * This file is part of the Genode OS framework, which is distributed
 * under the terms of the GNU Affero General Public License version 3.
 */

#ifndef _INCLUDE__UTIL__TOKEN_H_
#define _INCLUDE__UTIL__TOKEN_H_

#include <util/string.h>

namespace Genode {

	/* token type, parametrized with the scanner policy to apply */
	template <typename SCANNER_POLICY> class Token;

	/* scanner policy that can be used as template argument of 'Token' */
	struct Scanner_policy_identifier_with_underline;
}


/**
 * Scanner policy that accepts underline characters in identifiers
 */
struct Genode::Scanner_policy_identifier_with_underline
{
	/**
	 * Decide whether a character may be part of an identifier
	 *
	 * \param c  character to examine
	 * \param i  position of the character within the token
	 * \return   true if 'c' is acceptable at position 'i'
	 *
	 * A letter or an underline character is accepted at any position.
	 * A digit is accepted at any position except the first one.
	 */
	static bool identifier_char(char c, unsigned i)
	{
		if (is_letter(c) || c == '_')
			return true;

		/* digits qualify only behind the first character */
		return i && is_digit(c);
	}
};


/**
 * Token
 *
 * This class groups the characters of a string that belong to one
 * syntactical token. The token types are number, identifier, string,
 * whitespace, and any other single character.
 *
 * \param SCANNER_POLICY  policy that defines the way of token scanning
 *
 * See 'Scanner_policy_identifier_with_underline' for an example scanner
 * policy.
 */
template <typename SCANNER_POLICY>
class Genode::Token
{
	public:

		/**
		 * Token types
		 *
		 * 'END' is reported for a null start pointer, an exhausted length
		 * budget, a null character, and for an incomplete quoted string.
		 */
		enum Type { SINGLECHAR, NUMBER, IDENT, STRING, WHITESPACE, END };

		/**
		 * Constructor
		 *
		 * \param s        start of string to construct a token from
		 * \param max_len  maximum token length
		 *
		 * The 'max_len' argument is useful for processing character arrays
		 * that are not null-terminated. If 's' is a null pointer, the
		 * token has a length of zero and is therefore invalid.
		 */
		Token(const char *s = 0, size_t max_len = ~0UL)
		: _start(s), _max_len(max_len), _len(s ? _calc_len(max_len) : 0) { }

		/**
		 * Accessors
		 *
		 * Note that 'start' hands out a non-const pointer to the scanned
		 * characters even though the token itself refers to them as const.
		 */
		char *start() const { return const_cast<char *>(_start); }
		size_t  len() const { return _len; }
		Type   type() const { return _type(_len); }

		/**
		 * Return token as null-terminated string
		 *
		 * \param dst      destination buffer
		 * \param max_len  size limit passed on to 'copy_cstring'
		 *
		 * The size given to 'copy_cstring' is the token length plus one,
		 * capped at 'max_len'. Presumably, the additional byte accounts
		 * for the terminating null character (to be confirmed against
		 * 'copy_cstring').
		 */
		void string(char *dst, size_t max_len) const {
			copy_cstring(dst, start(), min(len() + 1, max_len)); }

		/**
		 * Return true if token is valid
		 *
		 * A token is valid if it has a start pointer and a non-zero length.
		 */
		bool valid() const { return _start && _len; }

		operator bool () const { return valid(); }

		/**
		 * Access single characters of token
		 *
		 * \param idx  index of the character within the token
		 * \return     character at 'idx', or 0 if 'idx' is negative or
		 *             not smaller than the token length
		 */
		char operator [] (int idx) const
		{
			return ((idx >= 0) && ((unsigned)idx < _len)) ? _start[idx] : 0;
		}

		/**
		 * Return next token
		 *
		 * The next token starts right behind the last character of this
		 * token and inherits the remaining length budget.
		 */
		Token next() const { return Token(_start + _len, _max_len - _len); }

		/**
		 * Return next token after delimiter
		 *
		 * \param delim  null-terminated delimiter to search for, starting
		 *               at the beginning of this token
		 * \return       invalid token if this token is invalid, if the
		 *               delimiter is longer than the length budget, or if
		 *               the delimiter could not be found
		 *
		 * NOTE(review): on a match, the result is obtained via 'next()' on
		 * the token that starts at the delimiter position. Hence, the
		 * search result advances by the length of that token, not by the
		 * length of 'delim'. Confirm against the callers whether this is
		 * intended for multi-character delimiters.
		 */
		Token next_after(char const *delim) const
		{
			size_t const len = strlen(delim);

			if (!valid() || len > _max_len)
				return Token();

			char const *s = _start;
			for (size_t rest = _max_len; rest >= len; --rest, ++s) {

				if (strcmp(s, delim, len) == 0)
					return Token(s, rest).next();

				/*
				 * Stop at the end of the length budget or at the null
				 * terminator. Without this check, the search for a
				 * delimiter that is absent from a null-terminated string
				 * would continue reading behind the end of the string
				 * whenever '_max_len' exceeds the string length (as is
				 * the case for the default 'max_len').
				 */
				if (rest == 0 || !*s)
					break;
			}

			return Token();
		}

		/**
		 * Return true if token starts with pattern
		 *
		 * \param pattern  null-terminated pattern, compared against the
		 *                 characters at the start of the token
		 *
		 * The comparison is limited to the length of 'pattern', not to the
		 * length of the token. The result is false for an invalid token or
		 * if the pattern is longer than the length budget.
		 */
		bool matches(char const *pattern) const
		{
			size_t const len = strlen(pattern);

			if (!valid() || len > _max_len)
				return false;

			return strcmp(pattern, _start, len) == 0;
		}

		/**
		 * Return next non-whitespace token
		 *
		 * If this token is of type 'WHITESPACE', its successor is returned.
		 * Otherwise, the token itself is returned.
		 */
		Token eat_whitespace() const { return (_type(_len) == WHITESPACE) ? next() : *this; }

	private:

		/*
		 * The declaration order matters: '_len' is initialized by calling
		 * '_calc_len', which relies on an already initialized '_start'.
		 */
		const char *_start;
		size_t      _max_len;
		size_t      _len;

		/**
		 * Return type of token
		 *
		 * \param  max_len  maximum token length
		 *
		 * This method is used during the construction of 'Token'
		 * objects, in particular for determining the value of the '_len'
		 * member. Therefore, we explicitly pass the 'max_len' to the
		 * method. For the public interface, there exists the 'type()'
		 * accessor, which relies on '_len' as implicit argument.
		 */
		Type _type(size_t max_len) const
		{
			if (!_start || max_len < 1 || !*_start) return END;

			/* determine the type based on the first character */
			char c = *_start;
			if (SCANNER_POLICY::identifier_char(c, 0)) return IDENT;
			if (is_digit(c))                           return NUMBER;
			if (is_whitespace(c))                      return WHITESPACE;

			/* if string is incomplete, discard it (type END) */
			if (c == '"')
				return _quoted_string_len(max_len) ? STRING : END;

			return SINGLECHAR;
		}

		/**
		 * Return length of quoted string including both quotation marks
		 *
		 * \param  max_len  maximum token length
		 * \return          0 if the string is not terminated by a quotation
		 *                  mark within 'max_len' characters
		 */
		size_t _quoted_string_len(size_t max_len) const
		{
			/*
			 * The 'end_of_quote' function examines two 'char' values.
			 * Hence, the upper bound of the index is max_len - 2.
			 */
			size_t i = 0;
			for (; i + 1 < max_len && !end_of_quote(&_start[i]); i++)

				/* string ends without final quotation mark? too bad! */
				if (!_start[i]) return 0;

			/* exceeded maximum token length */
			if (i + 1 == max_len)
				return 0;

			/*
			 * We stopped our search at the character before the
			 * final quotation mark but we return the number of
			 * characters including the quotation marks.
			 */
			return i + 2;
		}

		/**
		 * Return length of token
		 *
		 * \param  max_len  maximum token length
		 *
		 * The length is returned as 'size_t' to match the type of the
		 * '_len' member and the 'max_len' argument, avoiding any narrowing
		 * of the counted number of characters.
		 */
		size_t _calc_len(size_t max_len) const
		{
			switch (_type(max_len)) {

			case SINGLECHAR:
				return 1;

			case NUMBER:
				{
					size_t i = 0;
					for (; i < max_len && is_digit(_start[i]); i++);
					return i;
				}

			case IDENT:
				{
					size_t i = 0;
					for (; i < max_len; i++) {

						/* the scanner policy expects the index as 'unsigned' */
						if (SCANNER_POLICY::identifier_char(_start[i],
						                                    static_cast<unsigned>(i)))
							continue;

						/* stop if any other (invalid) character occurs */
						break;
					}
					return i;
				}

			case STRING:

				return _quoted_string_len(max_len);

			case WHITESPACE:
				{
					/* check the bound before accessing the character */
					size_t i = 0;
					for (; i < max_len && is_whitespace(_start[i]); i++);
					return i;
				}

			case END:
			default:
				return 0;
			}
		}
};

#endif /* _INCLUDE__UTIL__TOKEN_H_ */
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00			`/*`
			`* \brief Tokenizer support`
			`* \author Norman Feske`
			`* \date 2006-05-19`
			`*/`

			`/*`
Adjust file headers to refer to the AGPLv3 2017-02-20 12:23:52 +00:00			`* Copyright (C) 2006-2017 Genode Labs GmbH`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00			`*`
			`* This file is part of the Genode OS framework, which is distributed`
Adjust file headers to refer to the AGPLv3 2017-02-20 12:23:52 +00:00			`* under the terms of the GNU Affero General Public License version 3.`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00			`*/`

			`#ifndef _INCLUDE__UTIL__TOKEN_H_`
			`#define _INCLUDE__UTIL__TOKEN_H_`

			`#include <util/string.h>`

			`namespace Genode {`

base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`struct Scanner_policy_identifier_with_underline;`
			`template <typename> class Token;`
			`}`


			`/**`
			`* Scanner policy that accepts underline characters in identifiers`
			`*/`
			`struct Genode::Scanner_policy_identifier_with_underline`
			`{`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00			`/**`
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`* Return true if character belongs to a valid identifier`
			`*`
			`* \param c character`
			`* \param i index of character in token`
			`* \return true if character is a valid identifier character`
			`*`
			`* Letters and underline characters are allowed anywhere in an`
			`* identifier, digits must not appear at the beginning.`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00			`*/`
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`static bool identifier_char(char c, unsigned i) {`
			`return is_letter(c) \|\| (c == '_') \|\| (i && is_digit(c)); }`
			`};`


			`/**`
			`* Token`
			`*`
			`* This class is used to group characters of a string which belong`
			`* to one syntactical token types number, identifier, string,`
			`* whitespace or another single character.`
			`*`
			`* \param SCANNER_POLICY policy that defines the way of token scanning`
			`*`
			`* See 'Scanner_policy_identifier_with_underline' for an example scanner`
			`* policy.`
			`*/`
			`template <typename SCANNER_POLICY>`
			`class Genode::Token`
			`{`
			`public:`

			`enum Type { SINGLECHAR, NUMBER, IDENT, STRING, WHITESPACE, END };`

Imported Genode release 11.11 2011-12-22 15:19:25 +00:00			`/**`
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`* Constructor`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00			`*`
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`* \param s start of string to construct a token from`
			`* \param max_len maximum token length`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00			`*`
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`* The 'max_len' argument is useful for processing character arrays`
			`* that are not null-terminated.`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00			`*/`
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`Token(const char *s = 0, size_t max_len = ~0UL)`
			`: _start(s), _max_len(max_len), _len(s ? _calc_len(max_len) : 0) { }`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`/**`
			`* Accessors`
			`*/`
			`char start() const { return (char )_start; }`
			`size_t len() const { return _len; }`
			`Type type() const { return _type(_len); }`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`/**`
			`* Return token as null-terminated string`
			`*/`
			`void string(char *dst, size_t max_len) const {`
Replace Genode::strncpy by Genode::copy_cstring - Since Genode::strncpy is not 100% compatible with the POSIX strncpy function, better use a distinct name. - Remove bogus return value from the function, easing the potential enforcement of mandatory return-value checks later. Fixes #3752 2020-05-11 14:10:27 +00:00			`copy_cstring(dst, start(), min(len() + 1, max_len)); }`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`/**`
			`* Return true if token is valid`
			`*/`
Improve tokenizing to support XML comments The whole XML comment has to be parsed as one XML tag to support strange but valid combinations like <!----> <!--invisible-tag></invisible-tag--> Fixes #1424 2016-05-28 21:57:18 +00:00			`bool valid() const { return _start && _len; }`

			`operator bool () const { return valid(); }`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`/**`
			`* Access single characters of token`
			`*/`
			`char operator [] (int idx)`
			`{`
			`return ((idx >= 0) && ((unsigned)idx < _len)) ? _start[idx] : 0;`
			`}`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`/**`
			`* Return next token`
			`*/`
			`Token next() const { return Token(_start + _len, _max_len - _len); }`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
Improve tokenizing to support XML comments The whole XML comment has to be parsed as one XML tag to support strange but valid combinations like <!----> <!--invisible-tag></invisible-tag--> Fixes #1424 2016-05-28 21:57:18 +00:00			`/**`
			`* Return next token after delimiter`
			`*/`
			`Token next_after(char const *delim)`
			`{`
			`size_t const len = strlen(delim);`

			`if (!valid() \|\| len > _max_len)`
			`return Token();`

			`char const *s = _start;`
			`for (size_t rest = _max_len; rest >= len; --rest, ++s)`
			`if (strcmp(s, delim, len) == 0)`
			`return Token(s, rest).next();`

			`return Token();`
			`}`

			`/**`
			`* Return true if token starts with pattern`
			`*/`
			`bool matches(char const *pattern)`
			`{`
			`size_t const len = strlen(pattern);`

			`if (!valid() \|\| len > _max_len)`
			`return false;`

			`return strcmp(pattern, _start, len) == 0;`
			`}`

base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`/**`
			`* Return next non-whitespace token`
			`*/`
			`Token eat_whitespace() const { return (_type(_len) == WHITESPACE) ? next() : *this; }`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`private:`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`const char *_start;`
			`size_t _max_len;`
			`size_t _len;`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`/**`
			`* Return type of token`
			`*`
			`* \param max_len maximum token length`
			`*`
Revised API documentation This patch curates the API documentation to become suitable for the functional specificaton, which is partially generated from the header files. 2015-03-20 16:50:41 +00:00			`* This method is used during the construction of 'Token'`
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`* objects, in particular for determining the value of the '_len'`
			`* member. Therefore, we explicitely pass the 'max_len' to the`
Revised API documentation This patch curates the API documentation to become suitable for the functional specificaton, which is partially generated from the header files. 2015-03-20 16:50:41 +00:00			`* method. For the public interface, there exists the 'type()'`
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`* accessor, which relies on '_len' as implicit argument.`
			`*/`
			`Type _type(size_t max_len) const`
			`{`
			`if (!_start \|\| max_len < 1 \|\| !*_start) return END;`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`/* determine the type based on the first character */`
			`char c = *_start;`
			`if (SCANNER_POLICY::identifier_char(c, 0)) return IDENT;`
			`if (is_digit(c)) return NUMBER;`
			`if (is_whitespace(c)) return WHITESPACE;`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`/* if string is incomplete, discard it (type END) */`
			`if (c == '"')`
			`return _quoted_string_len(max_len) ? STRING : END;`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`return SINGLECHAR;`
			`}`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`size_t _quoted_string_len(size_t max_len) const`
			`{`
util/token.h: fix possible out-of-bounds read The 'WHITESPACE' case of the _calc_len method wrongly accessed the character before checking upper bound of the token. The problem is fixed by switching the order of both conditions. Fixes #3756 2020-05-07 19:23:07 +00:00			`/*`
			`* The 'end_of_quote' function examines two 'char' values.`
			`* Hence, the upper bound of the index is max_len - 2.`
			`*/`
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`unsigned i = 0;`
util/token.h: fix possible out-of-bounds read The 'WHITESPACE' case of the _calc_len method wrongly accessed the character before checking upper bound of the token. The problem is fixed by switching the order of both conditions. Fixes #3756 2020-05-07 19:23:07 +00:00			`for (; i + 1 < max_len && !end_of_quote(&_start[i]); i++)`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`/* string ends without final quotation mark? too bad! */`
			`if (!_start[i]) return 0;`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`/* exceeded maximum token length */`
util/token.h: fix possible out-of-bounds read The 'WHITESPACE' case of the _calc_len method wrongly accessed the character before checking upper bound of the token. The problem is fixed by switching the order of both conditions. Fixes #3756 2020-05-07 19:23:07 +00:00			`if (i + 1 == max_len)`
			`return 0;`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`/*`
			`* We stopped our search at the character before the`
			`* final quotation mark but we return the number of`
			`* characters including the quotation marks.`
			`*/`
			`return i + 2;`
			`}`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`/**`
			`* Return length of token`
			`*/`
			`int _calc_len(size_t max_len) const`
			`{`
			`switch (_type(max_len)) {`

			`case SINGLECHAR:`
			`return 1;`

			`case NUMBER:`
			`{`
			`unsigned i = 0;`
			`for (; i < max_len && is_digit(_start[i]); i++);`
			`return i;`
			`}`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`case IDENT:`
			`{`
			`unsigned i = 0;`
			`for (; i < max_len; i++) {`
			`if (SCANNER_POLICY::identifier_char(_start[i], i))`
			`continue;`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`/* stop if any other (invalid) character occurs */`
			`break;`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00			`}`
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`return i;`
			`}`

			`case STRING:`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`return _quoted_string_len(max_len);`

			`case WHITESPACE:`
			`{`
			`unsigned i = 0;`
util/token.h: fix possible out-of-bounds read The 'WHITESPACE' case of the _calc_len method wrongly accessed the character before checking upper bound of the token. The problem is fixed by switching the order of both conditions. Fixes #3756 2020-05-07 19:23:07 +00:00			`for (; i < max_len && is_whitespace(_start[i]); i++);`
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`return i;`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00			`}`
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00
			`case END:`
			`default:`
			`return 0;`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00			`}`
base,os: Coding-style unification Fixes #1432 2015-03-04 20:12:14 +00:00			`}`
			`};`
Imported Genode release 11.11 2011-12-22 15:19:25 +00:00
			`#endif /* _INCLUDE__UTIL__TOKEN_H_ */`