/* * \brief Tokenizer support * \author Norman Feske * \date 2006-05-19 */ /* * Copyright (C) 2006-2013 Genode Labs GmbH * * This file is part of the Genode OS framework, which is distributed * under the terms of the GNU General Public License version 2. */ #ifndef _INCLUDE__UTIL__TOKEN_H_ #define _INCLUDE__UTIL__TOKEN_H_ #include <util/string.h> namespace Genode { /** * Scanner policy that accepts underline characters in identifiers */ struct Scanner_policy_identifier_with_underline { /** * Return true if character belongs to a valid identifier * * \param c character * \param i index of character in token * \return true if character is a valid identifier character * * Letters and underline characters are allowed anywhere in an * identifier, digits must not appear at the beginning. */ static bool identifier_char(char c, unsigned i) { return is_letter(c) || (c == '_') || (i && is_digit(c)); } }; /** * Token * * This class is used to group characters of a string which belong * to one syntactical token types number, identifier, string, * whitespace or another single character. * * \param SCANNER_POLICY policy that defines the way of token scanning * * See 'Scanner_policy_identifier_with_underline' for an example scanner * policy. */ template <typename SCANNER_POLICY> class Token { public: enum Type { SINGLECHAR, NUMBER, IDENT, STRING, WHITESPACE, END }; /** * Constructor * * \param s start of string to construct a token from * \param max_len maximum token length * * The 'max_len' argument is useful for processing character arrays * that are not null-terminated. */ Token(const char *s = 0, size_t max_len = ~0UL) : _start(s), _max_len(max_len), _len(s ? _calc_len(max_len) : 0) { } /** * Accessors */ char *start() const { return (char *)_start; } size_t len() const { return _len; } Type type() const { return _type(_len); } /** * Return token as null-terminated string */ void string(char *dst, size_t max_len) const { strncpy(dst, start(), min(len() + 1, max_len)); } /** * Return true if token is valid */ operator bool () const { return _start && _len; } /** * Access single characters of token */ char operator [] (int idx) { return ((idx >= 0) && ((unsigned)idx < _len)) ? _start[idx] : 0; } /** * Return next token */ Token next() const { return Token(_start + _len, _max_len - _len); } /** * Return next non-whitespace token */ Token eat_whitespace() const { return (_type(_len) == WHITESPACE) ? next() : *this; } private: const char *_start; size_t _max_len; size_t _len; /** * Return type of token * * \param max_len maximum token length * * This function is used during the construction of 'Token' * objects, in particular for determining the value of the '_len' * member. Therefore, we explicitely pass the 'max_len' to the * function. For the public interface, there exists the 'type()' * accessor, which relies on '_len' as implicit argument. */ Type _type(size_t max_len) const { if (!_start || max_len < 1 || !*_start) return END; /* determine the type based on the first character */ char c = *_start; if (SCANNER_POLICY::identifier_char(c, 0)) return IDENT; if (is_digit(c)) return NUMBER; if (is_whitespace(c)) return WHITESPACE; /* if string is incomplete, discard it (type END) */ if (c == '"') return _quoted_string_len(max_len) ? STRING : END; return SINGLECHAR; } size_t _quoted_string_len(size_t max_len) const { unsigned i = 0; for (; !end_of_quote(&_start[i]) && i < max_len; i++) /* string ends without final quotation mark? too bad! */ if (!_start[i]) return 0; /* exceeded maximum token length */ if (i == max_len) return 0; /* * We stopped our search at the character before the * final quotation mark but we return the number of * characters including the quotation marks. */ return i + 2; } /** * Return length of token */ int _calc_len(size_t max_len) const { switch (_type(max_len)) { case SINGLECHAR: return 1; case NUMBER: { unsigned i = 0; for (; i < max_len && is_digit(_start[i]); i++); return i; } case IDENT: { unsigned i = 0; for (; i < max_len; i++) { if (SCANNER_POLICY::identifier_char(_start[i], i)) continue; /* stop if any other (invalid) character occurs */ break; } return i; } case STRING: return _quoted_string_len(max_len); case WHITESPACE: { unsigned i = 0; for (; is_whitespace(_start[i]) && i < max_len; i++); return i; } case END: default: return 0; } } }; } #endif /* _INCLUDE__UTIL__TOKEN_H_ */