2011-12-22 15:19:25 +00:00
|
|
|
/*
|
|
|
|
* \brief Tokenizer support
|
|
|
|
* \author Norman Feske
|
|
|
|
* \date 2006-05-19
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
2017-02-20 12:23:52 +00:00
|
|
|
* Copyright (C) 2006-2017 Genode Labs GmbH
|
2011-12-22 15:19:25 +00:00
|
|
|
*
|
|
|
|
* This file is part of the Genode OS framework, which is distributed
|
2017-02-20 12:23:52 +00:00
|
|
|
* under the terms of the GNU Affero General Public License version 3.
|
2011-12-22 15:19:25 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _INCLUDE__UTIL__TOKEN_H_
|
|
|
|
#define _INCLUDE__UTIL__TOKEN_H_
|
|
|
|
|
|
|
|
#include <util/string.h>
|
|
|
|
|
|
|
|
/*
 * Forward declarations of the types defined in this header
 */
namespace Genode {

	struct Scanner_policy_identifier_with_underline;

	template <typename SCANNER_POLICY> class Token;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Scanner policy that accepts underline characters in identifiers
|
|
|
|
*/
|
|
|
|
struct Genode::Scanner_policy_identifier_with_underline
|
|
|
|
{
|
2011-12-22 15:19:25 +00:00
|
|
|
/**
|
2015-03-04 20:12:14 +00:00
|
|
|
* Return true if character belongs to a valid identifier
|
|
|
|
*
|
|
|
|
* \param c character
|
|
|
|
* \param i index of character in token
|
|
|
|
* \return true if character is a valid identifier character
|
|
|
|
*
|
|
|
|
* Letters and underline characters are allowed anywhere in an
|
|
|
|
* identifier, digits must not appear at the beginning.
|
2011-12-22 15:19:25 +00:00
|
|
|
*/
|
2015-03-04 20:12:14 +00:00
|
|
|
static bool identifier_char(char c, unsigned i) {
|
|
|
|
return is_letter(c) || (c == '_') || (i && is_digit(c)); }
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Token
|
|
|
|
*
|
|
|
|
* This class is used to group the characters of a string that belong
* to one syntactical token. A token is of type number, identifier,
* string, whitespace, or a single other character.
|
|
|
|
*
|
|
|
|
* \param SCANNER_POLICY policy that defines the way of token scanning
|
|
|
|
*
|
|
|
|
* See 'Scanner_policy_identifier_with_underline' for an example scanner
|
|
|
|
* policy.
|
|
|
|
*/
|
|
|
|
template <typename SCANNER_POLICY>
|
|
|
|
class Genode::Token
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
|
|
|
|
enum Type { SINGLECHAR, NUMBER, IDENT, STRING, WHITESPACE, END };
|
|
|
|
|
2011-12-22 15:19:25 +00:00
|
|
|
/**
|
2015-03-04 20:12:14 +00:00
|
|
|
* Constructor
|
2011-12-22 15:19:25 +00:00
|
|
|
*
|
2015-03-04 20:12:14 +00:00
|
|
|
* \param s start of string to construct a token from
|
|
|
|
* \param max_len maximum token length
|
2011-12-22 15:19:25 +00:00
|
|
|
*
|
2015-03-04 20:12:14 +00:00
|
|
|
* The 'max_len' argument is useful for processing character arrays
|
|
|
|
* that are not null-terminated.
|
2011-12-22 15:19:25 +00:00
|
|
|
*/
|
2015-03-04 20:12:14 +00:00
|
|
|
Token(const char *s = 0, size_t max_len = ~0UL)
|
|
|
|
: _start(s), _max_len(max_len), _len(s ? _calc_len(max_len) : 0) { }
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
/**
|
|
|
|
* Accessors
|
|
|
|
*/
|
|
|
|
char *start() const { return (char *)_start; }
|
|
|
|
size_t len() const { return _len; }
|
|
|
|
Type type() const { return _type(_len); }
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
/**
|
|
|
|
* Return token as null-terminated string
|
|
|
|
*/
|
|
|
|
void string(char *dst, size_t max_len) const {
|
2020-05-11 14:10:27 +00:00
|
|
|
copy_cstring(dst, start(), min(len() + 1, max_len)); }
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
/**
|
|
|
|
* Return true if token is valid
|
|
|
|
*/
|
2016-05-28 21:57:18 +00:00
|
|
|
bool valid() const { return _start && _len; }
|
|
|
|
|
|
|
|
operator bool () const { return valid(); }
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
/**
|
|
|
|
* Access single characters of token
|
|
|
|
*/
|
|
|
|
char operator [] (int idx)
|
|
|
|
{
|
|
|
|
return ((idx >= 0) && ((unsigned)idx < _len)) ? _start[idx] : 0;
|
|
|
|
}
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
/**
|
|
|
|
* Return next token
|
|
|
|
*/
|
|
|
|
Token next() const { return Token(_start + _len, _max_len - _len); }
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2016-05-28 21:57:18 +00:00
|
|
|
/**
|
|
|
|
* Return next token after delimiter
|
|
|
|
*/
|
|
|
|
Token next_after(char const *delim)
|
|
|
|
{
|
|
|
|
size_t const len = strlen(delim);
|
|
|
|
|
|
|
|
if (!valid() || len > _max_len)
|
|
|
|
return Token();
|
|
|
|
|
|
|
|
char const *s = _start;
|
|
|
|
for (size_t rest = _max_len; rest >= len; --rest, ++s)
|
|
|
|
if (strcmp(s, delim, len) == 0)
|
|
|
|
return Token(s, rest).next();
|
|
|
|
|
|
|
|
return Token();
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return true if token starts with pattern
|
|
|
|
*/
|
|
|
|
bool matches(char const *pattern)
|
|
|
|
{
|
|
|
|
size_t const len = strlen(pattern);
|
|
|
|
|
|
|
|
if (!valid() || len > _max_len)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return strcmp(pattern, _start, len) == 0;
|
|
|
|
}
|
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
/**
|
|
|
|
* Return next non-whitespace token
|
|
|
|
*/
|
|
|
|
Token eat_whitespace() const { return (_type(_len) == WHITESPACE) ? next() : *this; }
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
private:
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
const char *_start;
|
|
|
|
size_t _max_len;
|
|
|
|
size_t _len;
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
/**
|
|
|
|
* Return type of token
|
|
|
|
*
|
|
|
|
* \param max_len maximum token length
|
|
|
|
*
|
2015-03-20 16:50:41 +00:00
|
|
|
* This method is used during the construction of 'Token'
|
2015-03-04 20:12:14 +00:00
|
|
|
* objects, in particular for determining the value of the '_len'
|
|
|
|
* member. Therefore, we explicitely pass the 'max_len' to the
|
2015-03-20 16:50:41 +00:00
|
|
|
* method. For the public interface, there exists the 'type()'
|
2015-03-04 20:12:14 +00:00
|
|
|
* accessor, which relies on '_len' as implicit argument.
|
|
|
|
*/
|
|
|
|
Type _type(size_t max_len) const
|
|
|
|
{
|
|
|
|
if (!_start || max_len < 1 || !*_start) return END;
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
/* determine the type based on the first character */
|
|
|
|
char c = *_start;
|
|
|
|
if (SCANNER_POLICY::identifier_char(c, 0)) return IDENT;
|
|
|
|
if (is_digit(c)) return NUMBER;
|
|
|
|
if (is_whitespace(c)) return WHITESPACE;
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
/* if string is incomplete, discard it (type END) */
|
|
|
|
if (c == '"')
|
|
|
|
return _quoted_string_len(max_len) ? STRING : END;
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
return SINGLECHAR;
|
|
|
|
}
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
size_t _quoted_string_len(size_t max_len) const
|
|
|
|
{
|
2020-05-07 19:23:07 +00:00
|
|
|
/*
|
|
|
|
* The 'end_of_quote' function examines two 'char' values.
|
|
|
|
* Hence, the upper bound of the index is max_len - 2.
|
|
|
|
*/
|
2015-03-04 20:12:14 +00:00
|
|
|
unsigned i = 0;
|
2020-05-07 19:23:07 +00:00
|
|
|
for (; i + 1 < max_len && !end_of_quote(&_start[i]); i++)
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
/* string ends without final quotation mark? too bad! */
|
|
|
|
if (!_start[i]) return 0;
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
/* exceeded maximum token length */
|
2020-05-07 19:23:07 +00:00
|
|
|
if (i + 1 == max_len)
|
|
|
|
return 0;
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
/*
|
|
|
|
* We stopped our search at the character before the
|
|
|
|
* final quotation mark but we return the number of
|
|
|
|
* characters including the quotation marks.
|
|
|
|
*/
|
|
|
|
return i + 2;
|
|
|
|
}
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
/**
|
|
|
|
* Return length of token
|
|
|
|
*/
|
|
|
|
int _calc_len(size_t max_len) const
|
|
|
|
{
|
|
|
|
switch (_type(max_len)) {
|
|
|
|
|
|
|
|
case SINGLECHAR:
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
case NUMBER:
|
|
|
|
{
|
|
|
|
unsigned i = 0;
|
|
|
|
for (; i < max_len && is_digit(_start[i]); i++);
|
|
|
|
return i;
|
|
|
|
}
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
case IDENT:
|
|
|
|
{
|
|
|
|
unsigned i = 0;
|
|
|
|
for (; i < max_len; i++) {
|
|
|
|
if (SCANNER_POLICY::identifier_char(_start[i], i))
|
|
|
|
continue;
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
/* stop if any other (invalid) character occurs */
|
|
|
|
break;
|
2011-12-22 15:19:25 +00:00
|
|
|
}
|
2015-03-04 20:12:14 +00:00
|
|
|
return i;
|
|
|
|
}
|
|
|
|
|
|
|
|
case STRING:
|
2011-12-22 15:19:25 +00:00
|
|
|
|
2015-03-04 20:12:14 +00:00
|
|
|
return _quoted_string_len(max_len);
|
|
|
|
|
|
|
|
case WHITESPACE:
|
|
|
|
{
|
|
|
|
unsigned i = 0;
|
2020-05-07 19:23:07 +00:00
|
|
|
for (; i < max_len && is_whitespace(_start[i]); i++);
|
2015-03-04 20:12:14 +00:00
|
|
|
return i;
|
2011-12-22 15:19:25 +00:00
|
|
|
}
|
2015-03-04 20:12:14 +00:00
|
|
|
|
|
|
|
case END:
|
|
|
|
default:
|
|
|
|
return 0;
|
2011-12-22 15:19:25 +00:00
|
|
|
}
|
2015-03-04 20:12:14 +00:00
|
|
|
}
|
|
|
|
};
|
2011-12-22 15:19:25 +00:00
|
|
|
|
|
|
|
#endif /* _INCLUDE__UTIL__TOKEN_H_ */
|