genode/repos/base/include/util/xml_node.h

/*
 * \brief  XML parser
 * \author Norman Feske
 * \date   2007-08-21
 */

/*
 * Copyright (C) 2007-2013 Genode Labs GmbH
 *
 * This file is part of the Genode OS framework, which is distributed
 * under the terms of the GNU General Public License version 2.
 */

#ifndef _INCLUDE__UTIL__XML_NODE_H_
#define _INCLUDE__UTIL__XML_NODE_H_

#include <util/token.h>
#include <base/exception.h>

/*
 * Forward declarations of the two classes defined in this header
 */
namespace Genode {
	class Xml_attribute;
	class Xml_node;
}


/**
 * Representation of an XML-node attribute
 *
 * An attribute has the form 'name="value"'.
 */
class Genode::Xml_attribute
{
	private:

		/**
		 * Scanner policy that accepts hyphens in identifiers
		 *
		 * Letters, '_', and ':' are always accepted. Hyphens, dots, and
		 * digits are accepted only if the index 'i' is non-zero.
		 */
		struct Scanner_policy_xml_identifier {
			static bool identifier_char(char c, unsigned i) {
				return is_letter(c) || c == '_' || c == ':'
				    || (i && (c == '-' || c == '.' || is_digit(c))); } };

		/**
		 * Define tokenizer that matches XML tags (with hyphens) as identifiers
		 */
		typedef ::Genode::Token<Scanner_policy_xml_identifier> Token;

		Token _name;   /* identifier token left of the '=' character */
		Token _value;  /* string token (including both quotes)       */

		friend class Xml_node;

		/*
		 * Even though 'Tag' is part of 'Xml_node', the friendship
		 * to 'Xml_node' does not apply for 'Tag' when compiling
		 * the code with 'gcc-3.4'. Hence, we need to add an
		 * explicit friendship to 'Tag'.
		 */
		friend class Tag;

		/**
		 * Constructor
		 *
		 * \param t  first token of the attribute declaration, leading
		 *           whitespace is skipped
		 *
		 * \throw Nonexistent_attribute  the token sequence does not start
		 *                               with an identifier
		 * \throw Invalid_syntax         the identifier is not followed by
		 *                               '=' and a string token
		 *
		 * This constructor is meant to be used implicitly to construct
		 * an 'Xml_attribute' from a token sequence via an assignment
		 * from the leading 'Token'.
		 */
		Xml_attribute(Token t) :
			_name(t.eat_whitespace()), _value(_name.next().next())
		{
			if (_name.type() != Token::IDENT)
				throw Nonexistent_attribute();

			if (_name.next()[0] != '=' || _value.type() != Token::STRING)
				throw Invalid_syntax();
		}

		/**
		 * Return token following the attribute declaration
		 *
		 * The declaration consists of three tokens (name, '=', value),
		 * which are skipped here.
		 */
		Token _next() const { return _name.next().next().next(); }

	public:

		/*********************
		 ** Exception types **
		 *********************/

		class Invalid_syntax        : public Exception { };
		class Nonexistent_attribute : public Exception { };


		/**
		 * Return attribute type as null-terminated string
		 *
		 * \param dst      destination buffer
		 * \param max_len  size of the destination buffer
		 */
		void type(char *dst, size_t max_len) const
		{
			/*
			 * Limit number of characters by token length, take
			 * null-termination into account.
			 */
			max_len = min(max_len, _name.len() + 1);
			strncpy(dst, _name.start(), max_len);
		}

		/**
		 * Return attribute name as 'Genode::String'
		 */
		typedef String<64> Name;
		Name name() const {
			return Name(Cstring(_name.start(), _name.len())); }

		/**
		 * Return true if attribute has specified type
		 *
		 * The comparison covers the whole attribute name, a mere prefix
		 * match does not count.
		 */
		bool has_type(const char *type) const {
			return strlen(type) == _name.len() &&
			       strcmp(type, _name.start(), _name.len()) == 0; }

		/**
		 * Return size of value, not counting the surrounding quotes
		 */
		size_t      value_size() const { return _value.len() - 2; }

		/**
		 * Return pointer to the first character after the opening quote
		 *
		 * The returned string is not null-terminated. Its length is
		 * given by 'value_size()'.
		 */
		char const *value_base() const { return _value.start() + 1; }

		/**
		 * Return attribute value as null-terminated string
		 *
		 * \param dst      destination buffer
		 * \param max_len  size of the destination buffer
		 */
		void value(char *dst, size_t max_len) const
		{
			/*
			 * The value of 'max_len' denotes the maximum number of
			 * characters to be written to 'dst' including the null
			 * termination. From the quoted value string, we strip
			 * both quote characters and add a null-termination
			 * character.
			 */
			max_len = min(max_len, _value.len() - 2 + 1);
			strncpy(dst, _value.start() + 1, max_len);
		}

		/**
		 * Return true if attribute has the specified value
		 */
		bool has_value(const char *value) const {
			return strlen(value) == (_value.len() - 2) &&
			       !strcmp(value, _value.start() + 1, _value.len() - 2); }

		/**
		 * Return attribute value as typed value
		 *
		 * \param T  type of value to read
		 * \return   true on success, or
		 *           false if attribute is invalid or value
		 *           conversion failed
		 */
		template <typename T>
		bool value(T *out) const
		{
			/*
			 * The '_value' token starts with a quote, which we
			 * need to skip to access the string. For validating
			 * the length, we have to consider both the starting
			 * and the trailing quote character.
			 */
			return ascii_to(_value.start() + 1, *out) == _value.len() - 2;
		}

		/**
		 * Return attribute value as Genode::String
		 *
		 * The value is copied via a temporary buffer of 'N' bytes.
		 */
		template <size_t N>
		void value(String<N> *out) const
		{
			char buf[N];
			value(buf, sizeof(buf));
			*out = String<N>(Cstring(buf));
		}

		/**
		 * Return next attribute in attribute list
		 *
		 * \throw Nonexistent_attribute  there is no further attribute
		 * \throw Invalid_syntax         the following tokens do not form
		 *                               a valid attribute declaration
		 */
		Xml_attribute next() const { return Xml_attribute(_next()); }
};


/**
 * Representation of an XML node
 */
class Genode::Xml_node
{
	private:

		typedef Xml_attribute::Token Token;

		/**
		 * Forward declaration needed for befriending Tag with Xml_attribute
		 */
		class Tag;

	public:

		/*********************
		 ** Exception types **
		 *********************/

		typedef Genode::Exception                    Exception;
		typedef Xml_attribute::Nonexistent_attribute Nonexistent_attribute;
		typedef Xml_attribute::Invalid_syntax        Invalid_syntax;

		class Nonexistent_sub_node  : public Exception { };


		/**
		 * Type definition for maintaining backward compatibility
		 */
		typedef Xml_attribute Attribute;

	private:

		class Tag
		{
			public:

				enum Type { START, END, EMPTY, INVALID };

			private:

				Token _token;  /* first token of the tag ('<') */
				Token _name;   /* identifier token of the tag  */
				Type  _type;

			public:

				/**
				 * Constructor
				 *
				 * \param start  first token of the tag
				 *
				 * At construction time, the validity of the tag is checked and
				 * the tag type is determined. A valid tag consists of:
				 * # Leading '<' tag delimiter
				 * # '/' for marking an end tag
				 * # Tag name
				 * # Optional attribute sequence (if tag is no end tag)
				 * # '/' for marking an empty-element tag (if tag is no end tag)
				 * # Closing '>' tag delimiter
				 *
				 * If any of these checks fails, the tag keeps the type
				 * 'INVALID'.
				 */
				Tag(Token start) : _token(start), _type(INVALID)
				{
					Type supposed_type = START;

					if (_token[0] != '<')
						return;

					if (_token.next()[0] == '/')
						supposed_type = END;

					if (_token.next().type() != Token::IDENT && _token.next()[0] != '/')
						return;

					_name = _token.next()[0] == '/' ? _token.next().next() : _token.next();
					if (_name.type() != Token::IDENT)
						return;

					/* skip attributes to find tag delimiter */
					Token delimiter = _name.next();
					if (supposed_type != END)
						try {
							/*
							 * The loop has no exit condition. It is left
							 * via the exception thrown once the tokens no
							 * longer form an attribute.
							 */
							for (Xml_attribute a = _name.next(); ; a = a._next())
								delimiter = a._next();
						} catch (Nonexistent_attribute) { }

					delimiter = delimiter.eat_whitespace();

					/*
					 * Now we expect the '>' delimiter. For empty-element tags,
					 * the delimiter is prefixed with a '/'.
					 */
					if (delimiter[0] == '/') {

						/* if a '/' was already at the start, the tag is invalid */
						if (supposed_type == END)
							return;

						supposed_type = EMPTY;

						/* skip '/' */
						delimiter = delimiter.next();
					}

					if (delimiter[0] != '>') return;

					_type  = supposed_type;
				}

				/**
				 * Default constructor produces invalid Tag
				 */
				Tag() : _type(INVALID) { }

				/**
				 * Return type of tag
				 */
				Type type() const { return _type; }

				/**
				 * Return true if tag is the start of a valid XML node
				 */
				bool node() const { return _type == START || _type == EMPTY; }

				/**
				 * Return first token of tag
				 */
				Token token() const { return _token; }

				/**
				 * Return name of tag
				 */
				Token name() const { return _name; }

				/**
				 * Return token after the closing tag delimiter
				 */
				Token next_token() const
				{
					/*
					 * Search for next closing delimiter, skip potential
					 * attributes and '/' delimiter prefix of empty-element
					 * tags.
					 */
					Token t = _name;
					for (; t && t[0] != '>'; t = t.next());

					/* if 't' is invalid, 't.next()' is invalid too */
					return t.next();
				}

				/**
				 * Return first attribute of tag
				 *
				 * \throw Nonexistent_attribute  tag has no attribute
				 * \throw Invalid_syntax         malformed attribute
				 */
				Xml_attribute attribute() const { return Xml_attribute(_name.next()); }
		};

		class Comment
		{
			private:

				Token _next;   /* token following the comment */
				bool  _valid;  /* true if comment is well formed */

			public:

				/**
				 * Constructor
				 *
				 * \param t  first token of the comment tag
				 *
				 * The comment is valid only if it starts with "<!--" and
				 * a token following the closing "-->" could be obtained.
				 */
				Comment(Token t) : _valid(false)
				{
					/* check for comment start */
					if (!t.matches("<!--"))
						return;

					/* skip four single characters for "<!--" */
					t = t.next().next().next().next();

					/* find token after comment delimiter */
					_next  = t.next_after("-->");
					_valid = _next.valid();
				}

				/**
				 * Default constructor produces invalid Comment
				 */
				Comment() : _valid(false) { }

				/**
				 * Return true if comment is valid
				 */
				bool valid() const { return _valid; }

				/**
				 * Return token after the closing comment delimiter
				 */
				Token next_token() const { return _next; }
		};

		/**
		 * Helper class to decode XML character entities
		 */
		struct Decoded_character
		{
			char   character   = 0;  /* decoded character                   */
			size_t encoded_len = 1;  /* number of consumed source characters */

			struct Translation
			{
				char        character;
				char const *seq;
				size_t      seq_len;
			};

			/**
			 * Look up the entity at the start of 'src'
			 *
			 * \param src      input characters
			 * \param src_len  number of characters available at 'src'
			 *
			 * \return  matching translation, or a translation that passes
			 *          the first character of 'src' as is. For an empty
			 *          input, 'seq_len' of the result is 0.
			 */
			static Translation translate(char const *src, size_t src_len)
			{
				enum { NUM = 6 };

				/* the table is only read, never modified */
				static Translation const translations[NUM] = {
					{ '>',  "&gt;",   4 },
					{ '<',  "&lt;",   4 },
					{ '&',  "&amp;",  5 },
					{ '"',  "&quot;", 6 },
					{ '\'', "&apos;", 6 },
					{ 0,    "&#x00;", 6 }
				};

				if (src_len == 0)
					return { 0, nullptr, 0 };

				for (unsigned i = 0; i < NUM; i++) {

					Translation const &translation = translations[i];

					/* skip entities that are longer than the remaining input */
					if (src_len < translation.seq_len
					 || memcmp(src, translation.seq, translation.seq_len))
						continue;

					/* translation matches */
					return translation;
				}

				/* sequence is not known, pass single character as is */
				return { *src, nullptr, 1 };
			}

			/**
			 * Constructor
			 *
			 * \param src      input characters
			 * \param src_len  number of characters available at 'src'
			 */
			Decoded_character(char const *src, size_t src_len)
			{
				/*
				 * Check the length first so that 'src' is never
				 * dereferenced for an empty input. In this case, no
				 * character is consumed.
				 */
				if (src_len == 0) {
					encoded_len = 0;
					return;
				}

				/* plain character, 'encoded_len' keeps its default of 1 */
				if (*src != '&') {
					character = *src;
					return;
				}

				Translation const translation = translate(src, src_len);

				character   = translation.character;
				encoded_len = translation.seq_len;
			}
		};

		const char *_addr;          /* first character of XML data      */
		size_t      _max_len;       /* length of XML data in characters */
		int         _num_sub_nodes; /* number of immediate sub nodes    */
		Tag         _start_tag;
		Tag         _end_tag;

		/**
		 * Search for end tag of XML node and initialize '_num_sub_nodes'
		 *
		 * \return  end tag or invalid tag
		 *
		 * The method searches for an end tag that matches the same
		 * depth level and the same name as the start tag of the XML
		 * node. If the XML structure is invalid, the search result
		 * is an invalid Tag.
		 *
		 * During the search, the method also counts the number of
		 * immediate sub nodes.
		 */
		Tag _init_end_tag()
		{
			/*
			 * If the start tag is invalid or an empty-element tag,
			 * we use the same tag as end tag.
			 */
			if (_start_tag.type() != Tag::START)
				return _start_tag;

			int   depth = 1;
			Token curr_token = _start_tag.next_token();

			while (curr_token.type() != Token::END) {

				/* eat XML comment */
				Comment curr_comment(curr_token);
				if (curr_comment.valid()) {
					curr_token = curr_comment.next_token();
					continue;
				}

				/* skip all tokens that are no tags */
				Tag curr_tag(curr_token);
				if (curr_tag.type() == Tag::INVALID) {
					curr_token = curr_token.next();
					continue;
				}

				/* count sub nodes at depth 1 */
				if (depth == 1 && curr_tag.node())
					_num_sub_nodes++;

				/* keep track of the current depth */
				depth += (curr_tag.type() == Tag::START);
				depth -= (curr_tag.type() == Tag::END);

				/* within sub nodes, continue after current token */
				if (depth > 0) {

					/* continue search with token after current tag */
					curr_token = curr_tag.next_token();
					continue;
				}

				/* reaching the same depth as the start tag */
				const char *start_name = _start_tag.name().start();
				size_t      start_len  = _start_tag.name().len();
				const char *curr_name  =   curr_tag.name().start();
				size_t      curr_len   =   curr_tag.name().len();

				/* on mismatch of start tag and end tag, return invalid tag */
				if (start_len != curr_len
				 || strcmp(start_name, curr_name, curr_len))
					return Tag();

				/* end tag corresponds to start tag */
				return curr_tag;
			}

			/* end of data reached without finding the end tag */
			return Tag();
		}

		/**
		 * Find next non-whitespace and non-comment token
		 *
		 * Tokens that are valid but do not start a tag are skipped, too.
		 */
		static Token skip_non_tag_characters(Token t)
		{
			while (true) {

				t = t.eat_whitespace();

				/* eat comment */
				Comment comment(t);
				if (comment.valid()) {
					t = comment.next_token();
					continue;
				}

				/* skip token if it is valid but does not start a tag */
				Tag curr_tag(t);
				if (curr_tag.type() == Tag::INVALID && curr_tag.token()) {
					t = t.next();
					continue;
				}

				break;
			}
			return t;
		}

		/**
		 * Create sub node from XML node
		 *
		 * \param at  start of the sub node, must lie within the data
		 *            range of this node
		 *
		 * \throw Nonexistent_sub_node  'at' lies outside of the node data
		 * \throw Invalid_syntax
		 */
		Xml_node _sub_node(const char *at) const
		{
			if (at < addr() || (size_t)(at - addr()) >= _max_len)
				throw Nonexistent_sub_node();

			/* the sub node is limited to the remainder of the data */
			return Xml_node(at, _max_len - (at - addr()));
		}

	public:

		/**
		 * Constructor
		 *
		 * \param addr     first character of the XML data
		 * \param max_len  length of the XML data in characters
		 *
		 * The constructor validates if the start tag has a matching end tag of
		 * the same depth and counts the number of immediate sub nodes.
		 *
		 * \throw Invalid_syntax
		 */
		Xml_node(const char *addr, size_t max_len = ~0UL) :
			_addr(addr),
			_max_len(max_len),
			_num_sub_nodes(0),
			_start_tag(skip_non_tag_characters(Token(addr, max_len))),
			_end_tag(_init_end_tag())
		{
			/* check validity of XML node */
			if (_start_tag.type() == Tag::EMPTY) return;
			if (_start_tag.type() == Tag::START && _end_tag.type() == Tag::END) return;

			throw Invalid_syntax();
		}

		/**
		 * Request type name of XML node as null-terminated string
		 */
		void type_name(char *dst, size_t max_len) const {
			_start_tag.name().string(dst, max_len); }

		/**
		 * Return type name of XML node as 'Genode::String'
		 */
		typedef String<64> Type;
		Type type() const
		{
			Token name = _start_tag.name();
			return Type(Cstring(name.start(), name.len()));
		}

		/**
		 * Return true if tag is of specified type
		 */
		bool has_type(const char *type) const {
			return (!strcmp(type, _start_tag.name().start(),
			                      _start_tag.name().len())
			      && strlen(type) == _start_tag.name().len()); }

		/**
		 * Request content of XML node as null-terminated string
		 *
		 * \param dst      destination buffer
		 * \param max_len  size of the destination buffer
		 */
		void value(char *dst, size_t max_len) const {
			max_len = min(content_size() + 1, min(max_len, _max_len));
			strncpy(dst, content_addr(), max_len); }

		/**
		 * Read content as typed value from XML node
		 *
		 * \param T    type of value to read from XML node
		 * \param out  resulting value
		 * \return     true on success
		 */
		template <typename T>
		bool value(T *out) const {
			return ascii_to(content_addr(), *out) == content_size(); }

		/**
		 * Return begin of node including the start tag
		 */
		const char *addr() const { return _addr; }

		/**
		 * Return size of node including start and end tags
		 */
		size_t size() const { return _end_tag.next_token().start() - addr(); }

		/**
		 * Return begin of node content as an opaque string
		 *
		 * Note that the returned string is not null-terminated as it
		 * points directly into a sub range of the unmodified Xml_node
		 * address range.
		 *
		 * XXX This method is deprecated. Use 'content_base()' instead.
		 *
		 * \noapi
		 */
		char *content_addr() const { return _start_tag.next_token().start(); }

		/**
		 * Return pointer to start of content
		 */
		char const *content_base() const { return content_addr(); }

		/**
		 * Return size of node content
		 */
		size_t content_size() const
		{
			if (_start_tag.type() == Tag::EMPTY)
				return 0;

			return _end_tag.token().start() - content_addr();
		}

		/**
		 * Export decoded node content from XML node
		 *
		 * \param dst      destination buffer
		 * \param dst_len  size of destination buffer in bytes
		 * \return         number of bytes written to the destination buffer
		 *
		 * This function transforms XML character entities into their
		 * respective characters.
		 *
		 * At most 'dst_len - 1' bytes are written. The method itself does
		 * not append a null termination.
		 */
		size_t decoded_content(char *dst, size_t dst_len) const
		{
			size_t      result_len = 0;
			char const *src        = content_base();
			size_t      src_len    = content_size();

			/*
			 * The remaining buffer space is decremented with each written
			 * character so that the loop never writes beyond 'dst'.
			 */
			for (; dst_len > 1 && src_len; dst_len--, result_len++) {

				Decoded_character const decoded_character(src, src_len);

				*dst++ = decoded_character.character;

				src     += decoded_character.encoded_len;
				src_len -= decoded_character.encoded_len;
			}

			return result_len;
		}

		/**
		 * Read decoded node content as Genode::String
		 */
		template <typename STRING>
		STRING decoded_content() const
		{
			/* one additional byte for the null termination */
			char buf[STRING::capacity() + 1];
			size_t const len = decoded_content(buf, sizeof(buf));
			buf[min(len, STRING::capacity())] = 0;
			return STRING(Cstring(buf));
		}

		/**
		 * Return the number of the XML node's immediate sub nodes
		 */
		size_t num_sub_nodes() const { return _num_sub_nodes; }

		/**
		 * Return XML node following the current one
		 *
		 * \throw Nonexistent_sub_node  subsequent node does not exist
		 */
		Xml_node next() const
		{
			Token after_node = _end_tag.next_token();
			after_node = skip_non_tag_characters(after_node);
			try { return _sub_node(after_node.start()); }
			catch (Invalid_syntax) { throw Nonexistent_sub_node(); }
		}

		/**
		 * Return next XML node of specified type
		 *
		 * \param type  type of XML node, or
		 *              0 for matching any type
		 *
		 * \throw Nonexistent_sub_node  no such node follows
		 */
		Xml_node next(const char *type) const
		{
			Xml_node node = next();
			for (; type && !node.has_type(type); node = node.next());
			return node;
		}

		/**
		 * Return true if node is the last of a node sequence
		 *
		 * \param type  type of XML node to look for, or
		 *              0 for matching any type
		 */
		bool last(const char *type = 0) const
		{
			try { next(type); return false; }
			catch (Nonexistent_sub_node) { return true; }
		}

		/**
		 * Return true if node is the last of a node sequence
		 *
		 * \noapi
		 * \deprecated use 'last' instead
		 */
		bool is_last(const char *type = 0) const { return last(type); }

		/**
		 * Return sub node with specified index
		 *
		 * \param  idx                   index of sub node,
		 *                               default is the first node
		 * \throw  Nonexistent_sub_node  no such sub node exists
		 */
		Xml_node sub_node(unsigned idx = 0U) const
		{
			if (_num_sub_nodes > 0) {

				/* look up node at specified index */
				try {
					Xml_node curr_node = _sub_node(content_addr());
					for (; idx > 0; idx--)
						curr_node = curr_node.next();
					return curr_node;
				} catch (Invalid_syntax) { }
			}

			throw Nonexistent_sub_node();
		}

		/**
		 * Return first sub node that matches the specified type
		 *
		 * \throw Nonexistent_sub_node  no such sub_node exists
		 */
		Xml_node sub_node(const char *type) const
		{
			if (_num_sub_nodes > 0) {

				/* search for sub node of specified type */
				try {
					Xml_node curr_node = _sub_node(content_addr());
					for ( ; true; curr_node = curr_node.next())
						if (curr_node.has_type(type))
							return curr_node;
				} catch (...) { }
			}

			throw Nonexistent_sub_node();
		}

		/**
		 * Execute functor 'fn' for each sub node of specified type
		 *
		 * \param type  type of sub nodes to visit, or
		 *              0 for visiting all sub nodes
		 * \param fn    functor called with each matching sub node
		 */
		template <typename FN>
		void for_each_sub_node(char const *type, FN const &fn) const
		{
			if (_num_sub_nodes == 0)
				return;

			Xml_node node = sub_node();
			for (int i = 0; ; node = node.next()) {

				if (!type || node.has_type(type))
					fn(node);

				/* stop after the last sub node, avoid calling 'next' on it */
				if (++i == _num_sub_nodes)
					break;
			}
		}

		/**
		 * Execute functor 'fn' for each sub node
		 */
		template <typename FN>
		void for_each_sub_node(FN const &fn) const
		{
			for_each_sub_node(nullptr, fn);
		}

		/**
		 * Return Nth attribute of XML node
		 *
		 * \param idx                    attribute index,
		 *                               first attribute has index 0
		 * \throw Nonexistent_attribute  no such attribute exists
		 * \return                       XML attribute
		 */
		Xml_attribute attribute(unsigned idx) const
		{
			/* get first attribute of the node */
			Xml_attribute a = _start_tag.attribute();

			/* skip attributes until we reach the target index */
			for (; idx > 0; idx--)
				a = a._next();

			return a;
		}

		/**
		 * Return attribute of specified type
		 *
		 * \param type                   name of attribute type
		 * \throw Nonexistent_attribute  no such attribute exists
		 * \return                       XML attribute
		 */
		Xml_attribute attribute(const char *type) const
		{
			/*
			 * Iterate, beginning with the first attribute of the node.
			 * The loop is left either by returning the match or by the
			 * exception thrown when no further attribute exists.
			 */
			for (Xml_attribute a = _start_tag.attribute(); ; a = a.next())
				if (a.has_type(type))
					return a;
		}

		/**
		 * Shortcut for reading an attribute value from XML node
		 *
		 * \param type           attribute name
		 * \param default_value  value returned if no attribute with the
		 *                       name 'type' is present.
		 * \return               attribute value or specified default value
		 *
		 * Without this shortcut, attribute values can be obtained by
		 * 'node.attribute(...).value(...)' only. Because the attribute
		 * lookup may throw a 'Nonexistent_attribute' exception, code that
		 * reads optional attributes (those with default values) has to
		 * handle the exception accordingly. Such code tends to become
		 * clumsy, in particular when many attributes are processed in a
		 * subsequent fashion. This method template relieves the XML node
		 * user from implementing the exception handling manually.
		 */
		template <typename T>
		inline T attribute_value(char const *type, T default_value) const
		{
			T result = default_value;
			try { attribute(type).value(&result); } catch (...) { }
			return result;
		}

		/**
		 * Return true if attribute of specified type exists
		 */
		inline bool has_attribute(char const *type) const
		{
			try { attribute(type); return true; } catch (...) { }
			return false;
		}

		/**
		 * Return true if sub node of specified type exists
		 */
		inline bool has_sub_node(char const *type) const
		{
			try { sub_node(type); return true; } catch (...) { }
			return false;
		}

		/**
		 * Print the raw XML data of the node including start and end tags
		 */
		void print(Output &output) const {
			output.out_string(addr(), size()); }
};

#endif /* _INCLUDE__UTIL__XML_NODE_H_ */