genode/repos/os/include/util/xml_node.h

/*
 * \brief  XML parser
 * \author Norman Feske
 * \date   2007-08-21
 */

/*
 * Copyright (C) 2007-2013 Genode Labs GmbH
 *
 * This file is part of the Genode OS framework, which is distributed
 * under the terms of the GNU General Public License version 2.
 */

#ifndef _INCLUDE__UTIL__XML_NODE_H_
#define _INCLUDE__UTIL__XML_NODE_H_

#include <util/token.h>
#include <base/exception.h>

namespace Genode {

	/**
	 * Representation of an XML node
	 */
	class Xml_node
	{
		/**
		 * Scanner policy defining the characters of an XML identifier
		 *
		 * Letters, '_', and ':' are accepted at any position. Hyphens, dots,
		 * and digits are accepted at any position but the first one.
		 */
		struct Scanner_policy_xml_identifier
		{
			static bool identifier_char(char c, unsigned i)
			{
				/* characters permitted at any position of the identifier */
				if (is_letter(c) || c == '_' || c == ':')
					return true;

				/* the first character must be one of the above */
				if (!i)
					return false;

				/* characters permitted at subsequent positions only */
				return c == '-' || c == '.' || is_digit(c);
			}
		};

		/**
		 * Define tokenizer that matches XML tags (with hyphens) as identifiers
		 *
		 * The set of identifier characters is defined by
		 * 'Scanner_policy_xml_identifier'.
		 */
		typedef ::Genode::Token<Scanner_policy_xml_identifier> Token;

		/**
		 * Forward declaration needed for befriending 'Tag' with 'Attribute'
		 */
		class Tag;

		public:

			/*********************
			 ** Exception types **
			 *********************/

			/*
			 * 'Exception' is the common base class of the exception types
			 * declared here.
			 *
			 * 'Invalid_syntax' is thrown by the 'Xml_node' constructor if the
			 * start tag is invalid or has no matching end tag, and by the
			 * 'Attribute' constructor if an attribute name is not followed by
			 * '=' and a string.
			 *
			 * 'Nonexistent_sub_node' is thrown by the functions that look up
			 * sub nodes or subsequent nodes.
			 *
			 * 'Nonexistent_attribute' is thrown by the 'Attribute' constructor
			 * if no attribute name is found.
			 */
			class Exception             : public ::Genode::Exception { };
			class Invalid_syntax        : public Exception { };
			class Nonexistent_sub_node  : public Exception { };
			class Nonexistent_attribute : public Exception { };


			/**
			 * Representation of an XML-node attribute
			 *
			 * An attribute has the form 'name="value"'. It is represented by
			 * the token of its name and the token of its quoted value.
			 */
			class Attribute
			{
				private:

					Token _name;   /* identifier left of the '=' character */
					Token _value;  /* quoted string, including both quotes */

					friend class Xml_node;

					/*
					 * Even though 'Tag' is part of 'Xml_node', the friendship
					 * to 'Xml_node' does not apply for 'Tag' when compiling
					 * the code with 'gcc-3.4'. Hence, we need to add an
					 * explicit friendship to 'Tag'.
					 */
					friend class Tag;

					/**
					 * Constructor
					 *
					 * \param t  first token of the attribute, leading
					 *           whitespace is skipped
					 *
					 * \throw Nonexistent_attribute  no identifier found at 't'
					 * \throw Invalid_syntax         identifier is not followed
					 *                               by '=' and a string
					 *
					 * This constructor is meant to be used implicitly to
					 * construct an 'Attribute' from a token sequence via an
					 * assignment from the leading 'Token'.
					 *
					 * The initialization of '_value' refers to '_name', which
					 * is declared first and is therefore already initialized.
					 */
					Attribute(Token t) :
						_name(t.eat_whitespace()), _value(_name.next().next())
					{
						if (_name.type() != Token::IDENT)
							throw Nonexistent_attribute();

						if (_name.next()[0] != '=' || _value.type() != Token::STRING)
							throw Invalid_syntax();
					}

					/**
					 * Return token following the attribute declaration
					 *
					 * The three skipped tokens are the name, the '=' character,
					 * and the value.
					 */
					Token _next() const { return _name.next().next().next(); }

				public:

					/**
					 * Return attribute type as null-terminated string
					 *
					 * \param dst      destination buffer
					 * \param max_len  size of the destination buffer
					 */
					void type(char *dst, size_t max_len) const
					{
						/*
						 * Limit number of characters by token length, take
						 * null-termination into account.
						 */
						max_len = min(max_len, _name.len() + 1);
						strncpy(dst, _name.start(), max_len);
					}

					/**
					 * Return true if attribute has specified type
					 *
					 * The function does not modify the attribute and is
					 * therefore callable for const attributes, just like
					 * 'has_value'.
					 */
					bool has_type(const char *type) const {
						return strlen(type) == _name.len() &&
						       strcmp(type, _name.start(), _name.len()) == 0; }

					/**
					 * Return size of value, not counting the two quotes
					 */
					size_t      value_size() const { return _value.len() - 2; }

					/**
					 * Return pointer to the first character behind the
					 * opening quote of the value
					 */
					char const *value_base() const { return _value.start() + 1; }

					/**
					 * Return attribute value as null-terminated string
					 *
					 * \param dst      destination buffer
					 * \param max_len  size of the destination buffer
					 * \return         true, the current implementation has
					 *                 no failure path
					 */
					bool value(char *dst, size_t max_len) const
					{
						/*
						 * The value of 'max_len' denotes the maximum number of
						 * characters to be written to 'dst' including the null
						 * termination. From the quoted value string, we strip
						 * both quote characters and add a null-termination
						 * character.
						 */
						max_len = min(max_len, _value.len() - 2 + 1);
						strncpy(dst, _value.start() + 1, max_len);
						return true;
					}

					/**
					 * Return true if attribute has the specified value
					 *
					 * The comparison disregards the quotes of the value.
					 */
					bool has_value(const char *value) const {
						return strlen(value) == (_value.len() - 2) &&
						       !strcmp(value, _value.start() + 1, _value.len() - 2); }

					/**
					 * Return attribute value as typed value
					 *
					 * \param T  type of value to read
					 * \return   true on success, or
					 *           false if attribute is invalid or value
					 *           conversion failed
					 */
					template <typename T>
					bool value(T *out) const
					{
						/*
						 * The '_value' token starts with a quote, which we
						 * need to skip to access the string. For validating
						 * the length, we have to consider both the starting
						 * and the trailing quote character.
						 */
						return ascii_to(_value.start() + 1, out) == _value.len() - 2;
					}

					/**
					 * Return next attribute in attribute list
					 *
					 * \throw Nonexistent_attribute  no further attribute
					 *                               follows
					 * \throw Invalid_syntax         the following attribute
					 *                               is malformed
					 */
					Attribute next() const { return Attribute(_next()); }
			};

		private:

			class Tag
			{
				public:

					enum Type { START, END, EMPTY, INVALID };

				private:

					Token _token;  /* first token of the tag     */
					Token _name;   /* identifier naming the tag  */
					Type  _type;   /* outcome of the syntax check */

				public:

					/**
					 * Constructor
					 *
					 * \param start  first token of the tag
					 *
					 * The constructor checks the syntax of the tag and
					 * determines its type. The tag stays 'INVALID' unless it
					 * consists of the following parts:
					 * # Leading '<' tag delimiter
					 * # '/' for marking an end tag
					 * # Tag name
					 * # Optional attribute sequence (if tag is no end tag)
					 * # '/' for marking an empty-element tag (if tag is no end tag)
					 * # Closing '>' tag delimiter
					 *
					 * An 'Invalid_syntax' exception raised while skipping the
					 * attributes is not caught by the constructor.
					 */
					Tag(Token start) : _token(start), _type(INVALID)
					{
						/* each tag begins with the '<' delimiter */
						if (_token[0] != '<')
							return;

						/* a '/' right behind the '<' denotes an end tag */
						Token second = _token.next();
						bool const is_end_tag = (second[0] == '/');

						/* otherwise, the tag name must follow immediately */
						if (!is_end_tag && second.type() != Token::IDENT)
							return;

						_name = is_end_tag ? second.next() : second;
						if (_name.type() != Token::IDENT)
							return;

						/*
						 * Walk over the attributes until the attribute
						 * constructor signals that no further one follows.
						 * End tags carry no attributes.
						 */
						Token delimiter = _name.next();
						if (!is_end_tag) {
							try {
								Attribute attr = delimiter;
								for (;;) {
									delimiter = attr._next();
									attr      = delimiter;
								}
							} catch (Nonexistent_attribute) { }
						}

						delimiter = delimiter.eat_whitespace();

						/* a '/' in front of the '>' denotes an empty element */
						bool is_empty_element = false;
						if (delimiter[0] == '/') {

							/* a tag cannot have a '/' at both ends */
							if (is_end_tag)
								return;

							is_empty_element = true;
							delimiter        = delimiter.next();
						}

						/* the tag must be closed by the '>' delimiter */
						if (delimiter[0] != '>')
							return;

						if (is_end_tag)
							_type = END;
						else if (is_empty_element)
							_type = EMPTY;
						else
							_type = START;
					}

					/**
					 * Default constructor produces invalid Tag
					 */
					Tag() : _type(INVALID) { }

					/**
					 * Return type of tag
					 */
					Type type() const { return _type; }

					/**
					 * Return true if tag is the start of a valid XML node
					 */
					bool is_node() const
					{
						switch (_type) {
						case START:
						case EMPTY:
							return true;
						default:
							return false;
						}
					}

					/**
					 * Return first token of tag
					 */
					Token token() const { return _token; }

					/**
					 * Return name of tag
					 */
					Token name() const { return _name; }

					/**
					 * Return token after the closing tag delimiter
					 */
					Token next_token() const
					{
						/*
						 * Starting at the tag name, advance to the closing
						 * '>' delimiter. This skips the attributes and the
						 * '/' of an empty-element tag.
						 */
						Token t = _name;
						while (t && t[0] != '>')
							t = t.next();

						/* if 't' is invalid, 't.next()' is invalid too */
						return t.next();
					}

					/**
					 * Return first attribute of tag
					 */
					Attribute attribute() const
					{
						Token first = _name.next();
						return Attribute(first);
					}
			};

			class Comment
			{
				private:

					Token _next;   /* token following the comment */
					bool  _valid;  /* true if comment is well formed */

					/**
					 * Compare the first characters of consecutive tokens with
					 * a character sequence
					 *
					 * \param t  start of token sequence
					 * \param s  null-terminated character sequence
					 * \return   true if each character of 's' equals the
					 *           first character of the corresponding token
					 */
					static bool _match(Token t, const char *s)
					{
						while (*s) {
							if (t[0] != *s)
								return false;

							t = t.next();
							s++;
						}
						return true;
					}

				public:

					/**
					 * Constructor
					 *
					 * \param t  first token of the comment tag
					 *
					 * The comment is valid if the token sequence starts with
					 * the comment-start tag and contains a comment-end tag.
					 */
					Comment(Token t) : _valid(false)
					{
						/* the token sequence must begin with the start tag */
						if (!_match(t, "<!--"))
							return;

						/* advance to the comment-end tag */
						while (t && !_match(t, "-->"))
							t = t.next();

						/* remember the token behind the three end-tag tokens */
						if (t.type() != Token::END) {
							_next  = t.next().next().next();
							_valid = true;
						}
					}

					/**
					 * Default constructor produces invalid Comment
					 */
					Comment() : _valid(false) { }

					/**
					 * Return true if comment is valid
					 */
					bool valid() const { return _valid; }

					/**
					 * Return token after the closing comment delimiter
					 */
					Token next_token() const { return _next; }
			};

			/*
			 * The declaration order matters because the constructor
			 * initializes '_end_tag' by calling '_init_end_tag()', which
			 * reads '_start_tag' and increments '_num_sub_nodes'. Both must
			 * therefore be declared before '_end_tag'.
			 */
			const char *_addr;          /* first character of XML data      */
			size_t      _max_len;       /* length of XML data in characters */
			int         _num_sub_nodes; /* number of immediate sub nodes    */
			Tag         _start_tag;
			Tag         _end_tag;

			/**
			 * Search for end tag of XML node and initialize '_num_sub_nodes'
			 *
			 * \return  end tag or invalid tag
			 *
			 * The function searches for an end tag at the same depth level
			 * and with the same name as the start tag of the XML node. If
			 * no such end tag is found, the result is an invalid Tag.
			 *
			 * While searching, the function counts the immediate sub nodes
			 * of the XML node.
			 */
			Tag _init_end_tag()
			{
				/*
				 * An invalid start tag or an empty-element tag doubles as
				 * end tag.
				 */
				if (_start_tag.type() != Tag::START)
					return _start_tag;

				int nesting = 1;

				for (Token pos = _start_tag.next_token(); pos.type() != Token::END; ) {

					/* step over XML comment */
					Comment const comment(pos);
					if (comment.valid()) {
						pos = comment.next_token();
						continue;
					}

					/* step over each token that does not start a tag */
					Tag const tag(pos);
					if (tag.type() == Tag::INVALID) {
						pos = pos.next();
						continue;
					}

					/* nodes directly below the start tag are sub nodes */
					if (nesting == 1 && tag.is_node())
						_num_sub_nodes++;

					/* start tags open a level, end tags close one */
					if (tag.type() == Tag::START)
						nesting++;
					else if (tag.type() == Tag::END)
						nesting--;

					/* still within the node, proceed behind the tag */
					if (nesting > 0) {
						pos = tag.next_token();
						continue;
					}

					/*
					 * The tag closes the level of the start tag. It is the
					 * end tag only if both names are equal.
					 */
					Token const expected = _start_tag.name();
					Token const found    = tag.name();

					if (expected.len() != found.len())
						return Tag();

					if (strcmp(expected.start(), found.start(), found.len()))
						return Tag();

					return tag;
				}

				/* end of data reached without closing the node */
				return Tag();
			}

			/**
			 * Skip whitespace and comments
			 *
			 * \return  first token at or behind 't' that is neither
			 *          whitespace nor the start of a valid comment
			 */
			static Token eat_whitespaces_and_comments(Token t)
			{
				for (;;) {

					t = t.eat_whitespace();

					/* done as soon as no comment follows the whitespace */
					Comment const comment(t);
					if (!comment.valid())
						return t;

					/* proceed behind the comment */
					t = comment.next_token();
				}
			}

			/**
			 * Create XML node that starts at an address within this node's
			 * XML data
			 *
			 * \param at  start address of the new node
			 *
			 * \throw Nonexistent_sub_node  'at' lies outside the XML data
			 * \throw Invalid_syntax        no valid node found at 'at'
			 */
			Xml_node _sub_node(const char *at) const
			{
				const char * const base = addr();

				/* reject addresses in front of the XML data */
				if (at < base)
					throw Nonexistent_sub_node();

				/* reject addresses behind the XML data */
				if ((size_t)(at - base) >= _max_len)
					throw Nonexistent_sub_node();

				/* the new node may span the remainder of the XML data */
				return Xml_node(at, _max_len - (at - base));
			}

		public:

			/**
			 * Constructor
			 *
			 * \param addr     first character of the XML data
			 * \param max_len  length of the XML data
			 *
			 * \throw Invalid_syntax  the data does not start with a valid
			 *                        node
			 *
			 * The constructor skips leading whitespace and comments, looks
			 * for the end tag that matches the start tag, and counts the
			 * immediate sub nodes.
			 */
			Xml_node(const char *addr, size_t max_len = ~0UL)
			:
				_addr(addr), _max_len(max_len), _num_sub_nodes(0),
				_start_tag(eat_whitespaces_and_comments(Token(addr, max_len))),
				_end_tag(_init_end_tag())
			{
				/* an empty-element tag forms a complete node on its own */
				bool const empty_element = (_start_tag.type() == Tag::EMPTY);

				/* otherwise, a start tag needs its matching end tag */
				bool const enclosed = (_start_tag.type() == Tag::START)
				                   && (_end_tag.type()   == Tag::END);

				if (!empty_element && !enclosed)
					throw Invalid_syntax();
			}

			/**
			 * Request type name of XML node as null-terminated string
			 */
			void type_name(char *dst, size_t max_len) const {
				_start_tag.name().string(dst, max_len); }

			/**
			 * Return true if tag is of specified type
			 */
			bool has_type(const char *type) const {
				return (!strcmp(type, _start_tag.name().start(),
				                      _start_tag.name().len())
				      && strlen(type) == _start_tag.name().len()); }

			/**
			 * Request content of XML node as null-terminated string
			 */
			void value(char *dst, size_t max_len) const {
				max_len = min(content_size() + 1, min(max_len, _max_len));
				strncpy(dst, content_addr(), max_len); }

			/**
			 * Convert content of XML node to a typed value
			 *
			 * \param T    type of value to read from XML node
			 * \param out  resulting value
			 * \return     true if the result of 'ascii_to' equals the
			 *             content size
			 */
			template <typename T>
			bool value(T *out) const
			{
				if (ascii_to(content_addr(), out) != content_size())
					return false;

				return true;
			}

			/**
			 * Return the address passed to the constructor as start of the
			 * XML data
			 */
			const char *addr() const
			{
				return _addr;
			}

			/**
			 * Return distance between 'addr()' and the token that follows
			 * the end tag
			 */
			size_t size() const
			{
				Token behind_node = _end_tag.next_token();
				return behind_node.start() - addr();
			}

			/**
			 * Return begin of node content as an opaque string
			 *
			 * Note that the returned string is not null-terminated as it
			 * points directly into a sub range of the unmodified Xml_node
			 * address range.
			 *
			 * XXX This function is deprecated. Use 'content_base()' instead.
			 */
			char *content_addr() const { return _start_tag.next_token().start(); }

			/**
			 * Return start of the node content as pointer to constant
			 * characters
			 */
			char const *content_base() const
			{
				return content_addr();
			}

			/**
			 * Return number of characters between start tag and end tag
			 *
			 * The content size of an empty-element node is zero.
			 */
			size_t content_size() const
			{
				if (_start_tag.type() != Tag::EMPTY)
					return _end_tag.token().start() - content_addr();

				return 0;
			}

			/**
			 * Return number of immediate sub nodes as counted while
			 * searching for the end tag
			 */
			size_t num_sub_nodes() const
			{
				return _num_sub_nodes;
			}

			/**
			 * Return XML node following the current one
			 *
			 * \throw Nonexistent_sub_node  subsequent node does not exist
			 *
			 * Whitespace and comments behind the current node are skipped.
			 */
			Xml_node next() const
			{
				Token const after_node =
					eat_whitespaces_and_comments(_end_tag.next_token());

				/* a syntax error means that no further node follows */
				try {
					return _sub_node(after_node.start());
				}
				catch (Invalid_syntax) {
					throw Nonexistent_sub_node();
				}
			}

			/**
			 * Return next XML node of specified type
			 *
			 * \param type  type of XML node, or
			 *              0 for matching any type
			 *
			 * \throw Nonexistent_sub_node  no such node follows
			 */
			Xml_node next(const char *type) const
			{
				Xml_node node = next();

				/* without a type, the direct successor is the result */
				if (!type)
					return node;

				/* skip successors of a different type */
				while (!node.has_type(type))
					node = node.next();

				return node;
			}

			/**
			 * Return true if no further node of the specified type follows
			 *
			 * \param type  type of XML node, or
			 *              0 for matching any type
			 */
			bool is_last(const char *type = 0) const
			{
				try {
					next(type);
				}
				catch (Nonexistent_sub_node) {
					return true;
				}
				return false;
			}

			/**
			 * Return sub node with specified index
			 *
			 * \param  idx                   index of sub node,
			 *                               default is the first node
			 * \throw  Nonexistent_sub_node  no such sub node exists
			 */
			Xml_node sub_node(unsigned idx = 0U) const
			{
				/*
				 * A node without immediate sub nodes must not be searched.
				 * For an empty-element node, 'content_addr()' points behind
				 * the node. Parsing from there would wrongly present the
				 * following sibling as a sub node.
				 */
				if (_num_sub_nodes > 0) {

					/* look up node at specified index */
					try {
						Xml_node curr_node = _sub_node(content_addr());
						for (; idx > 0; idx--)
							curr_node = curr_node.next();
						return curr_node;
					} catch (Invalid_syntax) { }
				}

				throw Nonexistent_sub_node();
			}

			/**
			 * Return first sub node that matches the specified type
			 *
			 * \param type  type of the sub node to look for
			 *
			 * \throw Nonexistent_sub_node  no such sub_node exists
			 */
			Xml_node sub_node(const char *type) const
			{
				/*
				 * A node without immediate sub nodes must not be searched.
				 * For an empty-element node, 'content_addr()' points behind
				 * the node. Parsing from there would wrongly present the
				 * following siblings as sub nodes.
				 */
				if (_num_sub_nodes > 0) {

					/* search for sub node of specified type */
					try {
						Xml_node curr_node = _sub_node(content_addr());
						for ( ; true; curr_node = curr_node.next())
							if (curr_node.has_type(type))
								return curr_node;

					/* any failure of the search means "not found" */
					} catch (...) { }
				}

				throw Nonexistent_sub_node();
			}

			/**
			 * Return attribute at the specified position of the start tag
			 *
			 * \param idx                    attribute index,
			 *                               first attribute has index 0
			 * \throw Nonexistent_attribute  no such attribute exists
			 * \return                       XML attribute
			 */
			Attribute attribute(unsigned idx) const
			{
				Attribute a = _start_tag.attribute();

				/* advance 'idx' times, starting at the first attribute */
				for (unsigned i = 0; i < idx; i++)
					a = a.next();

				return a;
			}

			/**
			 * Return first attribute of the start tag that has the
			 * specified type
			 *
			 * \param type                   name of attribute type
			 * \throw Nonexistent_attribute  no such attribute exists
			 * \return                       XML attribute
			 *
			 * The search is terminated by the exception thrown when stepping
			 * beyond the last attribute.
			 */
			Attribute attribute(const char *type) const
			{
				Attribute a = _start_tag.attribute();

				while (!a.has_type(type))
					a = a.next();

				return a;
			}

			/**
			 * Shortcut for reading an attribute value from XML node
			 *
			 * \param type           attribute name
			 * \param default_value  value returned if no attribute with the
			 *                       name 'type' is present.
			 * \return               attribute value or specified default value
			 *
			 * Without this shortcut, attribute values can be obtained by
			 * 'node.attribute(...).value(...)' only. Because the attribute
			 * lookup may throw a 'Nonexistent_attribute' exception, code that
			 * reads optional attributes (those with default values) has to
			 * handle the exception accordingly. Such code tends to become
			 * clumsy, in particular when many attributes are processed in a
			 * subsequent fashion. This function template relieves the XML node
			 * user from implementing the exception handling manually.
			 */
			template <typename T>
			inline T attribute_value(char const *type, T default_value) const
			{
				T result = default_value;
				try { attribute(type).value(&result); } catch (...) { }
				return result;
			}

			/**
			 * Return true if the lookup of an attribute of the specified
			 * type succeeds
			 */
			inline bool has_attribute(char const *type) const
			{
				try {
					attribute(type);
				}
				catch (...) {
					return false;
				}
				return true;
			}

			/**
			 * Return true if the lookup of a sub node of the specified type
			 * succeeds
			 */
			inline bool has_sub_node(char const *type) const
			{
				try {
					sub_node(type);
				}
				catch (...) {
					return false;
				}
				return true;
			}
	};
}

#endif /* _INCLUDE__UTIL__XML_NODE_H_ */