/* * \brief XML parser * \author Norman Feske * \date 2007-08-21 */ /* * Copyright (C) 2007-2017 Genode Labs GmbH * * This file is part of the Genode OS framework, which is distributed * under the terms of the GNU Affero General Public License version 3. */ #ifndef _INCLUDE__UTIL__XML_NODE_H_ #define _INCLUDE__UTIL__XML_NODE_H_ #include #include namespace Genode { class Xml_attribute; class Xml_node; } /** * Representation of an XML-node attribute * * An attribute has the form 'name="value"'. */ class Genode::Xml_attribute { private: /** * Scanner policy that accepts hyphens in identifiers */ struct Scanner_policy_xml_identifier { static bool identifier_char(char c, unsigned i) { return is_letter(c) || c == '_' || c == ':' || (i && (c == '-' || c == '.' || is_digit(c))); } }; /** * Define tokenizer that matches XML tags (with hyphens) as identifiers */ typedef ::Genode::Token Token; Token _name; Token _value; friend class Xml_node; /* * Even though 'Tag' is part of 'Xml_node', the friendship * to 'Xml_node' does not apply for 'Tag' when compiling * the code with 'gcc-3.4'. Hence, we need to add an * explicit friendship to 'Tag'. */ friend class Tag; /** * Constructor * * This constructor is meant to be used as implicitly to * construct an 'Xml_attribute' from a token sequence via an * assignment from the leading 'Token'. */ Xml_attribute(Token t) : _name(t.eat_whitespace()), _value(_name.next().next()) { if (_name.type() != Token::IDENT) throw Nonexistent_attribute(); if (_name.next()[0] != '=' || _value.type() != Token::STRING) throw Invalid_syntax(); } /** * Return token following the attribute declaration */ Token _next() const { return _name.next().next().next(); } public: /********************* ** Exception types ** *********************/ class Invalid_syntax : public Exception { }; class Nonexistent_attribute : public Exception { }; /** * Return attribute type as null-terminated string */ void type(char *dst, size_t max_len) const { /* * Limit number of characters by token length, take * null-termination into account. */ max_len = min(max_len, _name.len() + 1); strncpy(dst, _name.start(), max_len); } typedef String<64> Name; Name name() const { return Name(Cstring(_name.start(), _name.len())); } /** * Return true if attribute has specified type */ bool has_type(const char *type) { return strlen(type) == _name.len() && strcmp(type, _name.start(), _name.len()) == 0; } /** * Return size of value */ size_t value_size() const { return _value.len() - 2; } char const *value_base() const { return _value.start() + 1; } /** * Return attribute value as null-terminated string */ void value(char *dst, size_t max_len) const { /* * The value of 'max_len' denotes the maximum number of * characters to be written to 'dst' including the null * termination. From the quoted value string, we strip * both quote characters and add a null-termination * character. */ max_len = min(max_len, _value.len() - 2 + 1); strncpy(dst, _value.start() + 1, max_len); } /** * Return true if attribute has the specified value */ bool has_value(const char *value) const { return strlen(value) == (_value.len() - 2) && !strcmp(value, _value.start() + 1, _value.len() - 2); } /** * Return attribute value as typed value * * \param T type of value to read * \return true on success, or * false if attribute is invalid or value * conversion failed */ template bool value(T *out) const { /* * The '_value' token starts with a quote, which we * need to skip to access the string. For validating * the length, we have to consider both the starting * and the trailing quote character. */ return ascii_to(_value.start() + 1, *out) == _value.len() - 2; } /** * Return attribute value as Genode::String */ template void value(String *out) const { char buf[N]; value(buf, sizeof(buf)); *out = String(Cstring(buf)); } /** * Return next attribute in attribute list */ Xml_attribute next() const { return Xml_attribute(_next()); } }; /** * Representation of an XML node */ class Genode::Xml_node { private: typedef Xml_attribute::Token Token; /** * Forward declaration needed for befriending Tag with Xml_attribute */ class Tag; public: /********************* ** Exception types ** *********************/ typedef Genode::Exception Exception; typedef Xml_attribute::Nonexistent_attribute Nonexistent_attribute; typedef Xml_attribute::Invalid_syntax Invalid_syntax; class Nonexistent_sub_node : public Exception { }; /** * Type definition for maintaining backward compatibility */ typedef Xml_attribute Attribute; private: class Tag { public: enum Type { START, END, EMPTY, INVALID }; private: Token _token { }; Token _name { }; Type _type { INVALID }; public: /** * Constructor * * \param start first token of the tag * * At construction time, the validity of the tag is checked and * the tag type is determined. A valid tag consists of: * # Leading '<' tag delimiter * # '/' for marking an end tag * # Tag name * # Optional attribute sequence (if tag is no end tag) * # '/' for marking an empty-element tag (if tag is no end tag) * # Closing '>' tag delimiter */ Tag(Token start) : _token(start) { Type supposed_type = START; if (_token[0] != '<') return; if (_token.next()[0] == '/') supposed_type = END; if (_token.next().type() != Token::IDENT && _token.next()[0] != '/') return; _name = _token.next()[0] == '/' ? _token.next().next() : _token.next(); if (_name.type() != Token::IDENT) return; /* skip attributes to find tag delimiter */ Token delimiter = _name.next(); if (supposed_type != END) try { for (Xml_attribute a = _name.next(); ; a = a._next()) delimiter = a._next(); } catch (Nonexistent_attribute) { } delimiter = delimiter.eat_whitespace(); /* * Now we expect the '>' delimiter. For empty-element tags, * the delimiter is prefixed with a '/'. */ if (delimiter[0] == '/') { /* if a '/' was already at the start, the tag is invalid */ if (supposed_type == END) return; supposed_type = EMPTY; /* skip '/' */ delimiter = delimiter.next(); } if (delimiter[0] != '>') return; _type = supposed_type; } /** * Default constructor produces invalid Tag */ Tag() { } /** * Return type of tag */ Type type() const { return _type; } /** * Return true if tag is the start of a valid XML node */ bool node() const { return _type == START || _type == EMPTY; } /** * Return first token of tag */ Token token() const { return _token; } /** * Return name of tag */ Token name() const { return _name; } /** * Return token after the closing tag delimiter */ Token next_token() const { /* * Search for next closing delimiter, skip potential * attributes and '/' delimiter prefix of empty-element * tags. */ Token t = _name; for (; t && t[0] != '>'; t = t.next()); /* if 't' is invalid, 't.next()' is invalid too */ return t.next(); } /** * Return first attribute of tag */ Xml_attribute attribute() const { return Xml_attribute(_name.next()); } }; class Comment { private: Token _next { }; /* token following the comment */ bool _valid { false }; /* true if comment is well formed */ public: /** * Constructor * * \param start first token of the comment tag */ Comment(Token t) { /* check for comment start */ if (!t.matches(""); _valid = _next.valid(); } /** * Default constructor produces invalid Comment */ Comment() { } /** * Return true if comment is valid */ bool valid() const { return _valid; } /** * Return token after the closing comment delimiter */ Token next_token() const { return _next; } }; /** * Helper class to decode XML character entities */ struct Decoded_character { char character = 0; size_t encoded_len = 1; struct Translation { char character; char const *seq; size_t seq_len; }; static Translation translate(char const *src, size_t src_len) { enum { NUM = 6 }; static Translation translations[NUM] = { { '>', ">", 4 }, { '<', "<", 4 }, { '&', "&", 5 }, { '"', """, 6 }, { '\'', "'", 6 }, { 0, "�", 6 } }; if (src_len == 0) return { 0, nullptr, 0 }; for (unsigned i = 0; i < NUM; i++) { Translation const &translation = translations[i]; if (src_len < translation.seq_len || memcmp(src, translation.seq, translation.seq_len)) continue; /* translation matches */ return translation; } /* sequence is not known, pass single character as is */ return { *src, nullptr, 1 }; } Decoded_character(char const *src, size_t src_len) { if (*src != '&' || src_len == 0) { character = *src; return; } Translation const translation = translate(src, src_len); character = translation.character; encoded_len = translation.seq_len; } }; const char *_addr; /* first character of XML data */ size_t _max_len; /* length of XML data in characters */ int _num_sub_nodes; /* number of immediate sub nodes */ Tag _start_tag; Tag _end_tag; /** * Search for end tag of XML node and initialize '_num_sub_nodes' * * \return end tag or invalid tag * * The method searches for a end tag that matches the same * depth level and the same name as the start tag of the XML * node. If the XML structure is invalid, the search results * is an invalid Tag. * * During the search, the method also counts the number of * immediate sub nodes. */ Tag _init_end_tag() { /* * If the start tag is invalid or an empty-element tag, * we use the same tag as end tag. */ if (_start_tag.type() != Tag::START) return _start_tag; int depth = 1; Token curr_token = _start_tag.next_token(); while (curr_token.type() != Token::END) { /* eat XML comment */ Comment curr_comment(curr_token); if (curr_comment.valid()) { curr_token = curr_comment.next_token(); continue; } /* skip all tokens that are no tags */ Tag curr_tag(curr_token); if (curr_tag.type() == Tag::INVALID) { curr_token = curr_token.next(); continue; } /* count sub nodes at depth 1 */ if (depth == 1 && curr_tag.node()) _num_sub_nodes++; /* keep track of the current depth */ depth += (curr_tag.type() == Tag::START); depth -= (curr_tag.type() == Tag::END); /* within sub nodes, continue after current token */ if (depth > 0) { /* continue search with token after current tag */ curr_token = curr_tag.next_token(); continue; } /* reaching the same depth as the start tag */ const char *start_name = _start_tag.name().start(); size_t start_len = _start_tag.name().len(); const char *curr_name = curr_tag.name().start(); size_t curr_len = curr_tag.name().len(); /* on mismatch of start tag and end tag, return invalid tag */ if (start_len != curr_len || strcmp(start_name, curr_name, curr_len)) return Tag(); /* end tag corresponds to start tag */ return curr_tag; } return Tag(); } /** * Find next non-whitespace and non-comment token */ static Token skip_non_tag_characters(Token t) { while (true) { t = t.eat_whitespace(); /* eat comment */ Comment comment(t); if (comment.valid()) { t = comment.next_token(); continue; } /* skip token if it is valid but does not start a tag */ Tag curr_tag(t); if (curr_tag.type() == Tag::INVALID && curr_tag.token()) { t = t.next(); continue; } break; } return t; } /** * Create sub node from XML node * * \throw Nonexistent_sub_node * \throw Invalid_syntax */ Xml_node _sub_node(const char *at) const { if (at < addr() || (size_t)(at - addr()) >= _max_len) throw Nonexistent_sub_node(); return Xml_node(at, _max_len - (at - addr())); } public: /** * Constructor * * The constructor validates if the start tag has a matching end tag of * the same depth and counts the number of immediate sub nodes. * * \throw Invalid_syntax */ Xml_node(const char *addr, size_t max_len = ~0UL) : _addr(addr), _max_len(max_len), _num_sub_nodes(0), _start_tag(skip_non_tag_characters(Token(addr, max_len))), _end_tag(_init_end_tag()) { /* check validity of XML node */ if (_start_tag.type() == Tag::EMPTY) return; if (_start_tag.type() == Tag::START && _end_tag.type() == Tag::END) return; throw Invalid_syntax(); } /** * Request type name of XML node as null-terminated string */ void type_name(char *dst, size_t max_len) const { _start_tag.name().string(dst, max_len); } typedef String<64> Type; Type type() const { Token name = _start_tag.name(); return Type(Cstring(name.start(), name.len())); } /** * Return true if tag is of specified type */ bool has_type(const char *type) const { return (!strcmp(type, _start_tag.name().start(), _start_tag.name().len()) && strlen(type) == _start_tag.name().len()); } /** * Request content of XML node as null-terminated string */ void value(char *dst, size_t max_len) const { max_len = min(content_size() + 1, min(max_len, _max_len)); strncpy(dst, content_addr(), max_len); } /** * Read content as typed value from XML node * * \param T type of value to read from XML node * \param out resulting value * \return true on success */ template bool value(T *out) const { return ascii_to(content_addr(), *out) == content_size(); } /** * Return begin of node including the start tag */ const char *addr() const { return _addr; } /** * Return size of node including start and end tags */ size_t size() const { return _end_tag.next_token().start() - addr(); } /** * Return begin of node content as an opaque string * * Note that the returned string is not null-terminated as it * points directly into a sub range of the unmodified Xml_node * address range. * * XXX This method is deprecated. Use 'content_base()' instead. * * \noapi */ char *content_addr() const { return _start_tag.next_token().start(); } /** * Return pointer to start of content */ char const *content_base() const { return content_addr(); } /** * Return size of node content */ size_t content_size() const { if (_start_tag.type() == Tag::EMPTY) return 0; return _end_tag.token().start() - content_addr(); } /** * Export decoded node content from XML node * * \param dst destination buffer * \param dst_len size of destination buffer in bytes * \return number of bytes written to the destination buffer * * This function transforms XML character entities into their * respective characters. */ size_t decoded_content(char *dst, size_t dst_len) const { size_t result_len = 0; char const *src = content_base(); size_t src_len = content_size(); for (; dst_len && src_len; dst_len--, result_len++) { Decoded_character const decoded_character(src, src_len); *dst++ = decoded_character.character; src += decoded_character.encoded_len; src_len -= decoded_character.encoded_len; } return result_len; } /** * Read decoded node content as Genode::String */ template STRING decoded_content() const { char buf[STRING::capacity() + 1]; size_t const len = decoded_content(buf, sizeof(buf) - 1); buf[min(len, sizeof(buf) - 1)] = 0; return STRING(Cstring(buf)); } /** * Return the number of the XML node's immediate sub nodes */ size_t num_sub_nodes() const { return _num_sub_nodes; } /** * Return XML node following the current one * * \throw Nonexistent_sub_node sub sequent node does not exist */ Xml_node next() const { Token after_node = _end_tag.next_token(); after_node = skip_non_tag_characters(after_node); try { return _sub_node(after_node.start()); } catch (Invalid_syntax) { throw Nonexistent_sub_node(); } } /** * Return next XML node of specified type * * \param type type of XML node, or * 0 for matching any type */ Xml_node next(const char *type) const { Xml_node node = next(); for (; type && !node.has_type(type); node = node.next()); return node; } /** * Return true if node is the last of a node sequence */ bool last(const char *type = 0) const { try { next(type); return false; } catch (Nonexistent_sub_node) { return true; } } /** * Return sub node with specified index * * \param idx index of sub node, * default is the first node * \throw Nonexistent_sub_node no such sub node exists */ Xml_node sub_node(unsigned idx = 0U) const { if (_num_sub_nodes > 0) { /* look up node at specified index */ try { Xml_node curr_node = _sub_node(content_addr()); for (; idx > 0; idx--) curr_node = curr_node.next(); return curr_node; } catch (Invalid_syntax) { } } throw Nonexistent_sub_node(); } /** * Return first sub node that matches the specified type * * \throw Nonexistent_sub_node no such sub_node exists */ Xml_node sub_node(const char *type) const { if (_num_sub_nodes > 0) { /* search for sub node of specified type */ try { Xml_node curr_node = _sub_node(content_addr()); for ( ; true; curr_node = curr_node.next()) if (curr_node.has_type(type)) return curr_node; } catch (...) { } } throw Nonexistent_sub_node(); } /** * Execute functor 'fn' for each sub node of specified type */ template void for_each_sub_node(char const *type, FN const &fn) const { if (_num_sub_nodes == 0) return; Xml_node node = sub_node(); for (int i = 0; ; node = node.next()) { if (!type || node.has_type(type)) fn(node); if (++i == _num_sub_nodes) break; } } /** * Execute functor 'fn' for each sub node */ template void for_each_sub_node(FN const &fn) const { for_each_sub_node(nullptr, fn); } /** * Return Nth attribute of XML node * * \param idx attribute index, * first attribute has index 0 * \throw Nonexistent_attribute no such attribute exists * \return XML attribute */ Xml_attribute attribute(unsigned idx) const { /* get first attribute of the node */ Xml_attribute a = _start_tag.attribute(); /* skip attributes until we reach the target index */ for (; idx > 0; idx--) a = a._next(); return a; } /** * Return attribute of specified type * * \param type name of attribute type * \throw Nonexistent_attribute no such attribute exists * \return XML attribute */ Xml_attribute attribute(const char *type) const { /* iterate, beginning with the first attribute of the node */ for (Xml_attribute a = _start_tag.attribute(); ; a = a.next()) if (a.has_type(type)) return a; } /** * Shortcut for reading an attribute value from XML node * * \param type attribute name * \param default_value value returned if no attribute with the * name 'type' is present. * \return attribute value or specified default value * * Without this shortcut, attribute values can be obtained by * 'node.attribute(...).value(...)' only. Because the attribute * lookup may throw a 'Nonexistent_attribute' exception, code that * reads optional attributes (those with default values) has to * handle the exception accordingly. Such code tends to become * clumsy, in particular when many attributes are processed in a * subsequent fashion. This method template relieves the XML node * user from implementing the exception handling manually. */ template inline T attribute_value(char const *type, T default_value) const { T result = default_value; try { attribute(type).value(&result); } catch (...) { } return result; } /** * Return true if attribute of specified type exists */ inline bool has_attribute(char const *type) const { try { attribute(type); return true; } catch (...) { } return false; } /** * Return true if sub node of specified type exists */ inline bool has_sub_node(char const *type) const { try { sub_node(type); return true; } catch (...) { } return false; } void print(Output &output) const { output.out_string(addr(), size()); } }; #endif /* _INCLUDE__UTIL__XML_NODE_H_ */