#!/usr/bin/tclsh
#
# \brief  Regular-expression-based C++ parser
# \author Norman Feske
# \date   2007-08-15
#

# check command line arguments
set config_out_xml    [regsub -- "-format +xml"    $argv "" argv]
set config_out_tokens [regsub -- "-format +tokens" $argv "" argv]
set config_out_source [regsub -- "-format +source" $argv "" argv]
set config_whitespace [regsub -- "-whitespace"     $argv "" argv]

# read file
set input_pathname [lindex $argv 0]

if {[catch {

	#
	# Create root node of the syntax tree
	#
	set txt(0) [exec cat $input_pathname]
	set typ(0) content
	set num 1

}]} {
	foreach line {
		""
		"Parse C++ file and output syntax tree."
		""
		"  usage: parse_cxx \[-whitespace\] \[-format {xml|tokens|source}\] <file>"
		""
		"The supported output formats are:"
		""
		"  xml    - XML-based representation"
		"  tokens - List of tokens (parser-internal representation)"
		"  source - Source as generated from syntax tree (for debugging)"
		""
		"If the '-whitespace' argument is specified, whitespaces get translated to tokens."
		""
	} {
		puts stderr $line
	}
	exit -1;
}

# do not stop parsing (this variable is only used for debugging)
set stop 0

#
# Replace all '&' characters in the original input
# because they cause trouble with the regexp command.
#
regsub -all {&} $txt(0) "³" txt(0)


##
# Extract expression from content
#
# All expressions that occur in the token types 'typelist'
# and that match the 'subexpr' criterion get replaced in
# their original token by a reference tag and form a new
# token of the type 'newtype'.
#
# The reference is coded as §<newtype><token number>°.
# Since the reference has the type coded in, we can
# match sub tokens of specific types via plain regular
# expressions.
##
proc extract {newtype subexpr typelist} {
	global num txt typ stop

	set old_num $num

	if {$stop} { return }

	for {set i 0} {$i < $old_num} {incr i} {
		if {[lsearch $typelist $typ($i)] > -1} {
			while {[regexp $subexpr $txt($i) mid]} {

				# new sub text
				#
				set typ($num) $newtype
				set txt($num) $mid

				# substitute expression by a reference
				#
				regsub $subexpr $txt($i) "§$newtype$num°" txt($i)

				incr num
			}
		}
	}
}


##
# Extract operations
#
# \param op_name  name of operator
# \param op_type  type of operator, can be "binary", "pre", or "post"
# \param op_dir   direction of application, can be "ltr" (left to
#                 right) or "rtl" (right to left)
##
proc extract_op {newtype op_name op_type op_dir typelist} {
	global num txt typ stop

	set old_num $num

	if {$stop} { return }

	# Extracting operators is context-sensitive. In particular,
	# unary operators must not be applied if they have an
	# operand as neighbor. Hence, we construct a pattern with
	# three subpatterns, one for the leading context, one for
	# the new operand sub token, and one for the trailing context.
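
	#
	# For example (schematically), after tokenization the text "a + b"
	# inside its surrounding token reads
	#
	#   §identifier1° §plus2° §identifier3°
	#
	# The "binary" pattern below folds such a sequence into one new
	# identifier token and leaves a single reference (e.g., §identifier4°)
	# behind. The token indices used here are made up for illustration.
	#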
	if {$op_dir == "ltr"} { set lpattern "" } else { set lpattern ".*" }

	set repl_left {\1}
	if {$op_type == "pre"} {
		set pattern "(^|$lpattern\(?:\[^i\]er|\[^e\]r|\[^r\\d\]\)\\d+°\\s*)(§$op_name\\d+°\\s*§identifier\\d+°)"
	}

	set repl_right {}
	if {$op_type == "post"} {
		set pattern "($lpattern)(§identifier\\d+°\\s*§$op_name\\d+°\\s*)((§(\[^i\]|i\[^d\]|id\[^e\]))|;|\$)"
	}

	set repl_right {\3}
	if {$op_type == "binary"} {
		set pattern "($lpattern)(§identifier\\d+°\\s*§$op_name\\d+°\\s*§identifier\\d+°)"
		set repl_right {}
	}

	for {set i 0} {$i < $old_num} {incr i} {
		if {[lsearch $typelist $typ($i)] > -1} {
			while {[regexp $pattern $txt($i) dummy lcontext match rcontext]} {

				# new sub text
				#
				set typ($num) $newtype
				set txt($num) $match
				set old_txt $txt($i)

				# substitute expression by a reference
				#
				regsub $pattern $txt($i) "$repl_left§$newtype$num°$repl_right" txt($i)

				incr num
			}
		}
	}
}


proc extract_operations { from } {

	set operators {
		{ doublecolon binary ltr }
		{ parenblk post ltr # function call }
		{ arrayindex post ltr }
		{ deref binary ltr }
		{ dot binary ltr }
		{ incr post ltr }
		{ decr post ltr }
		{ not pre rtl }
		{ tilde pre rtl }
		{ incr pre rtl }
		{ decr pre rtl }
		{ minus pre rtl }
		{ plus pre rtl }
		{ star pre rtl # deref }
		{ amper pre rtl # addrof }
		{ keysizeof pre rtl }
		{ parenblk pre rtl # cast }
		{ star binary ltr }
		{ div binary ltr }
		{ mod binary ltr }
		{ plus binary ltr }
		{ minus binary ltr }
		{ lshift binary ltr }
		{ rshift binary ltr }
		{ less binary ltr }
		{ lessequal binary ltr }
		{ greater binary ltr }
		{ greaterequal binary ltr }
		{ equal binary ltr }
		{ notequal binary ltr }
		{ amper binary ltr # bitand }
		{ xor binary ltr }
		{ bitor binary ltr }
		{ and binary ltr }
		{ or binary ltr }
		{ cond binary ltr }
		{ assign binary rtl }
		{ assignopplus binary rtl }
		{ assignopminus binary rtl }
		{ assignopmult binary rtl }
		{ assignopdiv binary rtl }
		{ assignopmod binary rtl }
		{ assignopbitand binary rtl }
		{ assignopbitxor binary rtl }
		{ assignopbitor binary rtl }
		{ assignoplshift binary rtl }
		{ assignoprshift binary rtl }
		{ keythrow pre rtl }
		{ comma binary ltr }
	}

	foreach op $operators {
		set op_name [lindex $op 0]
		set op_type [lindex $op 1]
		set op_dir  [lindex $op 2]
		extract_op identifier $op_name $op_type $op_dir $from
	}
}


proc extract_enum_operations { from } {

	set operators {
		{ doublecolon binary ltr }
		{ parenblk post ltr # function call }
		{ not pre rtl }
		{ tilde pre rtl }
		{ minus pre rtl }
		{ plus pre rtl }
		{ amper pre rtl # addrof }
		{ keysizeof pre rtl }
		{ star binary ltr }
		{ div binary ltr }
		{ mod binary ltr }
		{ plus binary ltr }
		{ minus binary ltr }
		{ lshift binary ltr }
		{ rshift binary ltr }
		{ less binary ltr }
		{ lessequal binary ltr }
		{ greater binary ltr }
		{ greaterequal binary ltr }
		{ equal binary ltr }
		{ notequal binary ltr }
		{ amper binary ltr # bitand }
		{ xor binary ltr }
		{ bitor binary ltr }
		{ and binary ltr }
		{ or binary ltr }
		{ cond binary ltr }
	}

	foreach op $operators {
		set op_name [lindex $op 0]
		set op_type [lindex $op 1]
		set op_dir  [lindex $op 2]
		extract_op identifier $op_name $op_type $op_dir $from
	}
}


##
# Refine types of sub tokens
#
# This function changes the type of sub tokens of the specified
# environment token to the specified replacement type. It is
# used to specialize token types depending on their environment,
# for example, for turning blocks within classes into specialized
# declaration blocks, to which other rules apply than to
# function-body blocks.
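#
# For instance, the call 'refine_sub_tokens class block classblock'
# (used further below) relabels every block token referenced inside a
# class token as a classblock token.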
##
proc refine_sub_tokens {env_type sub_type repl_sub_type} {
	global num txt typ stop

	if {$stop} { return }

	# iterate through token list in search of env-typed tokens
	for {set i 0} {$i < $num} {incr i} {
		if {$typ($i) == $env_type} {
			set env $txt($i)

			while {[regexp "§$sub_type\(\\d+)°" $env dummy sub_token_idx]} {
				set typ($sub_token_idx) $repl_sub_type
				regsub "§$sub_type\(\\d+)°" $env "§$repl_sub_type$sub_token_idx°" env
			}

			# update environment token
			set txt($i) $env
		}
	}
}


#####################################################
## Rules for splitting the input into its elements ##
#####################################################

#
# Starting with only the root token (content0) of the syntax tree
# containing the whole source code as one string, we extract
# typed sub tokens to partition the string into parts of distinct
# meanings (token types). In the process of subsequently
# applying extraction rules to specific token types, a syntax
# tree is formed.
#

# extract line comments
extract lcomment {/\*[^\n]*?\*/} content

# extract multi-line comments
extract mlcomment {/\*.*?\*/} content

extract quotedchar {'(.|\\.)'} content

# extract strings
#
# Strings may contain quoted '"' characters.
#
extract string {\"(\\\"|[^\"])*?\"} content

# extract C++-style comments
extract cxxcomment {\/\/[^\n]*} content

# extract preprocessor directives
#
# Preprocessor macros may span over multiple lines if a
# backslash is supplied at the end of each line.
#
extract preproc {#([^\n]|\\\n)*} content
extract preprefix {#} preproc

# extract keywords
foreach keyword {
	private public protected unsigned extern while for if else switch do
	return typedef static_cast reinterpret_cast dynamic_cast using namespace
	class struct union enum template const inline static virtual friend
	explicit volatile case default operator new throw alignas try catch
	continue sizeof asm override typename constexpr
	GENODE_RPC GENODE_RPC_THROW GENODE_RPC_INTERFACE
	GENODE_RPC_INTERFACE_INHERIT GENODE_TYPE_LIST } {

	set keytag $keyword
	regsub -all {_} $keytag "" keytag
	set keytag [string tolower $keytag]
	extract "key$keytag" "\\m$keyword\\M" content
}

# extract extern "C"
extract "keyexternc" {§keyextern\d+°\s*§string\d+°} content

# fold parentheses and blocks
extract parenblk {\([^()]*?\)} {content parenblk}
extract block {\{[^{}]*?\}} {content parenblk block}
extract openbrace "\{" block
extract closebrace "\}" block
extract openparen {\(} parenblk
extract closeparen {\)} parenblk

extract externcblk {§keyexternc\d+°\s*§block\d+°} content

# extract template argument blocks
extract tplargs {<[^<>{}]*>$} {content block parenblk}
extract tplargs {<[^<>{}]*>(?=[^>])} {content block parenblk}

# extract special characters
extract equal {==} {content block parenblk tplargs}
extract assignopplus {\+=} {content block parenblk tplargs}
extract assignopminus {\-=} {content block parenblk tplargs}
extract assignopmult {\*=} {content block parenblk tplargs}
extract assignopdiv {\/=} {content block parenblk tplargs}
extract assignopmod {%=} {content block parenblk tplargs}
extract assignopbitor {\|=} {content block parenblk tplargs}
extract assignopbitand {³=} {content block parenblk tplargs}
extract assignopbitxor {\^=} {content block parenblk tplargs}
extract assignopneq {\!=} {content block parenblk tplargs}
extract assignoplshift {<<=} {content block parenblk tplargs}
extract assignoprshift {>>=} {content block parenblk tplargs}
extract incr {\+\+} {content block parenblk tplargs}
extract decr {\-\-} {content block parenblk tplargs}
extract doublecolon {::} {content block parenblk tplargs}
extract or {\|\|} {content block parenblk tplargs}
extract bitor {\|} {content block parenblk tplargs}
extract and {³³} {content block parenblk tplargs}
extract amper {³} {content block parenblk tplargs}
extract plus {\+} {content block parenblk tplargs}
extract div {\/} {content block parenblk tplargs}
extract star {\*} {content block parenblk tplargs}
extract notequal {\!=} {content block parenblk tplargs}
extract not {\!} {content block parenblk tplargs}
extract deref {\->} {content block parenblk tplargs}
extract dot {\.} {content block parenblk tplargs}
extract tilde {~} {content block parenblk tplargs}
extract lshift {<<} {content block parenblk tplargs}
extract rshift {>>} {content block parenblk tplargs}
extract greaterequal {>=} {content block parenblk tplargs}
extract lessequal {<=} {content block parenblk tplargs}
extract greater {>} {content block parenblk tplargs}
extract less {<} {content block parenblk tplargs}
extract minus {\-} {content block parenblk tplargs}
extract mod {%} {content block parenblk tplargs}
extract xor {\^} {content block parenblk tplargs}
extract question {\?} {content block parenblk tplargs}
extract comma {,} {content block parenblk tplargs}
extract assign {=} {content block parenblk tplargs}

extract attribute {__attribute__\s*§parenblk\d+°} {content block parenblk}
extract alignas {§keyalignas\d+°\s*§parenblk\d+°} {content block parenblk}

# extract identifiers
extract identifier {([\w_][\w\d_]*)+(?=[^°]*(§|$))} {content parenblk block tplargs}
extract identifier {§quotedchar\d+°} {content parenblk block tplargs}

# merge template arguments with the preceding identifier
extract identifier {§identifier\d+°\s*§tplargs\d+°} {content block parenblk tplargs}

# extract using namespace
extract using {§keyusing\d+°\s*§keynamespace\d+°\s*§identifier\d+°\s*;} {content block}

# extract casted identifiers, thereby potentially creating new valid assignments
extract identifier {§key(static|dynamic|reinterpret)cast\d+°\s*§tplargs\d+°\s*§parenblk\d+°} {block}

#
# XXX the C++ precedence rules are not fully implemented
#

# extract namespaced identifiers
extract identifier {§identifier\d+°\s*§doublecolon\d+°\s*§identifier\d+°} {content block}

# extract identifiers in the root namespace
extract identifier {§doublecolon\d+°\s*§identifier\d+°} {content block}

extract whilecond {§keywhile\d+°\s*§parenblk\d+°} block
extract forcond {§keyfor\d+°\s*§parenblk\d+°} block
extract ifcond {§keyif\d+°\s*§parenblk\d+°} block
extract switchcond {§keyswitch\d+°\s*§parenblk\d+°} block
extract catchcond {§keycatch\d+°\s*§parenblk\d+°} block

# extract forward declarations of structs and classes
extract classdecl {§keyclass\d+°\s*§identifier\d+°\s*;} {content block}
extract structdecl {§keystruct\d+°\s*§identifier\d+°\s*;} {content block}

# extract classes
extract class {(§mlcomment\d+° *\n[ \t]*)?§keyclass\d+°\s*(§alignas\d+°\s*)?§identifier\d+°[^;]*;} {content block}
extract struct {(§mlcomment\d+° *\n[ \t]*)?§keystruct\d+°\s*(§alignas\d+°\s*)?§identifier\d+°[^;]*;} {content block}
extract union {(§mlcomment\d+° *\n[ \t]*)?§keyunion\d+°\s*§identifier\d+°[^;]*;} {content block}
extract enum {(§mlcomment\d+° *\n[ \t]*)?§keyenum\d+°\s*[^;]*;} {content block}

extract inherit {:.*?(?=\s*§block\d+°)} {class struct union}

# partition block types into more expressive sub types
refine_sub_tokens class block classblock
refine_sub_tokens struct block classblock
refine_sub_tokens union block classblock
refine_sub_tokens enum block enumblock
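
#
# At this point, an enum entry such as "VALUE = 16" inside an enumblock is
# represented as a sequence like "§identifier° §assign° §identifier°"
# (per-token indices omitted here for brevity), which the enumentry and
# enumvalue rules below fold step by step.
#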
extract_enum_operations enumblock

#enumvalue {§identifier\d+°[^,]*?(?=§comma\d+°)} enumblock
extract enumentry {§identifier\d+°\s*§assign\d+°\s*§identifier\d+°} enumblock
extract enumvalue {§identifier\d+°$} enumentry
extract enumentry {§identifier\d+°} enumblock

# extract template classes
extract tplclassdecl {(§mlcomment\d+°[\t ]*\n[\t ]*)?§keytemplate\d+°\s*§tplargs\d+°\s*§classdecl\d+°} {content block classblock}
extract tplstructdecl {(§mlcomment\d+°[\t ]*\n[\t ]*)?§keytemplate\d+°\s*§tplargs\d+°\s*§structdecl\d+°} {content block classblock}
extract tplclass {(§mlcomment\d+°[\t ]*\n[\t ]*)?§keytemplate\d+°\s*§tplargs\d+°\s*§class\d+°} {content block classblock}
extract tplstruct {(§mlcomment\d+°[\t ]*\n[\t ]*)?§keytemplate\d+°\s*§tplargs\d+°\s*§struct\d+°} {content block classblock}

refine_sub_tokens tplclassdecl classdecl class;
refine_sub_tokens tplstructdecl structdecl class;

extract arrayindex {\[[^\]]*\]} {content classblock block arrayindex}

# detect case labels within switch statements and protection labels
extract caselabel {§keycase\d+°[^:]+:} {block}
extract caselabel {§keydefault\d+°:} {block}

foreach keyword { private public protected } {
	set label label
	extract "$keyword$label" "§key$keyword\\d+°:" {classblock}
}

extract identifier {§identifier\d+°+\s*§doublecolon\d+°\s*§identifier\d+°} {content classblock}

# extract class initializer list
extract initializer {:\s*§identifier\d+°\s*§parenblk\d+°(\s*§comma\d+°\s*§identifier\d+°\s*§parenblk\d+°)*} {content classblock}

extract colon {:} {initializer inherit}

# extract asm blocks
extract asm {§keyasm\d+°\s*(§keyvolatile\d+°)?\s*§parenblk\d+°} {content block}

# extract Genode-specific RPC declaration macros
set genode_macros { genoderpc genoderpcthrow genoderpcinterface genoderpcinterfaceinherit genodetypelist }
foreach key $genode_macros {
	extract $key "§key$key\\d+°\\s*§parenblk\\d+°\\s*" { classblock parenblk } }
foreach key $genode_macros {
	refine_sub_tokens $key parenblk macroargblk }

# extract functions
extract operatorfunction {§keyoperator\d+°\s*§[^°]+\d+°\s*§parenblk\d+°} {content classblock}
extract funcptr {§parenblk\d+°\s*§parenblk\d+°(\s*§attribute\d+°)?} {content classblock block identifier parenblk}
extract function {§identifier\d+°\s*§parenblk\d+°(\s*§attribute\d+°)?} {content classblock block initializer}
extract operator {§keyoperator\d+°\s*§[^ ]+\d+°} operatorfunction
extract destfunction {(§identifier\d+°§doublecolon\d+°)?§tilde\d+°§identifier\d+°\s*§parenblk\d+°} {content classblock}
extract identifier {(§identifier\d+°§doublecolon\d+°)?§tilde\d+°§identifier\d+°} destfunction
extract identifier {§identifier\d+°\s*§parenblk\d+°} {parenblk block identifier initializer tplargs}
extract identifier {§parenblk\d+°} {parenblk block}

# extract arrays
extract array {(§identifier\d+°\s*)(§arrayindex\d+°\s*)+} {content classblock block}
extract identifier {§array\d+°} {content classblock block}

# extract assignments
extract identifier {(?=(\s*|;))(§star\d+°\s*)*§identifier\d+°\s*§assign\w*\d+°[^;]*} block

# extract throw statements
extract identifier {(?=(\s*|;))§keythrow\d+°\s*[^;]*} block

# extract stream operators
#extract lhidentifier {(?=(\s*|;))[^;]*?§(lshift|rshift)\d+°[^;]*} block

# extract uses of the new operator
extract identifier {§keynew\d+°\s*(§parenblk\d+°\s*)?§function\d+°} block

# extract return statements
extract return {§keyreturn\d+°[^;]*} {block}

# extract modifiers
extract modifier {(§key(extern|externc|constexpr|static|inline|virtual|volatile)\d+°\s*)+} {content classblock block}
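
#
# By now, a declaration such as "static int foo(char c);" at content or
# classblock level is schematically represented as
#
#   §modifier° §identifier° §function°;
#
# where the function token wraps the name together with its argument
# parenblk (token indices omitted for brevity). The funcdecl and
# funcimpl rules below match such sequences.
#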
# extract function declarations
extract funcdecl {(§mlcomment\d+° *\n[ \t]*)?(§(modifier|keyunsigned|keyconst)\d+°\s*)*§(identifier|keyunsigned|keyconst)\d+°(\s|(§amper\d+°)|(§star\d+°))*§(operator)?function\d+°\s*(§(keyconst|keyoverride)\d+°\s*)*(§assign\d+°\s*§identifier\d+°)?\s*;} {content block classblock}

# extract function implementations
extract funcimpl {(§mlcomment\d+° *\n[ \t]*)?(§(modifier|keyunsigned|keyconst)\d+°\s*)*(§(identifier|keyunsigned|keyconst)\d+°\s*)+(\s|(§amper\d+°)|(§star\d+°))*§(operator)?function\d+°\s*(§(keyconst|keyoverride)\d+°\s*)*(§attribute\d+°\s*)*§block\d+°[;\t ]*} {content block classblock}
extract funcimpl {(§mlcomment\d+° *\n[ \t]*)?§operatorfunction\d+°\s*(§modifier\d+°\s*)?§block\d+°[;\t ]*} {content block classblock}

# extract template functions
extract tplfunc {(§mlcomment\d+° *\n[ \t]*)?§keytemplate\d+°\s*§tplargs\d+°\s*(§attribute\d+°\s*)*§funcimpl\d+°} {content block classblock}

# extract template function declarations
extract tplfuncdecl {(§mlcomment\d+° *\n[ \t]*)?§keytemplate\d+°\s*§tplargs\d+°\s*§funcdecl\d+°} {content block classblock}

# extract destructor implementations
extract destimpl {(§mlcomment\d+° *\n[ \t]*)?(§modifier\d+°\s*)?§tilde\d+°§function\d+°\s*§block\d+°[;\t ]*} {content classblock}
refine_sub_tokens destimpl destfunction function

# extract constructor implementations
extract constimpl {(§mlcomment\d+° *\n[ \t]*)?(§(modifier|keyexplicit)\d+°\s*)*§function\d+°\s*(§initializer\d+°\s*)?\s*§block\d+°[;\t ]*} {content classblock}

# extract template constructors
extract tplfunc {(§mlcomment\d+° *\n[ \t]*)?§keytemplate\d+°\s*§tplargs\d+°\s*§constimpl\d+°} {content block classblock}

# extract destructor declarations
extract destdecl {(§mlcomment\d+° *\n[ \t]*)?(§modifier\d+°\s*)?§tilde\d+°§function\d+°\s*(§assign\d+°\s+§identifier\d+°)?\s*;} {classblock}

# extract constructor declarations
extract constdecl {(§mlcomment\d+° *\n[ \t]*)?(§keyexplicit\d+°[ \t]*)?§function\d+°\s*(§assign\d+°\s+§identifier\d+°)?\s*;} {classblock}

# extract friendship declarations
extract frienddecl {§keyfriend\d+°\s*§classdecl\d+°} {classblock}

# classify function signatures and their containing argument-parenthesis blocks
foreach env_type [list destdecl constdecl destimpl constimpl funcimpl funcdecl] {
	refine_sub_tokens $env_type function funcsignature }
refine_sub_tokens funcsignature parenblk argparenblk
refine_sub_tokens operatorfunction parenblk argparenblk

extract_operations parenblk

extract argmodifier {(§key(const|volatile)\d+°\s*)+} {argparenblk}

# extract pure-virtual assignments
extract virtassign {§assign\d+°\s+§identifier\d+°} funcdecl

# extract return values
extract retval {(§(identifier|keyunsigned|keyconst|star|amper)\d+°\s*)+(?=§funcsignature)} {funcdecl funcimpl}
extract retval {(§(identifier|keyunsigned|keyconst|star|amper)\d+°\s*)+(?=§operatorfunction)} {funcdecl funcimpl}
extract identifier {§(keyunsigned|keyconst)\d+°\s*(§identifier\d+°)?} {retval}

# extract argument declarations separated by commas
refine_sub_tokens tplargs greater closeparen
refine_sub_tokens tplargs less openparen
extract varargs {(§dot\d+°){3}} {argparenblk tplargs}
extract keytypename {§keytypename\d+°\s*§varargs\d+°} tplargs
extract argdecl {(§(argmodifier|keytypename|keyunsigned|identifier|tilde|minus|amper|star|and|varargs|assign|string)\d+°\s*)+(?=§comma)} {argparenblk tplargs}
extract argdecl {(§(argmodifier|keytypename|keyunsigned|identifier|tilde|minus|amper|star|and|varargs|assign|string)\d+°\s*)+(?=§closeparen)} {argparenblk tplargs}
extract argdefault {§assign\d+°.*} argdecl
extract argname {§identifier\d+°\s*(?=§argdefault)} {argdecl}

# there may be just a type and no name
extract argtype {^\s*§identifier\d+°\s*$} {argdecl}

# the last identifier is the name
extract argname {§identifier\d+°\s*$} {argdecl}

extract argtype {^(§(argmodifier|keyunsigned)\d+°\s*)*(§(identifier|keytypename|varargs|keyunsigned)\d+°)(\s*|(§(amper|and|argmodifier)\d+°)|(§star\d+°))*(§argmodifier\d+°\s*)*(§varargs\d+°)?} argdecl

# extract typedefs
extract typedef {(§mlcomment\d+° *\n[ \t]*)?§keytypedef\d+°(\s*§(identifier|keyunsigned|keytypename)\d+°)+\s*;} {content classblock block}
extract typename {§identifier\d+°(?=;)} typedef
extract identifier {(\s*§(identifier|keyunsigned)\d+°){2,}} typedef
extract identifier {\s*§keyunsigned\d+°} typedef

# extract function pointers
extract vardecl {(§(modifier|keyunsigned)\d+°\s*)*(§(identifier|keyunsigned)\d+°)((\s|(§amper\d+°)|(§star\d+°))*(§modifier\d+°\s*)*(§funcptr\d+°)\s*(:\s*§identifier\d+°)?\s*(§assign\d+°[^;]*?)?\s*(§comma\d+°)?\s*)+;} {content classblock block}

# extract variable declarations (type + any number of comma-separated variables + optional trailing comment)
extract vardecl {(§alignas\d+°\s*)?(§(modifier|keyunsigned)\d+°\s*)*(§(identifier|keyunsigned)\d+°)((\s|(§amper\d+°)|(§star\d+°))*(§(modifier|keyconst)\d+°\s*)*(§(identifier|array)\d+°)\s*(:\s*§identifier\d+°)?\s*(§assign\d+°[^;]*?)?(§block\d+°)?\s*(§comma\d+°)?\s*)+;} {content classblock block}

# extract commented variable declarations
extract commentedvardecl {§vardecl\d+°\s*§m?lcomment\d+°(\s*§lcomment\d+°)*} {content classblock block}

# extract valid declaration sequences
set elem "(mlcomment|lcomment|vardecl|array|commentedvardecl|typedef|funcimpl|funcdecl|enum|class|struct|union|constimpl|constdecl|destimpl|destdecl|tplfunc|tplfuncdecl|tplstruct|tplstructdecl|tplclass|tplclassdecl|frienddecl|classdecl|structdecl)"
extract declseq "§$elem\\d+°(\\s*§$elem\\d+°)*" {classblock}

# group protection scopes with corresponding declaration sequences
foreach keyword { private public protected } {
	set label label
	extract $keyword "§$keyword$label\\d+°\\s*§declseq\\d+°" {classblock}
}

# extract protection-scope labels
extract label {§key(private|public|protected)\d+°:} {private public protected}

# extract namespaces
extract namespace {§keynamespace\d+°\s*§identifier\d+°\s*§block\d+°} {content block}
refine_sub_tokens namespace block namespaceblock

#
# The remaining block tokens are code blocks. So we can
# apply code-specific rules to them.
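#
# For example, a line like "x = y + 1;" within a function body has
# already been folded into a single identifier token by the assignment
# rule above, so the statement rules below merely wrap it, together
# with its terminating semicolon, into a statement token.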
#
extract identifier {§function\d+°} block

extract_operations {block identifier}

# extract statements from remaining code blocks
extract statement {§asm\d+°;} block
extract statement {§identifier\d+°;} block
extract statement {§return\d+°;} {block}
extract statement {§keycontinue\d+°\s*;} block

# extract try-catch statements
extract statement {§keytry\d+°\s*§block\d+°(\s*§catchcond\d+°\s*§block\d+°)+} {block}

# wrap blocks into statements
extract statement {§block\d+°} {block statement}

# empty statements (all normal semicolons should be encapsulated in statements now)
extract statement {;} {block}

# turn control structures into statements
set pattern_ifelse {(§ifcond\d+°(\s|§m?lcomment\d+°)*§statement\d+°(\s|§m?lcomment\d+°)*§keyelse\d+°(\s|§m?lcomment\d+°)*§statement\d+°)}
set pattern_if {(§ifcond\d+°(\s|§m?lcomment\d+°)*§statement\d+°(?!(\s|§m?lcomment\d+°)*§keyelse))}
set pattern_for {(§(while|for|switch)cond\d+°(\s|§m?lcomment\d+°)*§statement\d+°)}
extract statement "($pattern_ifelse|$pattern_if|$pattern_for)" {block statement}

# extract control-structure types
extract ifelse $pattern_ifelse {statement}
extract if $pattern_if {statement}
extract for {§forcond\d+°(\s|§m?lcomment\d+°)*(§statement\d+°|;)} {statement}
extract while {§whilecond\d+°(\s|§m?lcomment\d+°)*(§statement\d+°|;)} {statement}
extract switch {§switchcond\d+°(\s|§m?lcomment\d+°)*§statement\d+°} {statement}

# turn control-flow elements into statements
foreach type { ifelse if while for switch try } {
	extract statement "§$type\\d+°" block }

# extract valid code sequences
set elem "(mlcomment|vardecl|statement|lcomment)"
extract codeseq "§$elem\\d+°(\\s*§$elem\\d+°)*" {block}

#
# Extract line breaks, spaces, and tabs from all types
#
if {$config_whitespace} {
	set all_types ""
	for {set i 0} {$i < $num} {incr i} {
		if {[lsearch $all_types $typ($i)] == -1} {
			lappend all_types $typ($i) }}

	extract line {\n} $all_types
	extract align { +(?= )} $all_types
	extract space { } $all_types
	extract tab {\t} $all_types
}


###############################
## Back-end helper functions ##
###############################

##
# Return name of reference token with specified index
##
proc token_by_idx {idx} { global typ; return "$typ($idx)$idx" }

##
# Return index of specified reference token
##
proc idx_of_token {token} {
	regexp {[0-9]+} $token idx
	return $idx
}

##
# Return type of specified reference token
##
proc type_of_token {token} {
	regexp {[a-z]+} $token type
	return $type
}

##
# Return marker for reference token
##
proc marker {token} { return "§$token°" }

##
# Return text referenced by token
##
proc token_text {token} {
	global txt
	return $txt([idx_of_token $token])
}

##
# Assign a line number to each reference token
#
# To be able to provide error messages including line numbers, we
# determine the line number for each reference token and store it
# as an attribute.
#
# The result of the function is stored in the global 'ln' array.
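#
# For example, after this pass, 'ln(7)' holds the number of the source
# line on which the token with index 7 starts.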
##
proc assign_line_numbers {{token content0}} {
	global ln curr_ln config_whitespace

	if {$token == "content0"} { set curr_ln 1 }

	# assign current line number to current token
	set ln([idx_of_token $token]) $curr_ln

	# count occurrences of line breaks
	if {[type_of_token $token] == "line"} { incr curr_ln }
	if {!$config_whitespace && ($token == "\n")} { incr curr_ln }

	# count lines for all sub-tokens
	set tex [token_text $token]
	while {$tex != ""} {

		# count and eat raw line breaks (needed if 'whitespace' option is disabled)
		if {[regexp {^\n} $tex dummy]} {
			if {!$config_whitespace} { incr curr_ln }
			regsub {\n} $tex "" tex
		}

		# ignore plain text
		if {[regexp {^[^§\n]+} $tex plain]} {
			regsub {^[^§\n]+} $tex "" tex
		}

		# traverse into token
		if {[regexp {^§(.+?)°} $tex dummy token]} {
			assign_line_numbers $token
			regsub {§(.+?)°} $tex "" tex
		}
	}
}

##
# Look up line number of specified reference token
##
proc line_number {token} {
	global ln
	return $ln([idx_of_token $token])
}

##
# Output tokens as valid Tcl List
#
# The result of this function can be used directly
# as input by another Tcl script.
##
proc dump_tokens { } {
	global num typ txt

	set tokens [list]
	for {set i 0} {($i < $num)} {incr i} {
		set token [token_by_idx $i]
		set text $txt($i)
		lappend tokens [list $token [line_number $token] $text]
	}
	puts $tokens
}


##########################
## Source-code back end ##
##########################

##
# Output syntax tree as source code
#
# This constructs the source code from the syntax tree. It is
# useful to check the result against the input to make sure that
# no information gets lost during the parsing procedure.
##
proc dump_source { } {
	global num typ txt

	set output $txt(0)
	while {[regexp {§(.+?)°} $output dummy token]} {
		regsub $dummy $output [token_text $token] output
	}

	# revert character substitutions of '&'
	regsub -all {³} $output "\\\&" output

	puts $output
}


##################
## XML back end ##
##################

proc dump_xml_subtree {token} {
	global dump_xml_indent line

	set type [type_of_token $token]
	set tex  [token_text $token]
	set line [line_number $token]

	# shorten frequent leaf nodes
	if {$type == "line"} {
		puts "$dump_xml_indent<line/>"
	} elseif {$type == "tab"} {
		puts "$dump_xml_indent<tab/>"
	} elseif {$type == "space"} {
		puts "$dump_xml_indent<space/>"
	} elseif {$type == "align"} {
		puts "$dump_xml_indent<align>$tex</align>"
	} else {
		puts "$dump_xml_indent<$type line=\"$line\">"
		set dump_xml_indent " $dump_xml_indent"

		while {$tex != ""} {

			# consume plain text
			if {[regexp {^[^§]+} $tex plain]} {

				# perform character substitutions for xml compliance
				regsub -all {³}  $plain "\\\&amp;"  plain
				regsub -all {<}  $plain "\\\&lt;"   plain
				regsub -all {>}  $plain "\\\&gt;"   plain
				regsub -all "\"" $plain "\\\&quot;" plain
				regsub -all "'"  $plain "\\\&apos;" plain

				puts "$dump_xml_indent$plain"
				regsub {^[^§]+} $tex "" tex
			}

			# consume token
			if {[regexp {§(.+?)°} $tex dummy token]} {
				dump_xml_subtree $token
				regsub {§(.+?)°} $tex "" tex
			}
		}
		regsub " " $dump_xml_indent "" dump_xml_indent
		puts "$dump_xml_indent</$type>"
	}
}

##
# Output syntax tree as xml
##
proc dump_xml { } {

	# reset indentation level
	global dump_xml_indent
	set dump_xml_indent ""

	# output subtree beginning with the root node
	dump_xml_subtree content0
}


##################
## Main program ##
##################

assign_line_numbers

if {$config_out_tokens} { dump_tokens }
if {$config_out_xml}    { dump_xml }
if {$config_out_source} { dump_source }