#!/usr/bin/tclsh
#
# \brief Regular-expression-based C++ parser
# \author Norman Feske
# \date 2007-08-15
#
#
# Evaluate the command line
#
# Every recognised option is cut out of 'argv'. The substitution count
# returned by 'regsub' (0 or 1) serves as the flag of the option.
#
set config_out_xml    [regsub -- "-format +xml"    $argv "" argv]
set config_out_tokens [regsub -- "-format +tokens" $argv "" argv]
set config_out_source [regsub -- "-format +source" $argv "" argv]
set config_whitespace [regsub -- "-whitespace"     $argv "" argv]

# what is left over names the file to parse
set input_pathname [lindex $argv 0]

#
# Create the root node of the syntax tree from the content of the input
# file. If that fails for any reason, print the usage text and give up.
#
if {[catch {
	set txt(0) [exec cat $input_pathname]
	set typ(0) content
	set num 1
}]} {
	puts stderr [join {
		""
		"Parse C++ file and output syntax tree."
		""
		" usage: parse_cxx \[-whitespace\] \[-format {xml|tokens|source}\] <source_file>"
		""
		"The supported output formats are:"
		""
		" xml - XML-based representation"
		" tokens - List of tokens (parser-internal representation)"
		" source - Source as generated from syntax tree (for debugging)"
		""
		"If the '-whitespace' argument is specified, whitespaces get translated to tokens."
		""
	} "\n"]
	exit -1
}
# parsing carries on as long as 'stop' is 0 (the flag is a debugging aid only)
set stop 0

#
# The regexp command has trouble with '&' characters. Hence, each '&'
# of the original input is swapped for a placeholder sequence.
#
set txt(0) [string map [list "&" "<22> "] $txt(0)]
##
# Extract expression from content
#
# Each part of a token that matches 'subexpr' is cut out of its hosting
# token and becomes a new token of the type 'newtype'. Only hosting
# tokens whose type is listed in 'typelist' are considered. In the
# hosting token, the matched text is replaced by a reference tag.
#
# A reference consists of an opening marker, the token type directly
# followed by the token index, and a closing marker. Because the type
# is part of the reference, sub tokens of a specific type can be
# matched with plain regular expressions.
#
# \param newtype   type of the tokens to create
# \param subexpr   regular expression selecting the text to extract
# \param typelist  types of the tokens to search
##
proc extract {newtype subexpr typelist} {
	global num txt typ stop

	if {$stop} { return }

	# tokens created during this call are not searched themselves
	set limit $num

	set i 0
	while {$i < $limit} {
		if {[lsearch $typelist $typ($i)] >= 0} {

			# fold one match after the other until none is left
			while {[regexp $subexpr $txt($i) mid]} {

				# the matched text becomes the new token
				set typ($num) $newtype
				set txt($num) $mid

				# the hosting token keeps a reference only
				regsub $subexpr $txt($i) "<22> $newtype$num<75> " txt($i)
				incr num
			}
		}
		incr i
	}
}
##
# Extract operations
#
# Each application of the operator token 'op_name' to its identifier
# operand(s) is folded into a new token of the type 'newtype'. The
# hosting token keeps a reference to the new token.
#
# \param newtype   type of the tokens to create
# \param op_name   name of operator
# \param op_type   type of operator, can be "binary", "pre", or "post"
# \param op_dir    direction of application, can be "ltr" (left to
#                  right) or "rtl" (right to left)
# \param typelist  types of the tokens to search
##
proc extract_op {newtype op_name op_type op_dir typelist} {
	global num txt typ stop
	set old_num $num
	if {$stop} { return }

	# Extracting operators is context-sensitive. In particular,
	# unary operators must not be applied if they have an
	# operand as neighbor. Hence, we construct a pattern with
	# three subpatterns, one for the leading context, one for
	# the new operand sub token, and one for the trailing context.

	# a greedy leading context makes the right-most candidate match first
	if {$op_dir == "ltr"} {
		set lpattern ""
	} else {
		set lpattern ".*"
	}

	# the leading context is always re-inserted in front of the reference
	set repl_left {\1}

	# only the "post" pattern has a trailing context that must be kept
	switch -- $op_type {
		pre {
			set pattern "(^|$lpattern\(?:\[^i\]er|\[^e\]r|\[^r\\d\]\)\\d+<2B> \\s*)(<28> $op_name\\d+<2B> \\s*<2A> identifier\\d+<2B> )"
			set repl_right {}
		}
		post {
			set pattern "($lpattern)(<28> identifier\\d+<2B> \\s*<2A> $op_name\\d+<2B> \\s*)((<28> (\[^i\]|i\[^d\]|id\[^e\]))|;|\$)"
			set repl_right {\3}
		}
		binary {
			set pattern "($lpattern)(<28> identifier\\d+<2B> \\s*<2A> $op_name\\d+<2B> \\s*<2A> identifier\\d+<2B> )"
			set repl_right {}
		}
		default {
			error "extract_op: unknown operator type '$op_type'"
		}
	}

	for {set i 0} {$i < $old_num} {incr i} {
		if {[lsearch $typelist $typ($i)] > -1} {
			while {[regexp $pattern $txt($i) dummy lcontext match rcontext]} {

				# new sub text #
				set typ($num) $newtype
				set txt($num) $match

				# substitute expression by a reference #
				regsub $pattern $txt($i) "$repl_left<66> $newtype$num<75> $repl_right" txt($i)
				incr num
			}
		}
	}
}
##
# Fold all known operator applications within tokens of the types 'from'
#
# The table lists the operators in the order in which they are applied.
# Each row holds the operator name, the operator type, and the direction
# of application. Anything behind these three elements is an annotation
# that is not evaluated.
##
proc extract_operations { from } {

	set op_table {
		{ doublecolon      binary ltr }
		{ parenblk         post   ltr # function call }
		{ arrayindex       post   ltr }
		{ deref            binary ltr }
		{ dot              binary ltr }
		{ incr             post   ltr }
		{ decr             post   ltr }
		{ not              pre    rtl }
		{ tilde            pre    rtl }
		{ incr             pre    rtl }
		{ decr             pre    rtl }
		{ minus            pre    rtl }
		{ plus             pre    rtl }
		{ star             pre    rtl # deref }
		{ amper            pre    rtl # addrof }
		{ keysizeof        pre    rtl }
		{ parenblk         pre    rtl # cast }
		{ star             binary ltr }
		{ div              binary ltr }
		{ mod              binary ltr }
		{ plus             binary ltr }
		{ minus            binary ltr }
		{ lshift           binary ltr }
		{ rshift           binary ltr }
		{ less             binary ltr }
		{ lessequal        binary ltr }
		{ greater          binary ltr }
		{ greaterequal     binary ltr }
		{ equal            binary ltr }
		{ notequal         binary ltr }
		{ amper            binary ltr # bitand }
		{ xor              binary ltr }
		{ bitor            binary ltr }
		{ and              binary ltr }
		{ or               binary ltr }
		{ cond             binary ltr }
		{ assign           binary rtl }
		{ assignopplus     binary rtl }
		{ assignopminus    binary rtl }
		{ assignopmult     binary rtl }
		{ assignopdiv      binary rtl }
		{ assignopmod      binary rtl }
		{ assignopbitand   binary rtl }
		{ assignopbitxor   binary rtl }
		{ assignopbitor    binary rtl }
		{ assignoplshift   binary rtl }
		{ assignoprshift   binary rtl }
		{ keythrow         pre    rtl }
		{ comma            binary ltr }
	}

	foreach row $op_table {

		# pick the first three elements of the row
		foreach {name kind direction} $row break

		extract_op identifier $name $kind $direction $from
	}
}
##
# Fold the operator applications considered for tokens of the types 'from'
#
# The table is a reduced variant of the one used by 'extract_operations'
# and is applied to enum blocks. Each row holds the operator name, the
# operator type, and the direction of application. Anything behind
# these three elements is an annotation that is not evaluated.
##
proc extract_enum_operations { from } {

	set op_table {
		{ doublecolon  binary ltr }
		{ parenblk     post   ltr # function call }
		{ not          pre    rtl }
		{ tilde        pre    rtl }
		{ minus        pre    rtl }
		{ plus         pre    rtl }
		{ amper        pre    rtl # addrof }
		{ keysizeof    pre    rtl }
		{ star         binary ltr }
		{ div          binary ltr }
		{ mod          binary ltr }
		{ plus         binary ltr }
		{ minus        binary ltr }
		{ lshift       binary ltr }
		{ rshift       binary ltr }
		{ less         binary ltr }
		{ lessequal    binary ltr }
		{ greater      binary ltr }
		{ greaterequal binary ltr }
		{ equal        binary ltr }
		{ notequal     binary ltr }
		{ amper        binary ltr # bitand }
		{ xor          binary ltr }
		{ bitor        binary ltr }
		{ and          binary ltr }
		{ or           binary ltr }
		{ cond         binary ltr }
	}

	foreach row $op_table {

		# pick the first three elements of the row
		foreach {name kind direction} $row break

		extract_op identifier $name $kind $direction $from
	}
}
##
# Refine types of sub tokens
#
# Within every token of the type 'env_type', each referenced sub token
# of the type 'sub_type' gets the type 'repl_sub_type' instead. Both
# the type of the sub token and the reference within the environment
# token are updated.
#
# This is used to specialize token types depending on their
# environment, for example, for turning blocks within classes into
# declaration blocks, for which other rules apply than for
# function-body blocks.
##
proc refine_sub_tokens {env_type sub_type repl_sub_type} {
	global num txt typ stop

	if {$stop} { return }

	# reference to a sub token of the old type, capturing the token index
	set ref_pattern "<22> $sub_type\(\\d+)<29> "

	for {set i 0} {$i < $num} {incr i} {

		# skip everything that is not an environment token
		if {$typ($i) != $env_type} { continue }

		set env $txt($i)
		while {[regexp $ref_pattern $env dummy sub_token_idx]} {
			set typ($sub_token_idx) $repl_sub_type
			regsub $ref_pattern $env "<22> $repl_sub_type$sub_token_idx<64> " env
		}

		# write back the updated environment token
		set txt($i) $env
	}
}
#####################################################
## Rules for splitting the input into its elements ##
#####################################################
#
# Starting with only the root token (content0) of the syntax tree
# containing the whole source code as one string, we extract
# typed sub tokens to partition the string into parts of distinct
# meanings (token types). In the process of subsequently
# applying extraction rules to specific token types, a syntax
# tree is formed.
#
# extract C-style comments that do not contain a line break
extract lcomment {/\*[^\n]*?\*/} content
# extract the remaining C-style comments, which may span multiple lines
extract mlcomment {/\*.*?\*/} content
# extract character literals (a single character, optionally backslash-escaped)
extract quotedchar {'(.|\\.)'} content
# extract strings
#
# Strings may contain quoted '"' characters.
#
extract string {\"([^\"]|\")*?\"} content
# extract C++-style comments (everything from '//' up to the end of the line)
extract cxxcomment {\/\/[^\n]*} content
# extract preprocessor directives
#
# Preprocessor macros may span over multiple lines if a
# backslash is supplied at the end of each line.
#
extract preproc {#([^\n]|\\\n)*} content
# split the '#' characters off the preprocessor directives
extract preprefix {#} preproc
# extract keywords
#
# The token type of a keyword is 'key' followed by the keyword in lower
# case with all underscores removed, e.g., 'static_cast' yields the
# type 'keystaticcast'. The keyword must match as a whole word.
#
foreach keyword {
	private public protected unsigned extern
	while for if else switch do return typedef
	static_cast reinterpret_cast dynamic_cast
	using namespace class struct union enum template
	const inline static virtual friend explicit
	volatile case default operator new throw
	try catch continue sizeof asm override typename constexpr
	GENODE_RPC GENODE_RPC_THROW
	GENODE_RPC_INTERFACE GENODE_RPC_INTERFACE_INHERIT
	GENODE_TYPE_LIST
} {
	set keytag $keyword
	regsub -all {_} $keytag "" keytag
	set keytag [string tolower $keytag]
	extract "key$keytag" "\\m$keyword\\M" content
}
# extract extern "C" (the 'extern' keyword followed by a string)
extract "keyexternc" {<7B> keyextern\d+<2B> \s*<2A> string\d+<2B> } content
# fold parenthesis and blocks
#
# The patterns exclude nested delimiters, so a pair can only match once
# everything enclosed by it has been replaced by references.
#
extract parenblk {\([^()]*?\)} {content parenblk}
extract block {\{[^{}]*?\}} {content parenblk block}
# turn the delimiters themselves into tokens
extract openbrace "\{" block
extract closebrace "\}" block
extract openparen {\(} parenblk
extract closeparen {\)} parenblk
# an extern "C" token followed by a block
extract externcblk {<7B> keyexternc\d+<2B> \s*<2A> block\d+<2B> } content
# extract template argument blocks
#
# The first rule covers an argument block at the very end of a token,
# the second one an argument block that is not followed by '>'.
#
extract tplargs {<[^<>{}]*>$} {content block parenblk}
extract tplargs {<[^<>{}]*>(?=[^>])} {content block parenblk}
# extract special characters
#
# The order matters: operators consisting of multiple characters must be
# extracted before the operators that form a part of them.
#
extract equal {==} {content block parenblk tplargs}
extract assignopplus {\+=} {content block parenblk tplargs}
extract assignopminus {\-=} {content block parenblk tplargs}
extract assignopmult {\*=} {content block parenblk tplargs}
extract assignopdiv {\/=} {content block parenblk tplargs}
extract assignopmod {%=} {content block parenblk tplargs}
extract assignopbitor {\|=} {content block parenblk tplargs}
extract assignopbitand {<7B> =} {content block parenblk tplargs}
extract assignopbitxor {\^=} {content block parenblk tplargs}
# NOTE(review): this rule consumes every '!=' of the listed token types,
# so the 'notequal' rule further down, which uses the same pattern on the
# same types, finds nothing left to match - confirm which token type the
# consumers of the syntax tree expect before changing either rule
extract assignopneq {\!=} {content block parenblk tplargs}
extract assignoplshift {<<=} {content block parenblk tplargs}
extract assignoprshift {>>=} {content block parenblk tplargs}
extract incr {\+\+} {content block parenblk tplargs}
extract decr {\-\-} {content block parenblk tplargs}
extract doublecolon {::} {content block parenblk tplargs}
extract or {\|\|} {content block parenblk tplargs}
extract bitor {\|} {content block parenblk tplargs}
extract and {<7B> <> } {content block parenblk tplargs}
extract amper {<7B> } {content block parenblk tplargs}
extract plus {\+} {content block parenblk tplargs}
extract div {\/} {content block parenblk tplargs}
extract star {\*} {content block parenblk tplargs}
extract notequal {\!=} {content block parenblk tplargs}
extract not {\!} {content block parenblk tplargs}
extract deref {\->} {content block parenblk tplargs}
extract dot {\.} {content block parenblk tplargs}
extract tilde {~} {content block parenblk tplargs}
extract lshift {<<} {content block parenblk tplargs}
extract rshift {>>} {content block parenblk tplargs}
extract greaterequal {>=} {content block parenblk tplargs}
extract lessequal {<=} {content block parenblk tplargs}
extract greater {>} {content block parenblk tplargs}
extract less {<} {content block parenblk tplargs}
extract minus {\-} {content block parenblk tplargs}
extract mod {%} {content block parenblk tplargs}
extract xor {\^} {content block parenblk tplargs}
extract question {\?} {content block parenblk tplargs}
extract comma {,} {content block parenblk tplargs}
extract assign {=} {content block parenblk tplargs}
# extract attributes ('__attribute__' followed by a parenthesis block)
extract attribute {__attribute__\s*<2A> parenblk\d+<2B> } {content block parenblk}
# extract identifiers
extract identifier {([\w_][\w\d_]*)+(?=[^<5E> ]*(<28> |$))} {content parenblk block tplargs}
# character literals are handled like identifiers from here on
extract identifier {<7B> quotedchar\d+<2B> } {content parenblk block tplargs}
# merge template arguments with the predecessing identifier
extract identifier {<7B> identifier\d+<2B> \s*<2A> tplargs\d+<2B> } {content block parenblk tplargs}
# extract using namespace
extract using {<7B> keyusing\d+<2B> \s*<2A> keynamespace\d+<2B> \s*<2A> identifier\d+<2B> \s*;} {content block}
# extract casted identifiers and thereby potentially creating new valid assignments
extract identifier {<7B> key(static|dynamic|reinterpret)cast\d+<2B> \s*<2A> tplargs\d+<2B> \s*<2A> parenblk\d+<2B> } {block}
#
# XXX the C++ precedence rules are not fully implemented
#
# extract namespaced identifiers
extract identifier {<7B> identifier\d+<2B> \s*<2A> doublecolon\d+<2B> \s*<2A> identifier\d+<2B> } {content block}
# extract identifiers in the root namespace
extract identifier {<7B> doublecolon\d+<2B> \s*<2A> identifier\d+<2B> } {content block}
# extract the heads of control structures (keyword followed by a parenthesis block)
extract whilecond {<7B> keywhile\d+<2B> \s*<2A> parenblk\d+<2B> } block
extract forcond {<7B> keyfor\d+<2B> \s*<2A> parenblk\d+<2B> } block
extract ifcond {<7B> keyif\d+<2B> \s*<2A> parenblk\d+<2B> } block
extract switchcond {<7B> keyswitch\d+<2B> \s*<2A> parenblk\d+<2B> } block
extract catchcond {<7B> keycatch\d+<2B> \s*<2A> parenblk\d+<2B> } block
# extract forward declarations of structs and classes
extract classdecl {<7B> keyclass\d+<2B> \s*<2A> identifier\d+<2B> \s*;} {content block}
extract structdecl {<7B> keystruct\d+<2B> \s*<2A> identifier\d+<2B> \s*;} {content block}
# extract classes
#
# A multi-line comment that directly precedes the definition becomes
# part of the extracted token.
#
extract class {(<28> mlcomment\d+<2B> *\n[ \t]*)?<3F> keyclass\d+<2B> \s*<2A> identifier\d+<2B> [^;]*;} {content block}
extract struct {(<28> mlcomment\d+<2B> *\n[ \t]*)?<3F> keystruct\d+<2B> \s*<2A> identifier\d+<2B> [^;]*;} {content block}
extract union {(<28> mlcomment\d+<2B> *\n[ \t]*)?<3F> keyunion\d+<2B> \s*<2A> identifier\d+<2B> [^;]*;} {content block}
extract enum {(<28> mlcomment\d+<2B> *\n[ \t]*)?<3F> keyenum\d+<2B> \s*[^;]*;} {content block}
# everything from the ':' up to the block of a class, struct, or union
extract inherit {:.*?(?=\s*<2A> block\d+<2B> )} {class struct union}
# partition block types into more expressive sub types
refine_sub_tokens class block classblock
refine_sub_tokens struct block classblock
refine_sub_tokens union block classblock
refine_sub_tokens enum block enumblock
# fold the operator applications within enum blocks, then split the
# blocks into entries
extract_enum_operations enumblock
#enumvalue {<7B> identifier\d+<2B> [^,]*?(?=<3D> comma\d+<2B> )} enumblock
extract enumentry {<7B> identifier\d+<2B> \s*<2A> assign\d+<2B> \s*<2A> identifier\d+<2B> } enumblock
extract enumvalue {<7B> identifier\d+<2B> $} enumentry
extract enumentry {<7B> identifier\d+<2B> } enumblock
# extract template classes
extract tplclassdecl {(<28> mlcomment\d+<2B> [\t ]*\n[\t ]*)?<3F> keytemplate\d+<2B> \s*<2A> tplargs\d+<2B> \s*<2A> classdecl\d+<2B> } {content block classblock}
extract tplstructdecl {(<28> mlcomment\d+<2B> [\t ]*\n[\t ]*)?<3F> keytemplate\d+<2B> \s*<2A> tplargs\d+<2B> \s*<2A> structdecl\d+<2B> } {content block classblock}
extract tplclass {(<28> mlcomment\d+<2B> [\t ]*\n[\t ]*)?<3F> keytemplate\d+<2B> \s*<2A> tplargs\d+<2B> \s*<2A> class\d+<2B> } {content block classblock}
extract tplstruct {(<28> mlcomment\d+<2B> [\t ]*\n[\t ]*)?<3F> keytemplate\d+<2B> \s*<2A> tplargs\d+<2B> \s*<2A> struct\d+<2B> } {content block classblock}
# within template declarations, both forward-declaration types become 'class'
refine_sub_tokens tplclassdecl classdecl class;
refine_sub_tokens tplstructdecl structdecl class;
# extract bracketed index expressions (innermost brackets first)
extract arrayindex {\[[^\]]*\]} {content classblock block arrayindex}
# detect case labels within switch statements and protection labels
extract caselabel {<7B> keycase\d+<2B> [^:]+:} {block}
extract caselabel {<7B> keydefault\d+<2B> :} {block}
# the protection labels get the types 'privatelabel', 'publiclabel', and 'protectedlabel'
foreach keyword { private public protected } {
set label label
extract "$keyword$label" "<22> key$keyword\\d+<2B> :" {classblock} }
# merge namespaced identifiers at the top level and within class blocks
extract identifier {<7B> identifier\d+<2B> +\s*<2A> doublecolon\d+<2B> \s*<2A> identifier\d+<2B> } {content classblock}
# extract class initialize list
extract initializer {:\s*<2A> identifier\d+<2B> \s*<2A> parenblk\d+<2B> (\s*<2A> comma\d+<2B> \s*<2A> identifier\d+<2B> \s*<2A> parenblk\d+<2B> )*} {content classblock}
extract colon {:} {initializer inherit}
# extract asm blocks
extract asm {<7B> keyasm\d+<2B> \s*(<28> keyvolatile\d+<2B> )?\s*<2A> parenblk\d+<2B> } {content block}
# extract Genode-specific RPC declaration macros
set genode_macros { genoderpc genoderpcthrow genoderpcinterface genoderpcinterfaceinherit genodetypelist }
foreach key $genode_macros {
extract $key "<22> key$key\\d+<2B> \\s*<2A> parenblk\\d+<2B> \\s*" { classblock parenblk } }
# the parenthesis blocks of the macros hold macro arguments
foreach key $genode_macros {
refine_sub_tokens $key parenblk macroargblk }
# extract functions
extract operatorfunction {<7B> keyoperator\d+<2B> \s*<2A> [^<5E> ]+\d+<2B> \s*<2A> parenblk\d+<2B> } {content classblock}
extract funcptr {<7B> parenblk\d+<2B> \s*<2A> parenblk\d+<2B> (\s*<2A> attribute\d+<2B> )?} {content classblock block identifier parenblk}
extract function {<7B> identifier\d+<2B> \s*<2A> parenblk\d+<2B> (\s*<2A> attribute\d+<2B> )?} {content classblock block initializer}
# the 'operator' keyword together with the operator token
extract operator {<7B> keyoperator\d+<2B> \s*<2A> [^ ]+\d+<2B> } operatorfunction
# destructor functions, optionally qualified by a scope
extract destfunction {(<28> identifier\d+<2B> <> doublecolon\d+<2B> )?<3F> tilde\d+<2B> <> identifier\d+<2B> \s*<2A> parenblk\d+<2B> } {content classblock}
extract identifier {(<28> identifier\d+<2B> <> doublecolon\d+<2B> )?<3F> tilde\d+<2B> <> identifier\d+<2B> } destfunction
# an identifier followed by a parenthesis block, and a parenthesis block
# on its own, are handled as identifiers within the listed token types
extract identifier {<7B> identifier\d+<2B> \s*<2A> parenblk\d+<2B> } {parenblk block identifier initializer tplargs}
extract identifier {<7B> parenblk\d+<2B> } {parenblk block}
# extract arrays
extract array {(<28> identifier\d+<2B> \s*)(<28> arrayindex\d+<2B> \s*)+} {content classblock block}
extract identifier {<7B> array\d+<2B> } {content classblock block}
# extract assignments
extract identifier {(?=(\s*|;))(<28> star\d+<2B> \s*)*<2A> identifier\d+<2B> \s*<2A> assign\w*\d+<2B> [^;]*} block
# extract throw statements
extract identifier {(?=(\s*|;))<29> keythrow\d+<2B> \s*[^;]*} block
# extract stream operators
#extract lhidentifier {(?=(\s*|;))[^;]*?<3F> (lshift|rshift)\d+<2B> [^;]*} block
# extract uses of the new operator
extract identifier {<7B> keynew\d+<2B> \s*(<28> parenblk\d+<2B> \s*)?<3F> function\d+<2B> } block
# extract return statements
extract return {<7B> keyreturn\d+<2B> [^;]*} {block}
# extract modifiers
extract modifier {(<28> key(extern|externc|constexpr|static|inline|virtual|volatile)\d+<2B> \s*)+} {content classblock block}
# extract function declarations
extract funcdecl {(<28> mlcomment\d+<2B> *\n[ \t]*)?(<28> (modifier|keyunsigned|keyconst)\d+<2B> \s*)*<2A> (identifier|keyunsigned|keyconst)\d+<2B> (\s|(<28> amper\d+<2B> )|(<28> star\d+<2B> ))*<2A> (operator)?function\d+<2B> \s*(<28> (keyconst|keyoverride)\d+<2B> \s*)*(<28> assign\d+<2B> \s*<2A> identifier\d+<2B> )?\s*;} {content block classblock}
# extract function implementations
extract funcimpl {(<28> mlcomment\d+<2B> *\n[ \t]*)?(<28> (modifier|keyunsigned|keyconst)\d+<2B> \s*)*(<28> (identifier|keyunsigned|keyconst)\d+<2B> \s*)+(\s|(<28> amper\d+<2B> )|(<28> star\d+<2B> ))*<2A> (operator)?function\d+<2B> \s*(<28> (keyconst|keyoverride)\d+<2B> \s*)*<2A> block\d+<2B> [;\t ]*} {content block classblock}
extract funcimpl {(<28> mlcomment\d+<2B> *\n[ \t]*)?<3F> operatorfunction\d+<2B> \s*(<28> modifier\d+<2B> \s*)?<3F> block\d+<2B> [;\t ]*} {content block classblock}
# extract template functions
extract tplfunc {(<28> mlcomment\d+<2B> *\n[ \t]*)?<3F> keytemplate\d+<2B> \s*<2A> tplargs\d+<2B> \s*<2A> funcimpl\d+<2B> } {content block classblock}
# extract template functions declarations
extract tplfuncdecl {(<28> mlcomment\d+<2B> *\n[ \t]*)?<3F> keytemplate\d+<2B> \s*<2A> tplargs\d+<2B> \s*<2A> funcdecl\d+<2B> } {content block classblock}
# extract destructor implementations
extract destimpl {(<28> mlcomment\d+<2B> *\n[ \t]*)?(<28> modifier\d+<2B> \s*)?<3F> tilde\d+<2B> <> function\d+<2B> \s*<2A> block\d+<2B> [;\t ]*} {content classblock}
refine_sub_tokens destimpl destfunction function
# extract constructor implementations
extract constimpl {(<28> mlcomment\d+<2B> *\n[ \t]*)?(<28> (modifier|keyexplicit)\d+<2B> \s*)*<2A> function\d+<2B> \s*(<28> initializer\d+<2B> \s*)?\s*<2A> block\d+<2B> [;\t ]*} {content classblock}
# extract template constructors
extract tplfunc {(<28> mlcomment\d+<2B> *\n[ \t]*)?<3F> keytemplate\d+<2B> \s*<2A> tplargs\d+<2B> \s*<2A> constimpl\d+<2B> } {content block classblock}
# extract destructor declarations
extract destdecl {(<28> mlcomment\d+<2B> *\n[ \t]*)?(<28> modifier\d+<2B> \s*)?<3F> tilde\d+<2B> <> function\d+<2B> \s*(<28> assign\d+<2B> \s+<2B> identifier\d+<2B> )?\s*;} {classblock}
# extract constructor declarations
extract constdecl {(<28> mlcomment\d+<2B> *\n[ \t]*)?(<28> keyexplicit\d+<2B> [ \t]*)?<3F> function\d+<2B> \s*(<28> assign\d+<2B> \s+<2B> identifier\d+<2B> )?\s*;} {classblock}
# extract friendship declarations
extract frienddecl {<7B> keyfriend\d+<2B> \s*<2A> classdecl\d+<2B> } {classblock}
# classify function signatures and their containing argument-parenthesis blocks
foreach env_type [list destdecl constdecl destimpl constimpl funcimpl funcdecl] {
	refine_sub_tokens $env_type function funcsignature
}
refine_sub_tokens funcsignature parenblk argparenblk
refine_sub_tokens operatorfunction parenblk argparenblk
# the parenthesis blocks that are left over hold expressions
extract_operations parenblk
# extract the modifiers of arguments
extract argmodifier {(<28> key(const|volatile)\d+<2B> \s*)+} {argparenblk}
# extract pure-virtual assignments
extract virtassign {<7B> assign\d+<2B> \s+<2B> identifier\d+<2B> } funcdecl
# extract return values
extract retval {(<28> (identifier|keyunsigned|keyconst|star|amper)\d+<2B> \s*)+(?=<3D> funcsignature)} {funcdecl funcimpl}
extract retval {(<28> (identifier|keyunsigned|keyconst|star|amper)\d+<2B> \s*)+(?=<3D> operatorfunction)} {funcdecl funcimpl}
extract identifier {<7B> (keyunsigned|keyconst)\d+<2B> \s*(<28> identifier\d+<2B> )?} {retval}
# extract argument declarations separated by commas
#
# The angle brackets of template-argument blocks are retyped to
# parentheses first, so the same rules cover both kinds of argument lists.
#
refine_sub_tokens tplargs greater closeparen
refine_sub_tokens tplargs less openparen
extract varargs {(<28> dot\d+<2B> ){3}} {argparenblk tplargs}
extract keytypename {<7B> keytypename\d+<2B> \s*<2A> varargs\d+<2B> } tplargs
extract argdecl {(<28> (argmodifier|keytypename|keyunsigned|identifier|tilde|minus|amper|star|and|varargs|assign|string)\d+<2B> \s*)+(?=<3D> comma)} {argparenblk tplargs}
extract argdecl {(<28> (argmodifier|keytypename|keyunsigned|identifier|tilde|minus|amper|star|and|varargs|assign|string)\d+<2B> \s*)+(?=<3D> closeparen)} {argparenblk tplargs}
# everything from the assignment on is the default value of the argument
extract argdefault {<7B> assign\d+<2B> .*} argdecl
# the identifier in front of a default value is the name
extract argname {<7B> identifier\d+<2B> \s*(?=<3D> argdefault)} {argdecl}
# there may be just a type and no name
extract argtype {^\s*<2A> identifier\d+<2B> \s*$} {argdecl}
# the last identifier is the name
extract argname {<7B> identifier\d+<2B> \s*$} {argdecl}
extract argtype {^(<28> (argmodifier|keyunsigned)\d+<2B> \s*)*(<28> (identifier|keytypename|varargs|keyunsigned)\d+<2B> )(\s*|(<28> (amper|and|argmodifier)\d+<2B> )|(<28> star\d+<2B> ))*(<28> argmodifier\d+<2B> \s*)*(<28> varargs\d+<2B> )?} argdecl
# extract typedefs
extract typedef {(<28> mlcomment\d+<2B> *\n[ \t]*)?<3F> keytypedef\d+<2B> (\s*<2A> (identifier|keyunsigned|keytypename)\d+<2B> )+\s*;} {content classblock block}
# the identifier in front of the semicolon is the name of the new type
extract typename {<7B> identifier\d+<2B> (?=;)} typedef
extract identifier {(\s*<2A> (identifier|keyunsigned)\d+<2B> ){2,}} typedef
extract identifier {\s*<2A> keyunsigned\d+<2B> } typedef
# extract function pointers
extract vardecl {(<28> (modifier|keyunsigned)\d+<2B> \s*)*(<28> (identifier|keyunsigned)\d+<2B> )((\s|(<28> amper\d+<2B> )|(<28> star\d+<2B> ))*(<28> modifier\d+<2B> \s*)*(<28> funcptr\d+<2B> )\s*(:\s*<2A> identifier\d+<2B> )?\s*(<28> assign\d+<2B> [^;]*?)?\s*(<28> comma\d+<2B> )?\s*)+;} {content classblock block}
# extract variable declarations (type + any number of comma-separated variables + optional tailing comment)
extract vardecl {(<28> (modifier|keyunsigned)\d+<2B> \s*)*(<28> (identifier|keyunsigned)\d+<2B> )((\s|(<28> amper\d+<2B> )|(<28> star\d+<2B> ))*(<28> (modifier|keyconst)\d+<2B> \s*)*(<28> (identifier|array)\d+<2B> )\s*(:\s*<2A> identifier\d+<2B> )?\s*(<28> assign\d+<2B> [^;]*?)?(<28> block\d+<2B> )?\s*(<28> comma\d+<2B> )?\s*)+;} {content classblock block}
# extract commented variable declaration
extract commentedvardecl {<7B> vardecl\d+<2B> \s*<2A> m?lcomment\d+<2B> (\s*<2A> lcomment\d<> )*} {content classblock block}
# extract valid declaration sequences
#
# A declaration sequence is a run of tokens of the types listed in
# 'elem', separated by nothing but whitespace.
#
set elem "(mlcomment|lcomment|vardecl|array|commentedvardecl|typedef|funcimpl|funcdecl|enum|class|struct|union|constimpl|constdecl|destimpl|destdecl|tplfunc|tplfuncdecl|tplstruct|tplstructdecl|tplclass|tplclassdecl|frienddecl|classdecl|structdecl)"
extract declseq "<22> $elem\\d+<2B> (\\s*<2A> $elem\\d+<2B> )*" {classblock}
# group protection scopes with corresponding declaration sequences
foreach keyword { private public protected } {
set label label
extract $keyword "<22> $keyword$label\\d+<2B> \\s*<2A> declseq\\d+<2B> " {classblock} }
# extract protection-scope labels
extract label {<7B> key(private|public|protected)\d+<2B> :} {private public protected}
# extract name spaces
extract namespace {<7B> keynamespace\d+<2B> \s*<2A> identifier\d+<2B> \s*<2A> block\d+<2B> } {content block}
refine_sub_tokens namespace block namespaceblock
#
# The remaining block tokens are code blocks. So we can
# apply code-specific rules to them.
#
extract identifier {<7B> function\d+<2B> } block
extract_operations {block identifier}
# extract statements from remaining code blocks
extract statement {<7B> asm\d+<2B> ;} block
extract statement {<7B> identifier\d+<2B> ;} block
extract statement {<7B> return\d+<2B> ;} {block}
extract statement {<7B> keycontinue\d+<2B> \s*;} block
# extract try-catch statements
extract statement {<7B> keytry\d+<2B> \s*<2A> block\d+<2B> (\s*<2A> catchcond\d+<2B> \s*<2A> block\d+<2B> )+} {block}
# wrap blocks into statements
extract statement {<7B> block\d+<2B> } {block statement}
# empty statements (all normal semicolons should be encapsulated in statements now)
extract statement {;} {block}
# turn control structures into statements
#
# Whitespace and comments are allowed between the parts of a control
# structure. The plain 'if' pattern must not be followed by 'else'.
#
set pattern_ifelse {(<28> ifcond\d+<2B> (\s|<7C> m?lcomment\d+<2B> )*<2A> statement\d+<2B> (\s|<7C> m?lcomment\d+<2B> )*<2A> keyelse\d+<2B> (\s|<7C> m?lcomment\d+<2B> )*<2A> statement\d+<2B> )}
set pattern_if {(<28> ifcond\d+<2B> (\s|<7C> m?lcomment\d+<2B> )*<2A> statement\d+<2B> (?!(\s|<7C> m?lcomment\d+<2B> )*<2A> keyelse))}
set pattern_for {(<28> (while|for|switch)cond\d+<2B> (\s|<7C> m?lcomment\d+<2B> )*<2A> statement\d+<2B> )}
extract statement "($pattern_ifelse|$pattern_if|$pattern_for)" {block statement}
# extract control-structure types
extract ifelse $pattern_ifelse {statement}
extract if $pattern_if {statement}
extract for {<7B> forcond\d+<2B> (\s|<7C> m?lcomment\d+<2B> )*(<28> statement\d+<2B> |;)} {statement}
extract while {<7B> whilecond\d+<2B> (\s|<7C> m?lcomment\d+<2B> )*(<28> statement\d+<2B> |;)} {statement}
extract switch {<7B> switchcond\d+<2B> (\s|<7C> m?lcomment\d+<2B> )*<2A> statement\d+<2B> } {statement}
# turn control-flow element into statements
foreach type { ifelse if while for switch try } {
extract statement "<22> $type\\d+<2B> " block }
# extract valid code sequences
set elem "(mlcomment|vardecl|statement|lcomment)"
extract codeseq "<22> $elem\\d+<2B> (\\s*<2A> $elem\\d+<2B> )*" {block}
#
# Turn line breaks, spaces, and tabs of all token types into tokens
# (only if requested via the '-whitespace' argument)
#
if {$config_whitespace} {

	# gather each token type in use exactly once
	set all_types ""
	set i 0
	while {$i < $num} {
		if {[lsearch $all_types $typ($i)] < 0} {
			lappend all_types $typ($i)
		}
		incr i
	}

	extract line {\n} $all_types
	extract align { +(?= )} $all_types
	extract space { } $all_types
	extract tab {\t} $all_types
}
###############################
## Back-end helper functions ##
###############################
##
# Compose the name of the reference token that has the specified index
#
# The name is the token type directly followed by the index.
##
proc token_by_idx {idx} {
	global typ
	set name $typ($idx)
	append name $idx
	return $name
}
##
# Return the index of the specified reference token
#
# The index is the first run of decimal digits within the token name.
##
proc idx_of_token {token} {
	regexp {[0-9]+} $token first_digits
	return $first_digits
}
##
# Return the type of the specified reference token
#
# The type is the first run of lower-case letters within the token name.
##
proc type_of_token {token} {
	regexp {[a-z]+} $token first_letters
	return $first_letters
}
##
# Wrap the name of a reference token into the opening and closing
# marker sequences, as found in the text of the hosting token
##
proc marker {token} {
	set result "<22> "
	append result $token "<65> "
	return $result
}
##
# Look up the text that the specified reference token stands for
##
proc token_text {token} {
	global txt
	set idx [idx_of_token $token]
	return $txt($idx)
}
##
# Assign a line number to each reference token
#
# To be able to provide error messages including line numbers, we
# determine the line number for each reference token and store it
# as an attribute.
#
# The function walks the text of the specified token from left to right
# and calls itself for each reference it encounters. The running line
# counter is kept in the global variable 'curr_ln'.
#
# The result of the function is stored in the global 'ln' array.
#
# \param token  reference token to start at, the root token by default
##
proc assign_line_numbers {{token content0}} {
global ln curr_ln config_whitespace
# the traversal starts at the root token, where counting begins at line 1
if {$token == "content0"} { set curr_ln 1 }
# assign current line number to current token
set ln([idx_of_token $token]) $curr_ln
# count occurrences of line breaks
if {[type_of_token $token] == "line"} { incr curr_ln }
# NOTE(review): 'token' holds the name of a reference token here, so the
# comparison with a raw line break looks like it can never hold - confirm
if {!$config_whitespace && ($token == "\n")} { incr curr_ln }
# count lines for all sub-tokens
set tex [token_text $token]
# consume the text piece by piece until nothing is left
while {$tex != ""} {
# count and eat raw line breaks (needed if 'whitespace' option is disabled)
if {[regexp {^\n} $tex dummy]} {
if {!$config_whitespace} { incr curr_ln }
regsub {\n} $tex "" tex
}
# ignore plain text
if {[regexp {^[^<5E> \n]+} $tex plain]} {
regsub {^[^<5E> \n]+} $tex "" tex }
# traverse into token
# (the match overwrites the local 'token', which is not read again
# within this invocation except as argument of the recursive call)
if {[regexp {^<5E> (.+?)<29> } $tex dummy token]} {
assign_line_numbers $token
regsub {<7B> (.+?)<29> } $tex "" tex
}
}
}
##
# Return the line number recorded for the specified reference token
#
# The number is read from the global 'ln' array.
##
proc line_number {token} {
	global ln
	set idx [idx_of_token $token]
	return $ln($idx)
}
##
# Print all tokens as one valid Tcl list
#
# Each list element is itself a list of the token name, the line number
# of the token, and the token text. The output can be used directly as
# input by another Tcl script.
##
proc dump_tokens { } {
	global num typ txt

	set tokens {}
	set i 0
	while {$i < $num} {
		set token [token_by_idx $i]
		lappend tokens [list $token [line_number $token] $txt($i)]
		incr i
	}
	puts $tokens
}
##########################
## Source-code back end ##
##########################
##
# Output syntax tree as source code
#
# This constructs the source code from the syntax tree. It is
# useful to check the result against the input to make sure that
# no information gets lost during the parsing procedure.
#
# Starting with the text of the root token, each reference is replaced
# by the text of the token it refers to until no reference is left.
##
proc dump_source { } {
	global num typ txt

	set output $txt(0)

	# The token text is spliced in by position. Passing it to 'regsub' as
	# substitution spec would interpret backslash sequences of the
	# parsed source code (such as '\0' or '\\') instead of copying them.
	while {[regexp -indices {<7B> (.+?)<29> } $output ref_span token_span]} {
		set token [string range $output [lindex $token_span 0] [lindex $token_span 1]]
		set output [string replace $output [lindex $ref_span 0] [lindex $ref_span 1] [token_text $token]]
	}

	# revert character substitutions of '&'
	regsub -all {<7B> } $output "\\\&" output

	puts $output
}
##################
## XML back end ##
##################
##
# Output the subtree of the specified reference token as xml
#
# Whitespace tokens are printed in a compact form. Every other token
# becomes an xml node named after the token type that contains the
# plain text and the sub tokens of the token in their original order.
# The current indentation is kept in the global 'dump_xml_indent'.
#
# \param token  reference token that forms the root of the subtree
##
proc dump_xml_subtree {token} {
	global dump_xml_indent line

	set type [type_of_token $token]
	set tex  [token_text $token]

	# NOTE(review): 'line' is global, so a recursive call overwrites it
	# and plain text that follows a sub token is tagged with the line of
	# that sub token - presumably intended, confirm
	set line [line_number $token]

	# shorten frequent leaf nodes
	if {$type == "line"} {
		puts "$dump_xml_indent<linebreak line=\"$line\"/>"
	} elseif {$type == "tab"} {
		puts "$dump_xml_indent<tab line=\"$line\"/>"
	} elseif {$type == "space"} {
		puts "$dump_xml_indent<space line=\"$line\"/>"
	} elseif {$type == "align"} {
		puts "$dump_xml_indent<align line=\"$line\">$tex</align>"
	} else {
		puts "$dump_xml_indent<$type line=\"$line\">"
		set dump_xml_indent " $dump_xml_indent"

		while {$tex != ""} {

			# consume plain text
			if {[regexp {^[^<5E> ]+} $tex plain]} {

				# perform character substitutions for xml compliance
				#
				# The placeholder of '&' goes first. The leading '\&' of each
				# substitution spec yields a literal '&' and thereby starts the
				# xml entity.
				regsub -all {<7B> } $plain "\\\&amp;" plain
				regsub -all {<} $plain "\\\&lt;" plain
				regsub -all {>} $plain "\\\&gt;" plain
				regsub -all "\"" $plain "\\\&quot;" plain
				regsub -all "'" $plain "\\\&apos;" plain

				puts "$dump_xml_indent<plain line=\"$line\">$plain</plain>"
				regsub {^[^<5E> ]+} $tex "" tex
			}

			# consume token
			if {[regexp {<7B> (.+?)<29> } $tex dummy token]} {
				dump_xml_subtree $token
				regsub {<7B> (.+?)<29> } $tex "" tex
			}
		}

		# step back to the indentation of the enclosing node
		regsub " " $dump_xml_indent "" dump_xml_indent
		puts "$dump_xml_indent</$type>"
	}
}
##
# Print the whole syntax tree as xml
##
proc dump_xml { } {

	# the root node starts without indentation
	set ::dump_xml_indent ""

	# walk down the tree from the root token
	dump_xml_subtree content0
}
##################
## Main program ##
##################
# the line numbers are needed by the token and xml back ends
assign_line_numbers

# run each back end that was requested on the command line
foreach {requested back_end} [list \
	$config_out_tokens dump_tokens \
	$config_out_xml    dump_xml \
	$config_out_source dump_source] {

	if {$requested} { $back_end }
}