#!/usr/bin/tclsh # # \brief Regular-expression-based device-tree source parser # \author Norman Feske # \date 2021-05-14 # # read file set input_pathname [lindex $argv 0] if {[catch { # # Create root node of the syntax tree # set txt(0) [exec cat $input_pathname] set typ(0) content set num 1 }]} { foreach line { "" "Parse device-tree source file and output syntax tree." "" " usage: parse " "" } { puts stderr $line } exit -1; } # do not stop parsing (this variable is only used for debugging) set stop 0 # # Replace all '&' characters from the original input # because they cause trouble with the regexp command. # regsub -all {&} $txt(0) "³" txt(0) ## # Extract expression from content # # All expressions that occur in the token types 'typelist' # and that match the 'subexpr' criterion get replaced in # their original token by a reference tag and form a new # token of the type 'newtype'. # # The reference is coded as §°. # Since the reference has the type coded in, we can # match sub tokens of specific types via plain regular # expressions. ## proc extract {newtype subexpr typelist} { global num txt typ stop set old_num $num if {$stop} { return } for {set i 0} {$i < $old_num} {incr i} { if {[lsearch $typelist $typ($i)] > -1} { while {[regexp $subexpr $txt($i) mid]} { # new sub text # set typ($num) $newtype set txt($num) $mid # substitute expression by a reference # regsub $subexpr $txt($i) "§$newtype$num°" txt($i) incr num } } } } ##################################################### ## Rules for splitting the input into its elements ## ##################################################### # # Starting with only the root token (content0) of the syntax tree # containing the whole source code as one string, we extract # typed sub tokens to partition the string into parts of distinct # meanings (token types). In the process of subsequently # applying extraction rules to specific token types, a syntax # tree is formed. # # extract C-style line comments extract lcomment {/\*[^\n]*?\*/} content # extract C-style multi-line comments extract mlcomment {/\*.*?\*/} content # extract strings # # Strings may contain quoted '"' characters. # extract string {\"([^\"]|\")*?\"} content # extract labels and references to labels extract labeldef {[\w_][\w\d_-]*:} {content} extract labelname {^[\w_][\w\d_-]*(?=(:|$))} {labeldef} extract reflabel {³[\w_][\w\d_-]*} {content} extract reflabelname {[\w_][\w\d_-]*$} {reflabel} # fold parenthesis and blocks extract parenblk {\([^()]*?\)} {content parenblk} extract block {\{[^{}]*?\}} {content parenblk block} extract cell {\<[^<>]*?\>} {content parenblk block} extract openbrace "\{" block extract closebrace "\}" block extract openparen {\(} parenblk extract closeparen {\)} parenblk # property assignments extract property {[\w_#][\w\d_\-,.+?#]*\s*=[^;]*;\n?} {block} extract propertyname {^[\w_#][\w\d_\-,.+?#]*(?=\s*=)} {property} extract propertyvalue {§(cell|string)\d+°} {property} # root nodes extract node {\s*/\s*§block\d+°;\n?} {content} extract nodename {/(?=\s*§block)} {node} # nodes extract node {(/omit-if-no-ref/)?\s*(§labeldef\d+°\s*)?\s*[\w_][\w\d_\-,.+@]*\s*§block\d+°;\n?} {content block} extract nodename {[\w_][\w\d_\-,.+@]*(?=\s*§block)} {node} # supplemental node content provided via '&label { ... }' at the top level extract nodesupplement {§reflabel\d+°\s*§block\d+°;\n?} {content} ############################### ## Back-end helper functions ## ############################### ## # Return name of reference token with specified index ## proc token_by_idx {idx} { global typ; return "$typ($idx)$idx" } ## # Return index of specified reference token ## proc idx_of_token {token} { regexp {[0-9]+} $token idx return $idx } ## # Return type of specified reference token ## proc type_of_token {token} { regexp {[a-z]+} $token type return $type } ## # Return marker for reference token ## proc marker {token} { return "§$token°" } ## # Return text referenced by token ## proc token_text {token} { global txt return $txt([idx_of_token $token]) } ## # Assign a line number to each reference token # # To be able to provide error messages including line numbers, we # determine the line number for each reference token and store it # as an attribute. # # The result of the function is stored in the global 'ln' array. ## proc assign_line_numbers {{token content0}} { global ln curr_ln if {$token == "content0"} { set curr_ln 1 } # assign current line number to current token set ln([idx_of_token $token]) $curr_ln # count occurrences of line breaks if {[type_of_token $token] == "line"} { incr curr_ln } if {$token == "\n"} { incr curr_ln } # count lines for all sub-tokens set tex [token_text $token] while {$tex != ""} { # count and eat raw line breaks (needed if 'whitespace' option is disabled) if {[regexp {^\n} $tex dummy]} { incr curr_ln regsub {\n} $tex "" tex } # ignore plain text if {[regexp {^[^§\n]+} $tex plain]} { regsub {^[^§\n]+} $tex "" tex } # traverse into token if {[regexp {^§(.+?)°} $tex dummy token]} { assign_line_numbers $token regsub {§(.+?)°} $tex "" tex } } } ## # Look up line number of specified reference token ## proc line_number {token} { global ln return $ln([idx_of_token $token]) } ## # Output tokens as valid Tcl List # # The result of this function can be used directly # as input by another Tcl script. ## proc dump_tokens { } { global num typ txt set tokens [list] for {set i 0} {($i < $num)} {incr i} { set token [token_by_idx $i] set text $txt($i) lappend tokens [list $token [line_number $token] $text] } puts $tokens } ################## ## Main program ## ################## assign_line_numbers dump_tokens