AFLplusplus/custom_mutators/gramatron/preprocess/construct_automata.py

import sys
import json
import re
from collections import defaultdict
# import pygraphviz as pgv

gram_data = None
state_count = 1
pda = []
worklist = []
state_stacks = {}

# === If user provides upper bound on the stack size during FSA creation ===
# Specifies the upper bound to which the stack is allowed to grow
# If for any generated state, the stack size is >= stack_limit then this
# state is not expanded further.
stack_limit = None
# Holds the set of unexpanded rules owing to the user-passed stack constraint limit
unexpanded_rules = set()

def main(grammar, limit):
    global worklist, gram_data, stack_limit
    current = '0'
    stack_limit = limit
    if stack_limit:
        print ('[X] Operating in bounded stack mode')

    with open(grammar, 'r') as fd:
        gram_data = json.load(fd)
    start_symbol = gram_data["Start"][0]
    worklist.append([current, [start_symbol]])
    # print (grammar)
    filename = (grammar.split('/')[-1]).split('.')[0]


    while worklist:
        # Take an element from the worklist
        # print ('================')
        # print ('Worklist:', worklist)
        element = worklist.pop(0)
        prep_transitions(element)

    pda_file = filename + '_transition.json'
    graph_file = filename + '.png'
    # print ('XXXXXXXXXXXXXXXX')
    # print ('PDA file:%s Png graph file:%s' % (pda_file, graph_file))
    # XXX Commented out because visualization of current version of PHP causes segfault
    # Create the graph and dump the transitions to a file
    # create_graph(filename)
    transformed = postprocess()
    with open(filename + '_automata.json', 'w+') as fd:
        json.dump(transformed, fd)
    with open(filename + '_transition.json', 'w+') as fd:
        json.dump(pda, fd)
    if not unexpanded_rules:
        print ('[X] No unexpanded rules, absolute FSA formed')
        exit(0)
    else:
        print ('[X] Certain rules were not expanded due to stack size limit. Inexact approximation has been created and the disallowed rules have been put in {}_disallowed.json'.format(filename))
        print ('[X] Number of unexpanded rules:', len(unexpanded_rules))
        with open(filename + '_disallowed.json', 'w+') as fd:
            json.dump(list(unexpanded_rules), fd)

def create_graph(filename):
    '''
    Creates a DOT representation of the PDA
    '''
    global pda
    G = pgv.AGraph(strict = False, directed = True)
    for transition in pda:
        print ('Transition:', transition)
        G.add_edge(transition['source'], transition['dest'],
                label = 'Term:{}'.format(transition['terminal']))
    G.layout(prog = 'dot')
    print ('Do it up 2')
    G.draw(filename + '.png')

def prep_transitions(element):
    '''
    Generates transitions
    '''
    global gram_data, state_count, pda, worklist, state_stacks, stack_limit, unexpanded_rules
    state = element[0]
    try:
        nonterminal = element[1][0]
    except IndexError:
        # Final state was encountered, pop from worklist without doing anything
        return
    rules = gram_data[nonterminal]
    count = 1
    for rule in rules:
        isRecursive  = False
        # print ('Current state:', state)
        terminal, ss, termIsRegex = tokenize(rule)
        transition = get_template()
        transition['trigger'] = '_'.join([state, str(count)])
        transition['source'] = state
        transition['dest'] = str(state_count)
        transition['ss'] = ss
        transition['terminal'] = terminal
        transition['rule'] = "{} -> {}".format(nonterminal, rule )
        if termIsRegex:
            transition['termIsRegex'] = True

        # Creating a state stack for the new state
        try:
            state_stack = state_stacks[state][:]
        except:
            state_stack = []
        if len(state_stack):
            state_stack.pop(0)
        if ss:
            for symbol in ss[::-1]:
                state_stack.insert(0, symbol)
        transition['stack'] = state_stack

        # Check if a recursive transition state being created, if so make a backward
        # edge and don't add anything to the worklist
        # print (state_stacks)
        if state_stacks:
            for state_element, stack in state_stacks.items():
                # print ('Stack:', sorted(stack))
                # print ('State stack:', sorted(state_stack))
                if sorted(stack) == sorted(state_stack):
                    transition['dest'] = state_element
                    # print ('Recursive:', transition)
                    pda.append(transition)
                    count += 1
                    isRecursive = True
                    break
        # If a recursive transition exercised don't add the same transition as a new
        # edge, continue onto the next transitions
        if isRecursive:
            continue

        # If the generated state has a stack size > stack_limit then that state is abandoned
        # and not added to the FSA or the worklist for further expansion
        if stack_limit:
            if (len(transition['stack']) > stack_limit):
                unexpanded_rules.add(transition['rule'])
                continue

        # Create transitions for the non-recursive relations and add to the worklist
        # print ('Normal:', transition)
        # print ('State2:', state)
        pda.append(transition)
        worklist.append([transition['dest'], transition['stack']])
        state_stacks[transition['dest']] = state_stack
        state_count += 1
        count += 1

def tokenize(rule):
    '''
    Gets the terminal and the corresponding stack symbols from a rule in GNF form
    '''
    pattern = re.compile("([r])*\'([\s\S]+)\'([\s\S]*)")
    terminal = None
    ss = None
    termIsRegex = False
    match = pattern.match(rule)
    if match.group(1):
        termIsRegex = True
    if match.group(2):
        terminal = match.group(2)
    else:
        raise AssertionError("Rule is not in GNF form")

    if match.group(3):
        ss = (match.group(3)).split()

    return terminal, ss, termIsRegex

def get_template():
    transition_template = {
            'trigger':None,
            'source': None,
            'dest': None,
            'termIsRegex': False,
            'terminal' : None,
            'stack': []
            }
    return transition_template

def postprocess():
    '''
    Creates a representation to be passed on to the C-module
    '''
    global pda
    final_struct = {}
    memoized = defaultdict(list)
    # Supporting data structures for if stack limit is imposed
    culled_pda = []
    culled_final = []
    num_transitions = 0 # Keep track of number of transitions


    states, final, initial = _get_states()

    print (initial)
    assert len(initial) == 1, 'More than one init state found'

    # Cull transitions to states which were not expanded owing to the stack limit
    if stack_limit:

        blocklist = []
        for final_state in final:
            for transition in pda:
                if (transition["dest"] == final_state) and (len(transition["stack"]) > 0):
                    blocklist.append(transition["dest"])
                    continue
                else:
                    culled_pda.append(transition)

        culled_final = [state for state in final if state not in blocklist]

        assert len(culled_final) == 1, 'More than one final state found'

        for transition in culled_pda:
            state = transition["source"]
            if transition["dest"] in blocklist:
                    continue
            num_transitions += 1
            memoized[state].append([transition["trigger"], transition["dest"],
                transition["terminal"]])
        final_struct["init_state"] = initial
        final_struct["final_state"] = culled_final[0]
        # The reason we do this is because when states are culled, the indexing is
        # still relative to the actual number of states hence we keep numstates recorded
        # as the original number of states
        print ('[X] Actual Number of states:', len(memoized.keys()))
        print ('[X] Number of transitions:', num_transitions)
        print ('[X] Original Number of states:', len(states))
        final_struct["numstates"] = len(states)
        final_struct["pda"] = memoized
        return final_struct

    # Running FSA construction in exact approximation mode and postprocessing it like so
    for transition in pda:
       state = transition["source"]
       memoized[state].append([transition["trigger"], transition["dest"],
           transition["terminal"]])

    final_struct["init_state"] = initial
    final_struct["final_state"] = final[0]
    print ('[X] Actual Number of states:', len(memoized.keys()))
    final_struct["numstates"] = len(memoized.keys())
    final_struct["pda"] = memoized
    return final_struct


def _get_states():
    source = set()
    dest = set()
    global pda
    for transition in pda:
        source.add(transition["source"])
        dest.add(transition["dest"])
    source_copy = source.copy()
    source_copy.update(dest)
    return list(source_copy), list(dest.difference(source)), str(''.join(list(source.difference(dest))))

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description = 'Script to convert GNF grammar to PDA')
    parser.add_argument(
            '--gf',
            type = str,
            help = 'Location of GNF grammar')
    parser.add_argument(
            '--limit',
            type = int,
            default = None,
            help = 'Specify the upper bound for the stack size')
    args = parser.parse_args()
    main(args.gf, args.limit)