# Mirror of https://github.com/AFLplusplus/AFLplusplus.git
# Synced 2025-06-08 00:01:33 +00:00
import sys
|
|
import re
|
|
import copy
|
|
import json
|
|
from string import ascii_uppercase
|
|
from itertools import combinations
|
|
from collections import defaultdict
|
|
|
|
# Pool of fresh nonterminal names handed out by get_nonterminal();
# refilled lazily by _repopulate() when it runs dry.
NONTERMINALSET = []

# Length of the uppercase-letter combinations generated on the next
# refill of NONTERMINALSET (grows by one per refill).
COUNT = 1
|
|
|
def main(grammar_file, out, start):
    """Convert a grammar to Greibach Normal Form and dump it as JSON.

    grammar_file: path to an ANTLR-style .g4 grammar or a preprocessed
        .json grammar (the latter skips the preprocessing step).
    out: path the final GNF grammar JSON is written to.
    start: name of the start nonterminal, recorded under the "Start" key.

    Raises ValueError for unsupported input file extensions.
    Side effects: writes debug_*.json snapshots after each stage.
    """
    grammar = None
    # If grammar file is a preprocessed NT file, then skip preprocessing
    if '.json' in grammar_file:
        with open(grammar_file, 'r') as fd:
            grammar = json.load(fd)
    elif '.g4' in grammar_file:
        with open(grammar_file, 'r') as fd:
            data = fd.readlines()
        grammar = preprocess(data)
    else:
        # BUG FIX: the original did raise('...'), which is a TypeError in
        # Python 3 ("exceptions must derive from BaseException") and never
        # surfaced the intended message; also fixes the "Unknwown" typo.
        raise ValueError('Unknown file format passed. Accepts (.g4/.json)')

    # Snapshot after each normalization stage for debugging.
    with open('debug_preprocess.json', 'w+') as fd:
        json.dump(grammar, fd)
    grammar = remove_unit(grammar)  # eliminates unit productions
    with open('debug_unit.json', 'w+') as fd:
        json.dump(grammar, fd)
    grammar = remove_mixed(grammar)  # eliminate terminals existing with non-terminals
    with open('debug_mixed.json', 'w+') as fd:
        json.dump(grammar, fd)
    grammar = break_rules(grammar)  # eliminate rules with more than two non-terminals
    with open('debug_break.json', 'w+') as fd:
        json.dump(grammar, fd)
    grammar = gnf(grammar)

    # Dump GNF form of the grammar with only reachable rules
    # reachable_grammar = get_reachable(grammar, start)
    # with open('debug_gnf_reachable.json', 'w+') as fd:
    #     json.dump(reachable_grammar, fd)
    with open('debug_gnf.json', 'w+') as fd:
        json.dump(grammar, fd)

    grammar["Start"] = [start]
    with open(out, 'w+') as fd:
        json.dump(grammar, fd)
|
|
|
|
def get_reachable(grammar, start):
    """Return a copy of *grammar* restricted to rules reachable from *start*.

    Performs a breadth-first walk over nonterminals beginning at the start
    symbol and keeps only the productions of nonterminals actually visited.
    """
    visited = set()
    keep = dict()
    pending = list()
    pending.append(start)

    while pending:
        symbol = pending.pop(0)
        visited.add(symbol)
        keep[symbol] = grammar[symbol]
        for production in grammar[symbol]:
            for token in gettokens(production):
                # Queue every not-yet-visited nonterminal referenced here.
                if not isTerminal(token) and token not in visited:
                    pending.append(token)
    return keep
|
|
|
|
|
|
def gettokens(rule):
    """Split a production rule string into tokens.

    A token is either a run of non-whitespace, non-quote characters, or a
    complete single-/double-quoted literal (quotes kept in the token).
    """
    # FIX: use a raw string -- the original non-raw pattern relied on the
    # invalid escape sequence "\s" passing through unchanged, which is a
    # DeprecationWarning today and a SyntaxError in future Python versions.
    # The compiled pattern is identical.
    pattern = re.compile(r"([^\s\"\']+)|\"([^\"]*)\"|\'([^\']*)\'")
    return [matched.group(0) for matched in pattern.finditer(rule)]
|
|
|
|
def gnf(grammar):
    """Rewrite *grammar* into Greibach Normal Form.

    Repeatedly substitutes the leading nonterminal of every rule with that
    nonterminal's own productions until every rule starts with a terminal.
    NOTE(review): termination is not guaranteed for left-recursive input --
    presumably the earlier normalization passes avoid producing such
    grammars; verify against the caller.
    """
    old_grammar = copy.deepcopy(grammar)
    new_grammar = defaultdict(list)
    isgnf = False
    while not isgnf:
        for lhs, rules in old_grammar.items():
            for rule in rules:
                tokens = gettokens(rule)
                # A lone terminal token is already in GNF; keep as-is.
                if len(tokens) == 1 and isTerminal(rule):
                    new_grammar[lhs].append(rule)
                    continue
                startoken = tokens[0]
                endrule = tokens[1:]
                if not isTerminal(startoken):
                    # Expand the leading nonterminal: one new rule per
                    # production of startoken, each prefixed to the rest.
                    newrules = []
                    extendrules = old_grammar[startoken]
                    for extension in extendrules:
                        temprule = endrule[:]
                        temprule.insert(0, extension)
                        newrules.append(temprule)
                    for newnew in newrules:
                        new_grammar[lhs].append(' '.join(newnew))
                else:
                    new_grammar[lhs].append(rule)
        # Re-scan: any rule still starting with a nonterminal forces
        # another expansion pass.
        isgnf = True
        for lhs, rules in new_grammar.items():
            for rule in rules:
                # if "\' \'" or isTerminal(rule):
                tokens = gettokens(rule)
                if len(tokens) == 1 and isTerminal(rule):
                    continue
                startoken = tokens[0]
                if not isTerminal(startoken):
                    isgnf = False
                    break
        if not isgnf:
            # Not converged yet -- the output becomes next pass's input.
            old_grammar = copy.deepcopy(new_grammar)
            new_grammar = defaultdict(list)
    return new_grammar
|
|
|
|
|
|
def preprocess(data):
    """Parse .g4-style grammar lines into a {nonterminal: [rules]} dict.

    *data* is the list of raw file lines.  Productions are separated by
    blank lines; the first line of a production is "Name: rule" and each
    following line is a "| rule" alternative.
    """
    productions = []
    production = []
    for line in data:
        if line != '\n':
            production.append(line)
        else:
            productions.append(production)
            production = []
    # BUG FIX: the original dropped the final production when the file did
    # not end with a trailing blank line.
    if production:
        productions.append(production)

    final_rule_set = {}
    for production in productions:
        # ROBUSTNESS: consecutive blank lines used to append empty
        # productions, crashing below on production[0]; skip them.
        if not production:
            continue
        rules = []
        init = production[0]
        nonterminal = init.split(':')[0]
        rules.append(strip_chars(init.split(':')[1]).strip('| '))
        for production_rule in production[1:]:
            rules.append(strip_chars(production_rule.split('|')[0]))
        final_rule_set[nonterminal] = rules
    return final_rule_set
|
|
|
|
def remove_unit(grammar):
    """Eliminate unit productions (rules whose RHS is one nonterminal).

    Iterates to a fixed point: every unit rule A -> B is replaced by all
    of B's productions, repeating until no unit rules remain.
    """
    converged = False
    source = copy.deepcopy(grammar)
    result = defaultdict(list)
    while not converged:
        for lhs, rhs_list in source.items():
            for rhs in rhs_list:
                # A unit production is a single non-terminal token.
                if len(gettokens(rhs)) == 1 and not isTerminal(rhs):
                    # Inline the referenced nonterminal's productions.
                    result[lhs].extend(source[rhs])
                else:
                    result[lhs].append(rhs)
        # Converged only when no unit production survives the pass.
        converged = True
        for rhs_list in result.values():
            if any(len(gettokens(rhs)) == 1 and not isTerminal(rhs)
                   for rhs in rhs_list):
                converged = False
                break
        if not converged:
            # Unit productions remain -- feed this pass's output back in.
            source = copy.deepcopy(result)
            result = defaultdict(list)
    return result
|
|
|
|
def isTerminal(rule):
    """Return True when *rule* begins with a single-quoted terminal literal.

    The match is anchored at the start only, so a rule such as "'a' B"
    also reports True.
    """
    # pattern = re.compile("([r]*\'[\s\S]+\')")
    return re.match("\'(.*?)\'", rule) is not None
|
|
|
|
def remove_mixed(grammar):
    '''
    Remove rules where there are terminals mixed in with non-terminals:
    every terminal embedded in a multi-token rule is replaced by a fresh
    nonterminal that derives exactly that terminal.  Substitute
    nonterminals are reused when one already exists.
    '''
    result = defaultdict(list)
    for lhs, rhs_list in grammar.items():
        for rhs in rhs_list:
            parts = gettokens(rhs)
            # Single-token rules are already pure; copy them through.
            if len(parts) == 1:
                result[lhs].append(rhs)
                continue
            rebuilt = []
            for part in parts:
                if not isTerminal(part):
                    rebuilt.append(part)
                    continue
                # Reuse an existing substitute nonterminal for this
                # terminal, or mint a new one.
                substitute = terminal_exist(part, result)
                if not substitute:
                    substitute = get_nonterminal()
                    result[substitute].append(part)
                rebuilt.append(substitute)
            result[lhs].append(' '.join(rebuilt))
    return result
|
|
|
|
def break_rules(grammar):
    """Break rules longer than two tokens into chains of binary rules.

    Each pass splits an over-long rule A -> X1 .. Xn into B -> X1 .. Xn-1
    and A -> B Xn, reusing B when a nonterminal already derives that exact
    prefix; passes repeat until no rule has more than two tokens.
    """
    new_grammar = defaultdict(list)
    old_grammar = copy.deepcopy(grammar)
    nomulti = False
    while not nomulti:
        for lhs, rules in old_grammar.items():
            for rhs in rules:
                tokens = gettokens(rhs)
                if len(tokens) > 2 and (not isTerminal(rhs)):
                    # Split everything but the last token into its own
                    # (possibly pre-existing) nonterminal.
                    split = tokens[:-1]
                    nonterminal = terminal_exist(' '.join(split), new_grammar)
                    if nonterminal:
                        newrule = ' '.join([nonterminal, tokens[-1]])
                        new_grammar[lhs].append(newrule)
                    else:
                        nonterminal = get_nonterminal()
                        new_grammar[nonterminal].append(' '.join(split))
                        newrule = ' '.join([nonterminal, tokens[-1]])
                        new_grammar[lhs].append(newrule)
                else:
                    new_grammar[lhs].append(rhs)
        # Re-scan: any remaining rule longer than two tokens forces
        # another splitting pass.
        nomulti = True
        for lhs, rules in new_grammar.items():
            for rhs in rules:
                # tokens = rhs.split(' ')
                tokens = gettokens(rhs)
                if len(tokens) > 2 and (not isTerminal(rhs)):
                    nomulti = False
                    break
        if not nomulti:
            # Not converged -- this pass's output becomes the next input.
            old_grammar = copy.deepcopy(new_grammar)
            new_grammar = defaultdict(list)
    return new_grammar
|
|
|
|
def strip_chars(rule):
    """Trim surrounding newlines, tabs, and spaces from a rule string."""
    return rule.strip(" \t\n")
|
|
|
|
def get_nonterminal():
    """Pop and return the next fresh nonterminal name, refilling the
    shared pool via _repopulate() whenever it is empty."""
    global NONTERMINALSET
    if not NONTERMINALSET:
        _repopulate()
    return NONTERMINALSET.pop(0)
|
|
|
|
def _repopulate():
    """Refill NONTERMINALSET with every uppercase-letter combination of
    the current length, then bump the length for the next refill."""
    global COUNT
    global NONTERMINALSET
    NONTERMINALSET = [''.join(combo)
                      for combo in combinations(ascii_uppercase, COUNT)]
    COUNT += 1
|
|
|
|
def terminal_exist(token, grammar):
    """Return the first nonterminal whose rule list contains *token*
    exactly, or None when no such nonterminal exists."""
    return next(
        (lhs for lhs, productions in grammar.items() if token in productions),
        None,
    )
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    import argparse

    # Command-line front end: all three arguments are mandatory.
    cli = argparse.ArgumentParser(
        description='Script to convert grammar to GNF form')
    cli.add_argument('--gf', type=str, required=True,
                     help='Location of grammar file')
    cli.add_argument('--out', type=str, required=True,
                     help='Location of output file')
    cli.add_argument('--start', type=str, required=True,
                     help='Start token')
    options = cli.parse_args()

    main(options.gf, options.out, options.start)
|