Update the code and README

- add readme
- add required qlpack.yml
This commit is contained in:
microsvuln
2021-04-03 02:02:45 +04:00
parent 565f61a6ab
commit a26ed3b758
6 changed files with 378 additions and 3 deletions

View File

@ -0,0 +1,188 @@
#!/usr/bin/env python3
import os
import string
import binascii
import codecs
import errno
import struct
import argparse
import shutil
import subprocess
from binascii import unhexlify
def ensure_dir(dir):
    """Create *dir* (and any missing parents); an already-existing directory is fine."""
    try:
        os.makedirs(dir)
    except OSError as err:
        # "Already exists" is the expected benign case; anything else is real.
        if err.errno == errno.EEXIST:
            return
        raise
def parse_args():
    """Parse the three positional CLI arguments: cur, db and tokenpath."""
    cli = argparse.ArgumentParser(description=(
        "Helper - Specify input file analysis and output folder to save corpus for strings in the overall project --------------------------------------------------------------------------- Example usage : python2 thisfile.py outdir str.txt"))
    cli.add_argument("cur", help="Current Path")
    cli.add_argument("db", help="CodeQL database Path")
    cli.add_argument("tokenpath", help="Destination directory for tokens")
    return cli.parse_args()
def static_analysis(file, file2, cur, db):
    """Run the CodeQL query `file2` (relative to `cur`) against database `db`
    and save the query's stdout into `cur`/`file`.

    Fix: the original built a shell command by string concatenation and ran it
    through os.popen(), which breaks on paths containing spaces and is
    injection-prone; an argument list with no shell is used instead.  The
    redundant f.close() inside the with-block is also gone.
    """
    out_path = cur + "/" + file
    query_path = cur + "/" + file2
    print(out_path)
    with open(out_path, "w") as f:
        # stdout streams straight into the output file; stderr still reaches
        # the console so query errors remain visible.
        subprocess.run(["codeql", "query", "run", query_path, "-d", db],
                       stdout=f)
def copy_tokens(cur, tokenpath):
    """Collect every generated token file from the per-query output
    directories under `cur` into `cur`/`tokenpath`.

    Fixes: the original shelled out to `cp` with globs (shell=True, breaks on
    spaces, prints errors for missing dirs) and copied "arrays-lits" twice;
    this version copies in pure Python and deduplicates the directory list.
    """
    token_dirs = [
        "arrays-lits", "strstr-strs", "strcmp-strs", "strncmp-strs",
        "local-strs", "memcmp-strs", "global-strs", "lits",
        "arrays-strs", "strtool-strs",
    ]
    dest = cur + "/" + tokenpath
    for d in token_dirs:
        src = cur + "/" + d
        if not os.path.isdir(src):
            # A query that produced nothing leaves no directory; skip it.
            continue
        for name in os.listdir(src):
            path = os.path.join(src, name)
            if os.path.isfile(path):
                shutil.copy(path, dest)
def codeql_analysis(cur, db):
    """Run every bundled CodeQL query against the database `db`, then launch
    the token post-processing scripts.

    Each static_analysis() call writes one query's raw results into
    `cur`/<out-file>; start_aflql() then turns those results into token files.
    Fix: the large block of dead, commented-out subprocess experimentation was
    removed.
    """
    static_analysis("litout.out", "litool.ql", cur, db)
    static_analysis("strcmp-strings.out", "strcmp-str.ql", cur, db)
    static_analysis("strncmp-strings.out", "strncmp-str.ql", cur, db)
    static_analysis("strstr-strings.out", "strstr-str.ql", cur, db)
    static_analysis("memcmp-strings.out", "memcmp-str.ql", cur, db)
    static_analysis("global-values-strings.out", "globals-values.ql", cur, db)
    static_analysis("local-strings.out", "locals-strs.ql", cur, db)
    static_analysis("strtool-strings.out", "strtool.ql", cur, db)
    static_analysis("arrays.out", "array-literals.ql", cur, db)
    # start_aflql ignores its first (tokenpath) parameter, so 0 is harmless.
    start_aflql(0, cur)
def start_aflql(tokenpath, cur):
    """Run each token post-processing helper script over its query output.

    Every helper lives next to this driver in `cur` and takes two arguments:
    the directory to write token files into and the raw query output file to
    read.  The helpers run sequentially and their (stdout, stderr) tuples —
    (None, None) because no pipes are attached — are printed for logging.

    `tokenpath` is accepted for interface compatibility but is not used.
    Fix: ten copy-pasted Popen stanzas collapsed into one data-driven loop.
    """
    # (helper script, token output directory, raw query output file)
    jobs = [
        ("litan.py",           "lits/",         "litout.out"),
        ("strcmp-strings.py",  "strcmp-strs/",  "strcmp-strings.out"),
        ("strncmp-strings.py", "strncmp-strs/", "strncmp-strings.out"),
        ("array-lits.py",      "arrays-lits/",  "arrays.out"),
        ("array-strings.py",   "arrays-strs/",  "arrays.out"),
        ("memcmp-strings.py",  "memcmp-strs/",  "memcmp-strings.out"),
        ("globals-strings.py", "global-strs/",  "global-values-strings.out"),
        ("strstr-strings.py",  "strstr-strs/",  "strstr-strings.out"),
        ("stan-strings.py",    "strtool-strs/", "strtool-strings.out"),
        ("local-strings.py",   "local-strs/",   "local-strings.out"),
    ]
    for script, outdir, infile in jobs:
        command = [
            "python3",
            cur + "/" + script,
            cur + "/" + outdir,
            cur + "/" + infile,
        ]
        worker = subprocess.Popen(command)
        print(worker.communicate())
def main():
    """Entry point: parse arguments, run the CodeQL queries, collect tokens."""
    options = parse_args()
    ensure_dir(options.tokenpath)
    codeql_analysis(options.cur, options.db)
    copy_tokens(options.cur, options.tokenpath)


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,17 @@
#!/bin/bash
# Install the CodeQL CLI plus the CodeQL standard query libraries under
# ~/codeql-home and put the CLI on PATH (for this session and in ~/.bashrc).
# Fix: without `set -e` the original kept going after a failed apt/clone/
# download/unzip and left a half-broken install behind.
set -e

cd ~
if [ -d "codeql-home" ]; then
    echo "Exist !"
    exit 1
fi

# Build prerequisites for fuzzing targets plus general tooling.
sudo apt install build-essential libtool-bin python3-dev automake git vim wget -y

mkdir codeql-home
cd codeql-home

# CodeQL standard query libraries and the Go extractor.
git clone https://github.com/github/codeql.git codeql-repo
git clone https://github.com/github/codeql-go.git

# Pinned CLI release, kept in sync with the query libraries above.
wget https://github.com/github/codeql-cli-binaries/releases/download/v2.4.6/codeql-linux64.zip
unzip codeql-linux64.zip
mv codeql codeql-cli

export "PATH=~/codeql-home/codeql-cli/:$PATH"

# Sanity checks: both should succeed if the install worked.
codeql resolve languages
codeql resolve qlpacks

echo "export PATH=~/codeql-home/codeql-cli/:$PATH" >> ~/.bashrc

View File

@ -0,0 +1,86 @@
#!/usr/bin/env python3
# Autodict-QL - Optimal token generation for fuzzing
# Part of AFL++ Project
# Author : Microsvuln - Arash.vre@gmail.com
import string
import os
import binascii
import codecs
import struct
import errno
import argparse
import re
import base64
from binascii import unhexlify
def parse_args():
    """Parse the two positional CLI arguments: corpdir and infile."""
    cli = argparse.ArgumentParser(description=(
        "Helper - Specify input file to analysis and output folder to save corpdirus for constants in the overall project ------- Example usage : python2 thisfile.py outdir o.txt"))
    cli.add_argument("corpdir", help="The path to the corpus directory to generate files.")
    cli.add_argument("infile", help="Specify file output of codeql analysis - ex. ooo-hex.txt, analysis take place on this file, example : python2 thisfile.py outdir out.txt")
    return cli.parse_args()
def ensure_dir(dir):
    """Create the corpus directory `dir`, aborting if it already exists.

    The directory must be fresh so previously generated seeds never mix with
    the new ones.  Fixes: the exit now prints the explanation that was left
    as a commented-out Python-2 print, and OSErrors other than EEXIST are
    re-raised instead of being silently swallowed.
    """
    try:
        os.makedirs(dir)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print("[-] Directory exists, specify another directory")
            exit(1)
        raise
def do_analysis1(corpdir, infile):
    # Parse CodeQL literal-query output (`infile`) and emit one seed file per
    # usable constant into `corpdir` (lit-seed1, lit-seed2, ...).
    # NOTE(review): this listing arrived with its indentation stripped; the
    # nesting below is a reconstruction — confirm against the original file.
    with open(infile, "rb") as f:
        lines = f.readlines()[1:]  # drop the first line (result-table header)
    f.close()  # redundant: the with-statement already closed f
    new_lst = []
    n = 1  # numeric suffix for the generated lit-seed<n> file names
    for i, num in enumerate(lines):
        if i != 0:  # skips one more leading row of the table
            new_lst.append(num)
            str1 = str(num)  # num is a bytes row; str() yields "b'...'"
            print ("num is " + str1)
            # Strip table decoration (pipes, newlines, spaces, punctuation)
            # so only bare hex digits remain.
            str1 = str1.rstrip('\n\n')
            #str1 = str1.replace("0x","");
            str1 = str1.replace("|","")
            str1 = str1.rstrip('\r\n')
            str1 = str1.rstrip('\n')
            str1 = str1.replace(" ","")
            #str1 = str1.translate(None, string.punctuation)
            translator=str.maketrans('','',string.punctuation)
            str1=str1.translate(translator)
            # Drop the leading "b" and trailing quote residue of str(bytes).
            str1 = str1[1:]
            str1 = str1[:-1]
            print("After cleanup : " + str1)
            # Keep values that are not 0/-1/-2 sentinels, or that look like
            # 16/32-bit constants (4 or 8 hex digits).
            # NOTE(review): `and` binds tighter than `or`, so the length
            # checks bypass the sentinel filter — possibly unintended.
            if (str1 != '0') and (str1 != 'ffffffff') and (str1 != 'fffffffe') or (len(str1) == 4) or (len(str1) == 8):
                print ("first : "+str1)
                if len(str1) > 8 :
                    str1 = str1[:-1]  # trim over-long values down toward 8 digits
                elif (len(str1) == 5) :
                    str1 = str1 = "0"  # 5-digit values are discarded as noise
                try:
                    #str1 = str1.decode("hex")
                    # Write the constant as 4 little-endian bytes (stringified).
                    with open(corpdir+'/lit-seed{0}'.format(n), 'w') as file:
                        str1 = str1.replace("0x","");
                        print (str1)
                        str1 = int(str1,base=16)
                        str1 = str1.to_bytes(4, byteorder='little')
                        file.write(str(str1))
                        file.close()
                    # Re-read the seed and strip the leading "b" of the
                    # stringified bytes literal before rewriting it in place.
                    with open (corpdir+'/lit-seed{0}'.format(n), 'r') as q :
                        a = q.readline()
                        a = a[1:]
                        print ("AFL++ Autodict-QL by Microsvuln : Writing Token :" + str(a))
                        q.close()
                    with open (corpdir+'/lit-seed{0}'.format(n), 'w') as w1 :
                        w1.write(str(a))
                        print ("Done!")
                        w1.close()
                except:  # bare except: any parse/convert failure logs and moves on
                    print("Error!")
                n = n+1
def main():
    """Entry point: validate the output directory, then generate the seeds."""
    options = parse_args()
    ensure_dir(options.corpdir)
    do_analysis1(options.corpdir, options.infile)


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,3 @@
name: automate
version: 0.0.0
libraryPathDependencies: codeql-cpp

View File

@ -0,0 +1,81 @@
# Autodict-QL - Optimal Token Generation for Fuzzing
## What is this?
Autodict-QL is a plugin system that enables fast generation of Tokens/Dictionaries in a handy way that can be manipulated by the user (Unlike The LLVM Passes that are hard to modify). This means that autodict-ql is a scriptable feature which basically uses the CodeQL (A powerful semantic code analysis engine) to fetch information from a code base.
Tokens are useful when you perform fuzzing on different parsers. The AFL++ `-x` switch enables the usage of dictionaries through your fuzzing campaign. If you are not familiar with dictionaries in fuzzing, take a look [here](https://github.com/AFLplusplus/AFLplusplus/tree/stable/dictionaries).
## Why CodeQL ?
We basically developed this plugin on top of CodeQL engine because it gives the user scripting features, it's easier and it's independent of the LLVM system. This means that a user can write his CodeQL scripts or modify the current scripts to improve or change the token generation algorithms based on different program analysis concepts.
## CodeQL scripts
Currently, we pushed some scripts as defaults for token generation. In addition, we provide every CodeQL script as a standalone script because it's easier to modify or test.
Currently we provided the following CodeQL scripts :
`strcmp-str.ql` is used to extract strings that are related to `strcmp` function.
`strncmp-str.ql` is used to extract the strings from the `strncmp` function.
`memcmp-str.ql` is used to extract the strings from the `memcmp` function.
`litool.ql` extracts Magic numbers as Hexadecimal format.
`strtool.ql` extracts strings using a regex and dataflow concepts to capture the string comparison functions. If `strcmp` is rewritten in a project as `Mystrcmp` or something like `strmycmp`, then this script can catch the arguments, and these are valuable tokens.
You can write other CodeQL scripts to extract possible effective tokens if you think they can be useful.
## Usage
The usage of Autodict-QL is pretty easy. But let's describe it as :
1. First of all, you need to have CodeQL installed on the system. We make this possible with the `build-codeql.sh` bash script. This script will install CodeQL completely and will set the required environment variables for your system, so:
` # chmod +x build-codeql.sh`
` # ./build-codeql.sh`
` # codeql `
Then you should get :
` Usage: codeql <command> <argument>...
Create and query CodeQL databases, or work with the QL language.
GitHub makes this program freely available for the analysis of open-source software and certain other uses, but it is
not itself free software. Type codeql --license to see the license terms.
--license Show the license terms for the CodeQL toolchain.
Common options:
-h, --help Show this help text.
-v, --verbose Incrementally increase the number of progress messages printed.
-q, --quiet Incrementally decrease the number of progress messages printed.
Some advanced options have been hidden; try --help -v for a fuller view.
Commands:
query Compile and execute QL code.
bqrs Get information from .bqrs files.
database Create, analyze and process CodeQL databases.
dataset [Plumbing] Work with raw QL datasets.
test Execute QL unit tests.
resolve [Deep plumbing] Helper commands to resolve disk locations etc.
execute [Deep plumbing] Low-level commands that need special JVM options.
version Show the version of the CodeQL toolchain.
generate Generate formatted QL documentation.
github Commands useful for interacting with the GitHub API through CodeQL.
`
2. Compile your project with CodeQL: For using the Autodict-QL plugin, you need to compile the source of the target you want to fuzz with CodeQL. This is not hard.
- First you need to create a CodeQL database of the project codebase. Suppose we want to compile libxml with CodeQL; go to the libxml directory and issue the following commands:
- `./configure --disable-shared`
- `codeql create database libxml-db --language=cpp --command=make`
- Now you have the CodeQL database of the project :-)
3. To run the Autodict-QL, the final step is to just create a folder named `automate` in the project you want to fuzz.
- `mkdir automate` (inside the libxml directory)
4. The next step is to upgrade the CodeQL database you created in step 2 from inside the `automate` directory you created in step 3:
- `codeql database upgrade ../libxml-db`
5. Everything is set! :-), now you should issue the following to get the tokens :
- `python3 autodict-ql.py [CURRENT_DIR] [CODEQL_DATABASE_PATH] [TOKEN_PATH]`
- example : `python3 autodict-ql.py /home/user/libxml/automate /home/user/libxml/libxml-db tokens`
- This will create the final `tokens` dir for you and you are done, then pass the tokens path to afl `-x` flag.
6. Done!

View File

@ -3,8 +3,8 @@ import semmle.code.cpp.dataflow.DataFlow
class StringLiteralNode extends DataFlow::Node {
StringLiteralNode() { this.asExpr() instanceof StringLiteral }
}
class MemcmpArgNode extends DataFlow::Node {
MemcmpArgNode() {
class CmpArgNode extends DataFlow::Node {
CmpArgNode() {
exists(FunctionCall fc |
fc.getTarget().getName().regexpMatch(".*(str|mem|strn|b)*(cmp|str)*") and
fc.getArgument(0) = this.asExpr()
@ -17,7 +17,7 @@ class MemcmpArgNode extends DataFlow::Node {
}
}
from StringLiteralNode src, MemcmpArgNode arg
from StringLiteralNode src, CmpArgNode arg
where
DataFlow::localFlow(src, arg)