Dynamic instrumentation filtering for LLVM native (#1971)

* Add two dynamic instrumentation filter methods to runtime

* Always use pc-table with native pcguard

* Add make_symbol_list.py and README
This commit is contained in:
Christian Holler (:decoder)
2024-01-26 15:46:56 +01:00
committed by GitHub
parent 2f9eeef60c
commit 58b80b68bc
4 changed files with 393 additions and 40 deletions

View File

@ -22,6 +22,10 @@
#define __USE_GNU #define __USE_GNU
#endif #endif
#include <dlfcn.h> #include <dlfcn.h>
__attribute__((weak)) void __sanitizer_symbolize_pc(void *, const char *fmt,
char *out_buf,
size_t out_buf_size);
#endif #endif
#ifdef __ANDROID__ #ifdef __ANDROID__
@ -124,8 +128,8 @@ struct afl_module_info_t {
uintptr_t base_address; uintptr_t base_address;
// PC Guard start/stop // PC Guard start/stop
u32 start; u32 *start;
u32 stop; u32 *stop;
// PC Table begin/end // PC Table begin/end
const uintptr_t *pcs_beg; const uintptr_t *pcs_beg;
@ -147,6 +151,18 @@ afl_module_info_t *__afl_module_info = NULL;
u32 __afl_pcmap_size = 0; u32 __afl_pcmap_size = 0;
uintptr_t *__afl_pcmap_ptr = NULL; uintptr_t *__afl_pcmap_ptr = NULL;
// One entry of the pre-symbolized PC filter list (AFL_PC_FILTER_FILE).
// locate_in_pcs() treats it as the half-open range [start, start + len) and
// is queried with module-relative PCs (PC minus the module base address).
typedef struct {
// First address covered by this entry
uintptr_t start;
// Number of bytes covered, counted from `start`
u32 len;
} FilterPCEntry;
// Number of entries in __afl_filter_pcs
u32 __afl_filter_pcs_size = 0;
// Filter entries read from the filter file; locate_in_pcs() binary-searches
// this array, so it has to be sorted by `start`. NULL until a file is loaded.
FilterPCEntry *__afl_filter_pcs = NULL;
// Module name taken from the first line of the filter file; the filter is
// only applied to modules whose name contains this string (strstr match).
u8 *__afl_filter_pcs_module = NULL;
#endif // __AFL_CODE_COVERAGE #endif // __AFL_CODE_COVERAGE
/* 1 if we are running in afl, and the forkserver was started, else 0 */ /* 1 if we are running in afl, and the forkserver was started, else 0 */
@ -1587,15 +1603,116 @@ void __sanitizer_cov_trace_pc_guard(uint32_t *guard) {
} }
#ifdef __AFL_CODE_COVERAGE #ifdef __AFL_CODE_COVERAGE
void __sanitizer_cov_pcs_init(const uintptr_t *pcs_beg, void afl_read_pc_filter_file(const char *filter_file) {
const uintptr_t *pcs_end) {
if (__afl_debug) { FILE *file;
char ch;
fprintf(stderr, "DEBUG: __sanitizer_cov_pcs_init called\n"); file = fopen(filter_file, "r");
if (file == NULL) {
perror("Error opening file");
return;
} }
// Check how many PCs we expect to read
while ((ch = fgetc(file)) != EOF) {
if (ch == '\n') { __afl_filter_pcs_size++; }
}
// Rewind to actually read the PCs
fseek(file, 0, SEEK_SET);
__afl_filter_pcs = malloc(__afl_filter_pcs_size * sizeof(FilterPCEntry));
if (!__afl_filter_pcs) {
perror("Error allocating PC array");
return;
}
for (size_t i = 0; i < __afl_filter_pcs_size; i++) {
fscanf(file, "%lx", &(__afl_filter_pcs[i].start));
ch = fgetc(file); // Read tab
fscanf(file, "%u", &(__afl_filter_pcs[i].len));
ch = fgetc(file); // Read tab
if (!__afl_filter_pcs_module) {
// Read the module name and store it.
// TODO: We only support one module here right now although
// there is technically no reason to support multiple modules
// in one go.
size_t max_module_len = 255;
size_t i = 0;
__afl_filter_pcs_module = malloc(max_module_len);
while (i < max_module_len - 1 &&
(__afl_filter_pcs_module[i] = fgetc(file)) != '\t') {
++i;
}
__afl_filter_pcs_module[i] = '\0';
fprintf(stderr, "DEBUGXXX: Read module name %s\n",
__afl_filter_pcs_module);
}
while ((ch = fgetc(file)) != '\n' && ch != EOF)
;
}
fclose(file);
}
/* Binary search for `needle` in the sorted __afl_filter_pcs array.

   An entry matches when needle lies in [entry.start, entry.start + entry.len).
   Returns 1 and stores the position of the matching entry in `*index` on a
   hit; returns 0 and leaves `*index` untouched otherwise (including when the
   list is empty). */
u32 locate_in_pcs(uintptr_t needle, u32 *index) {

  // Search window is the half-open index range [lo, hi).
  size_t lo = 0;
  size_t hi = __afl_filter_pcs_size;

  while (lo < hi) {

    // Probe the lower middle element of the window.
    size_t               mid = lo + (hi - 1 - lo) / 2;
    const FilterPCEntry *entry = &__afl_filter_pcs[mid];

    if (entry->start > needle) {

      // Needle lies before this entry: continue in the left half.
      hi = mid;
      continue;

    }

    if (entry->start + entry->len > needle) {

      // Hit
      *index = mid;
      return 1;

    }

    // Needle lies behind this entry: continue in the right half.
    lo = mid + 1;

  }

  return 0;

}
void __sanitizer_cov_pcs_init(const uintptr_t *pcs_beg,
const uintptr_t *pcs_end) {
// If for whatever reason, we cannot get dlinfo here, then pc_guard_init also // If for whatever reason, we cannot get dlinfo here, then pc_guard_init also
// couldn't get it and we'd end up attributing to the wrong module. // couldn't get it and we'd end up attributing to the wrong module.
Dl_info dlinfo; Dl_info dlinfo;
@ -1608,6 +1725,16 @@ void __sanitizer_cov_pcs_init(const uintptr_t *pcs_beg,
} }
if (__afl_debug) {
fprintf(
stderr,
"DEBUG: (%u) __sanitizer_cov_pcs_init called for module %s with %ld "
"PCs\n",
getpid(), dlinfo.dli_fname, pcs_end - pcs_beg);
}
afl_module_info_t *last_module_info = __afl_module_info; afl_module_info_t *last_module_info = __afl_module_info;
while (last_module_info && last_module_info->next) { while (last_module_info && last_module_info->next) {
@ -1623,34 +1750,78 @@ void __sanitizer_cov_pcs_init(const uintptr_t *pcs_beg,
} }
if (strcmp(dlinfo.dli_fname, last_module_info->name)) {
// This can happen with modules being loaded after the forkserver
// where we decide to not track the module. In that case we must
// not track it here either.
fprintf(
stderr,
"WARNING: __sanitizer_cov_pcs_init module info mismatch: %s vs %s\n",
dlinfo.dli_fname, last_module_info->name);
return;
}
last_module_info->pcs_beg = pcs_beg; last_module_info->pcs_beg = pcs_beg;
last_module_info->pcs_end = pcs_end; last_module_info->pcs_end = pcs_end;
// This is a direct filter based on symbolizing inside the runtime.
// It should only be used with smaller binaries to avoid long startup
// times. Currently, this only supports a single token to scan for.
const char *pc_filter = getenv("AFL_PC_FILTER");
// This is a much faster PC filter based on pre-symbolized input data
// that is sorted for fast lookup through binary search. This method
// of filtering is suitable even for very large binaries.
const char *pc_filter_file = getenv("AFL_PC_FILTER_FILE");
if (pc_filter_file && !__afl_filter_pcs) {
afl_read_pc_filter_file(pc_filter_file);
}
// Now update the pcmap. If this is the last module coming in, after all // Now update the pcmap. If this is the last module coming in, after all
// pre-loaded code, then this will also map all of our delayed previous // pre-loaded code, then this will also map all of our delayed previous
// modules. // modules.
//
if (!__afl_pcmap_ptr) { return; }
for (afl_module_info_t *mod_info = __afl_module_info; mod_info; for (afl_module_info_t *mod_info = __afl_module_info; mod_info;
mod_info = mod_info->next) { mod_info = mod_info->next) {
if (mod_info->mapped) { continue; } if (mod_info->mapped) { continue; }
if (!mod_info->start) {
fprintf(stderr,
"ERROR: __sanitizer_cov_pcs_init called with mod_info->start == "
"NULL (%s)\n",
mod_info->name);
abort();
}
PCTableEntry *start = (PCTableEntry *)(mod_info->pcs_beg); PCTableEntry *start = (PCTableEntry *)(mod_info->pcs_beg);
PCTableEntry *end = (PCTableEntry *)(mod_info->pcs_end); PCTableEntry *end = (PCTableEntry *)(mod_info->pcs_end);
if (!*mod_info->stop) { continue; }
u32 in_module_index = 0; u32 in_module_index = 0;
while (start < end) { while (start < end) {
if (mod_info->start + in_module_index >= __afl_map_size) { if (*mod_info->start + in_module_index >= __afl_map_size) {
fprintf(stderr, "ERROR: __sanitizer_cov_pcs_init out of bounds?!\n"); fprintf(stderr,
"ERROR: __sanitizer_cov_pcs_init out of bounds?! Start: %u "
"Stop: %u Map Size: %u (%s)\n",
*mod_info->start, *mod_info->stop, __afl_map_size,
mod_info->name);
abort(); abort();
} }
u32 orig_start_index = *mod_info->start;
uintptr_t PC = start->PC; uintptr_t PC = start->PC;
// This is what `GetPreviousInstructionPc` in sanitizer runtime does // This is what `GetPreviousInstructionPc` in sanitizer runtime does
@ -1660,7 +1831,58 @@ void __sanitizer_cov_pcs_init(const uintptr_t *pcs_beg,
// Calculate relative offset in module // Calculate relative offset in module
PC = PC - mod_info->base_address; PC = PC - mod_info->base_address;
__afl_pcmap_ptr[mod_info->start + in_module_index] = PC; if (__afl_pcmap_ptr) {
__afl_pcmap_ptr[orig_start_index + in_module_index] = PC;
}
if (pc_filter) {
char PcDescr[1024];
// This function is a part of the sanitizer run-time.
// To use it, link with AddressSanitizer or other sanitizer.
__sanitizer_symbolize_pc((void *)start->PC, "%p %F %L", PcDescr,
sizeof(PcDescr));
if (strstr(PcDescr, pc_filter)) {
if (__afl_debug)
fprintf(
stderr,
"DEBUG: Selective instrumentation match: %s (PC %p Index %u)\n",
PcDescr, (void *)start->PC,
*(mod_info->start + in_module_index));
// No change to guard needed
} else {
// Null out the guard to disable this edge
*(mod_info->start + in_module_index) = 0;
}
}
if (__afl_filter_pcs && strstr(mod_info->name, __afl_filter_pcs_module)) {
u32 result_index;
if (locate_in_pcs(PC, &result_index)) {
if (__afl_debug)
fprintf(stderr,
"DEBUG: Selective instrumentation match: (PC %lx File "
"Index %u PC Index %u)\n",
PC, result_index, in_module_index);
} else {
// Null out the guard to disable this edge
*(mod_info->start + in_module_index) = 0;
}
}
start++; start++;
in_module_index++; in_module_index++;
@ -1671,8 +1893,10 @@ void __sanitizer_cov_pcs_init(const uintptr_t *pcs_beg,
if (__afl_debug) { if (__afl_debug) {
fprintf(stderr, "DEBUG: __sanitizer_cov_pcs_init initialized %u PCs\n", fprintf(stderr,
in_module_index); "DEBUG: __sanitizer_cov_pcs_init successfully mapped %s with %u "
"PCs\n",
mod_info->name, in_module_index);
} }
@ -1706,9 +1930,9 @@ void __sanitizer_cov_trace_pc_guard_init(uint32_t *start, uint32_t *stop) {
fprintf( fprintf(
stderr, stderr,
"DEBUG: Running __sanitizer_cov_trace_pc_guard_init: %p-%p (%lu edges) " "DEBUG: Running __sanitizer_cov_trace_pc_guard_init: %p-%p (%lu edges) "
"after_fs=%u\n", "after_fs=%u *start=%u\n",
start, stop, (unsigned long)(stop - start), start, stop, (unsigned long)(stop - start),
__afl_already_initialized_forkserver); __afl_already_initialized_forkserver, *start);
} }
@ -1740,8 +1964,8 @@ void __sanitizer_cov_trace_pc_guard_init(uint32_t *start, uint32_t *stop) {
mod_info->id = last_module_info ? last_module_info->id + 1 : 0; mod_info->id = last_module_info ? last_module_info->id + 1 : 0;
mod_info->name = strdup(dlinfo.dli_fname); mod_info->name = strdup(dlinfo.dli_fname);
mod_info->base_address = (uintptr_t)dlinfo.dli_fbase; mod_info->base_address = (uintptr_t)dlinfo.dli_fbase;
mod_info->start = 0; mod_info->start = NULL;
mod_info->stop = 0; mod_info->stop = NULL;
mod_info->pcs_beg = NULL; mod_info->pcs_beg = NULL;
mod_info->pcs_end = NULL; mod_info->pcs_end = NULL;
mod_info->mapped = 0; mod_info->mapped = 0;
@ -1757,8 +1981,12 @@ void __sanitizer_cov_trace_pc_guard_init(uint32_t *start, uint32_t *stop) {
} }
fprintf(stderr, "[pcmap] Module: %s Base Address: %p\n", dlinfo.dli_fname, if (__afl_debug) {
dlinfo.dli_fbase);
fprintf(stderr, "[pcmap] Module: %s Base Address: %p\n",
dlinfo.dli_fname, dlinfo.dli_fbase);
}
} }
@ -1861,12 +2089,17 @@ void __sanitizer_cov_trace_pc_guard_init(uint32_t *start, uint32_t *stop) {
#ifdef __AFL_CODE_COVERAGE #ifdef __AFL_CODE_COVERAGE
if (mod_info) { if (mod_info) {
mod_info->start = *orig_start; if (!mod_info->start) {
mod_info->stop = *(stop - 1);
mod_info->start = orig_start;
mod_info->stop = stop - 1;
}
if (__afl_debug) { if (__afl_debug) {
fprintf(stderr, "DEBUG: [pcmap] Start Index: %u Stop Index: %u\n", fprintf(stderr, "DEBUG: [pcmap] Start Index: %u Stop Index: %u\n",
mod_info->start, mod_info->stop); *(mod_info->start), *(mod_info->stop));
} }

View File

@ -1920,8 +1920,6 @@ void add_native_pcguard(aflcc_state_t *aflcc) {
/* If llvm-config doesn't figure out LLVM_MAJOR, just /* If llvm-config doesn't figure out LLVM_MAJOR, just
go on anyway and let compiler complain if doesn't work. */ go on anyway and let compiler complain if doesn't work. */
if (aflcc->instrument_opt_mode & INSTRUMENT_OPT_CODECOV) {
#if LLVM_MAJOR > 0 && LLVM_MAJOR < 6 #if LLVM_MAJOR > 0 && LLVM_MAJOR < 6
FATAL("pcguard instrumentation with pc-table requires LLVM 6.0.1+"); FATAL("pcguard instrumentation with pc-table requires LLVM 6.0.1+");
#else #else
@ -1930,25 +1928,19 @@ void add_native_pcguard(aflcc_state_t *aflcc) {
"pcguard instrumentation with pc-table requires LLVM 6.0.1+" "pcguard instrumentation with pc-table requires LLVM 6.0.1+"
" otherwise the compiler will fail"); " otherwise the compiler will fail");
#endif #endif
if (aflcc->instrument_opt_mode & INSTRUMENT_OPT_CODECOV) {
insert_param(aflcc, insert_param(aflcc,
"-fsanitize-coverage=trace-pc-guard,bb,no-prune,pc-table"); "-fsanitize-coverage=trace-pc-guard,bb,no-prune,pc-table");
#endif
} else { } else {
#if LLVM_MAJOR > 0 && LLVM_MAJOR < 4 insert_param(aflcc, "-fsanitize-coverage=trace-pc-guard,pc-table");
FATAL("pcguard instrumentation requires LLVM 4.0.1+");
#else
#if LLVM_MAJOR == 0
WARNF(
"pcguard instrumentation requires LLVM 4.0.1+"
" otherwise the compiler will fail");
#endif
insert_param(aflcc, "-fsanitize-coverage=trace-pc-guard");
#endif
} }
#endif
} }
void add_optimized_pcguard(aflcc_state_t *aflcc) { void add_optimized_pcguard(aflcc_state_t *aflcc) {

View File

@ -0,0 +1,55 @@
# Dynamic Instrumentation Filter
Sometimes it can be beneficial to limit the instrumentation feedback to
specific code locations. It is possible to do so at compile-time by simply
not instrumenting any undesired locations. However, there are situations
where doing this dynamically without requiring a new build can be beneficial.
Especially when dealing with larger builds, it is much more convenient to
select the target code locations at runtime instead of doing so at build time.
There are two ways of doing this in AFL++:
## Simple Selection with `AFL_PC_FILTER`
This approach requires a build with `AFL_INSTRUMENTATION=llvmnative` or
`llvmcodecov` as well as an AddressSanitizer build with debug information.
By setting the environment variable `AFL_PC_FILTER` to a string, the runtime
symbolizer is enabled in the AFL++ runtime. At startup, the runtime will call
the `__sanitizer_symbolize_pc` API to resolve every PC in every loaded module.
The runtime then matches the result using `strstr` and disables the PC guard
if the symbolized PC does not contain the specified string.
This approach has the benefit of being very easy to use. The downside is that
it causes significant startup delays with large binaries and that it requires
an AddressSanitizer build.
This method has no additional runtime overhead after startup.
## Selection using pre-symbolized data file with `AFL_PC_FILTER_FILE`
To avoid large startup time delays, a specific module can be pre-symbolized
using the `make_symbol_list.py` script. This script outputs a sorted list of
functions with their respective relative offsets and lengths in the target
binary:
`python3 make_symbol_list.py libxul.so > libxul.symbols.txt`
The resulting list can be filtered, e.g. using grep:
`grep -i "webgl" libxul.symbols.txt > libxul.webgl.symbols.txt`
Finally, you can run with `AFL_PC_FILTER_FILE=libxul.webgl.symbols.txt` to
restrict instrumentation feedback to the given locations. This approach only
has a minimal startup time delay due to the implementation only using binary
search on the given file per PC rather than reading debug information for every
PC. It also works well with Nyx, where symbolizing is usually disabled for the
target process to avoid delays with frequent crashes.
Similar to the previous method, this approach requires a build with
`AFL_INSTRUMENTATION=llvmnative` or `llvmcodecov` as well as debug information.
However, it does not require the ASan runtime as it doesn't do the symbolizing
in process. Due to the way it maps PCs to symbols, it is less accurate when it
comes to includes and inlines (it assumes all PCs within a function belong to
that function and originate from the same file). For most purposes, this should
be a reasonable simplification to quickly process even the largest binaries.

View File

@ -0,0 +1,73 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Written by Christian Holler <decoder at mozilla dot com>
import json
import os
import sys
import subprocess
def parse_symbol_table(objdump_output):
    """Extract symbol addresses and sizes from ``objdump -t`` output.

    Lines whose first column is not a hexadecimal address are ignored.
    Lines for which no size column can be found are reported on stderr and
    skipped, so that neither the message nor a bogus length ends up in the
    generated list.

    Returns a tuple ``(addrs, addr2len)``: ``addrs`` is the list of unique
    start addresses as ``hex()`` strings in ascending numeric order and
    ``addr2len`` maps each of those strings to the size as a decimal string.
    """
    lengths = {}

    for line in objdump_output.splitlines():
        components = line.split()
        if not components:
            continue

        try:
            start_addr = int(components[0], 16)
        except ValueError:
            continue

        # Length has variable position in objdump output
        length = None
        for comp in components[1:]:
            if len(comp) != 16:
                continue
            try:
                length = int(comp, 16)
            except ValueError:
                continue
            break

        if length is None:
            print(
                "ERROR: Couldn't determine function section length: %s" % line,
                file=sys.stderr,
            )
            continue

        # Aliased symbols share an address; one row per address is enough.
        lengths[start_addr] = length

    # The search implementation in the AFL runtime expects everything sorted.
    addrs = [hex(addr) for addr in sorted(lengths)]
    addr2len = {hex(addr): str(length) for addr, length in lengths.items()}
    return addrs, addr2len


def format_symbol_lines(addr2line_output, addr2len):
    """Turn ``llvm-addr2line --output-style=JSON`` output into list rows.

    ``addr2line_output`` holds one JSON object per line. Objects without a
    non-empty ``Symbol`` list or with an address missing from ``addr2len``
    are dropped. Each returned row is the tab-separated string
    ``address, length, module basename, file name, function name``.
    """
    rows = []

    for raw in addr2line_output.splitlines():
        if not raw.strip():
            continue

        record = json.loads(raw)
        symbols = record.get("Symbol")
        address = record.get("Address")
        if not symbols or address not in addr2len:
            continue

        rows.append("\t".join([
            address,
            addr2len[address],
            os.path.basename(record["ModuleName"]),
            symbols[0]["FileName"],
            symbols[0]["FunctionName"],
        ]))

    return rows


def main(argv=None):
    """Print the sorted symbol list for the binary named in ``argv[1]``.

    Returns the process exit status: 0 on success, 1 on usage errors or when
    the binary has no usable symbols.
    """
    argv = sys.argv if argv is None else argv

    if len(argv) != 2:
        print("Usage: %s binfile" % os.path.basename(argv[0]), file=sys.stderr)
        return 1

    binfile = argv[1]

    symtab = subprocess.check_output(["objdump", "-t", binfile]).decode("utf-8")
    addrs, addr2len = parse_symbol_table(symtab)

    if not addrs:
        print("ERROR: No symbols found in %s" % binfile, file=sys.stderr)
        return 1

    # We symbolize in one go to speed things up with large binaries.
    symbolized = subprocess.check_output(
        [
            "llvm-addr2line",
            "--output-style=JSON",
            "-f", "-C", "-a", "-e",
            binfile,
        ],
        input="\n".join(addrs).encode("utf-8"),
    ).decode("utf-8")

    for row in format_symbol_lines(symbolized, addr2len):
        print(row)

    return 0


if __name__ == "__main__":
    sys.exit(main())