```python
# benchmark.ipynb
# Part of the aflplusplus project, requires an ipynb (Jupyter) editor or viewer.
# Author: Chris Ball <chris@printf.net>
import json

import pandas as pd

with open("benchmark-results.jsonl") as f:
    lines = f.read().splitlines()
json_lines = [json.loads(line) for line in lines]
```
## Translate the JSON Lines entries into a single pandas DataFrame
We have JSON Lines in `benchmark-results.jsonl` that look like this:
```python
print(json.dumps(json.loads(lines[0]), indent=2))
```
{ "config": { "afl_persistent_config": true, "afl_system_config": true, "afl_version": "++4.09a", "comment": "i9-9900k, 16GB DDR4-3000, Arch Linux", "compiler": "clang version 16.0.6", "target_arch": "x86_64-pc-linux-gnu" }, "hardware": { "cpu_fastest_core_mhz": 4788.77, "cpu_model": "Intel(R) Core(TM) i9-9900K CPU @ 3.60GHz", "cpu_threads": 16 }, "targets": { "test-instr": { "singlecore": { "execs_per_sec": 9845.64, "execs_total": 98545, "fuzzers_used": 1 } } } }
The `pd.json_normalize()` method translates this into a flat table that we can perform queries against:
```python
df = pd.json_normalize(json_lines)
df.head()
```
| | config.afl_persistent_config | config.afl_system_config | config.afl_version | config.comment | config.compiler | config.target_arch | hardware.cpu_fastest_core_mhz | hardware.cpu_model | hardware.cpu_threads | targets.test-instr.singlecore.execs_per_sec | ... | targets.test-instr.singlecore.fuzzers_used | targets.test-instr-persist-shmem.singlecore.execs_per_sec | targets.test-instr-persist-shmem.singlecore.execs_total | targets.test-instr-persist-shmem.singlecore.fuzzers_used | targets.test-instr-persist-shmem.multicore.execs_per_sec | targets.test-instr-persist-shmem.multicore.execs_total | targets.test-instr-persist-shmem.multicore.fuzzers_used | targets.test-instr.multicore.execs_per_sec | targets.test-instr.multicore.execs_total | targets.test-instr.multicore.fuzzers_used |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | True | True | ++4.09a | i9-9900k, 16GB DDR4-3000, Arch Linux | clang version 16.0.6 | x86_64-pc-linux-gnu | 4788.770 | Intel(R) Core(TM) i9-9900K CPU @ 3.60GHz | 16 | 9845.64 | ... | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | True | True | ++4.09a | i9-9900k, 16GB DDR4-3000, Arch Linux | clang version 16.0.6 | x86_64-pc-linux-gnu | 4989.281 | Intel(R) Core(TM) i9-9900K CPU @ 3.60GHz | 16 | NaN | ... | NaN | 125682.73 | 1257330.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN |
2 | True | True | ++4.09a | i9-9900k, 16GB DDR4-3000, Arch Linux | clang version 16.0.6 | x86_64-pc-linux-gnu | 4799.415 | Intel(R) Core(TM) i9-9900K CPU @ 3.60GHz | 16 | NaN | ... | NaN | NaN | NaN | NaN | 120293.77 | 1203058.0 | 1.0 | NaN | NaN | NaN |
3 | True | True | ++4.09a | i9-9900k, 16GB DDR4-3000, Arch Linux | clang version 16.0.6 | x86_64-pc-linux-gnu | 4703.293 | Intel(R) Core(TM) i9-9900K CPU @ 3.60GHz | 16 | NaN | ... | NaN | NaN | NaN | NaN | 231429.96 | 2314531.0 | 2.0 | NaN | NaN | NaN |
4 | True | True | ++4.09a | i9-9900k, 16GB DDR4-3000, Arch Linux | clang version 16.0.6 | x86_64-pc-linux-gnu | 4800.375 | Intel(R) Core(TM) i9-9900K CPU @ 3.60GHz | 16 | NaN | ... | NaN | NaN | NaN | NaN | 346759.33 | 3468290.0 | 3.0 | NaN | NaN | NaN |
5 rows × 21 columns
## Graph prep
We're looking for a line graph with a line for each fuzz target, in both singlecore and multicore modes, under each config setting -- where the x-axis is the number of parallel fuzzers, and the y-axis is execs_per_sec.
First, a quick check that the number of rows matches what we'd intuitively expect:
```python
i7 = df.query("`config.comment` == 'i9-9900k, 16GB DDR4-3000, Arch Linux'")
assert len(i7) == 185
```
```python
def build_graphdf_from_query(query: pd.DataFrame):
    """Build a table suitable for graphing from a subset of the dataframe."""
    graphdata = []
    max_fuzzers = int(
        query[
            [
                "targets.test-instr-persist-shmem.multicore.fuzzers_used",
                "targets.test-instr.multicore.fuzzers_used",
            ]
        ]
        .max(axis=1)
        .max(axis=0)
    )
    for _, row in query.iterrows():
        for target in ["test-instr-persist-shmem", "test-instr"]:
            for mode in ["multicore", "singlecore"]:
                label = ""
                # Skip rows with no result for this target/mode (NaN > 0 is False).
                if not row[f"targets.{target}.{mode}.execs_per_sec"] > 0:
                    continue
                execs_per_sec = row[f"targets.{target}.{mode}.execs_per_sec"]
                parallel_fuzzers = row[f"targets.{target}.{mode}.fuzzers_used"]
                afl_persistent_config = row["config.afl_persistent_config"]
                afl_system_config = row["config.afl_system_config"]
                if target == "test-instr-persist-shmem":
                    label += "shmem"
                else:
                    label += "base"
                if mode == "multicore":
                    label += "-multicore"
                else:
                    label += "-singlecore"
                if afl_persistent_config:
                    label += "+persist-conf"
                if afl_system_config:
                    label += "+system-conf"
                if label == "shmem-multicore+persist-conf+system-conf":
                    graphdata.append({"execs_per_sec": execs_per_sec, "parallel_fuzzers": parallel_fuzzers, "afl_persistent_config": afl_persistent_config, "afl_system_config": afl_system_config, "label": "Multicore: Persistent mode/shared memory + kernel config"})
                if label == "shmem-multicore":
                    graphdata.append({"execs_per_sec": execs_per_sec, "parallel_fuzzers": parallel_fuzzers, "afl_persistent_config": afl_persistent_config, "afl_system_config": afl_system_config, "label": "Multicore: Persistent mode/shared memory without kernel config"})
                if label == "base-multicore+persist-conf+system-conf":
                    graphdata.append({"execs_per_sec": execs_per_sec, "parallel_fuzzers": parallel_fuzzers, "afl_persistent_config": afl_persistent_config, "afl_system_config": afl_system_config, "label": "Multicore: Non-persistent mode + kernel config"})
                # Singlecore results are a single data point, so repeat them across
                # the x-axis to draw a horizontal baseline.
                if label == "shmem-singlecore+persist-conf+system-conf":
                    for i in range(1, max_fuzzers + 1):
                        graphdata.append({"execs_per_sec": execs_per_sec, "parallel_fuzzers": float(i), "afl_persistent_config": afl_persistent_config, "afl_system_config": afl_system_config, "label": "Singlecore: Persistent mode/shared memory + kernel config"})
                if label == "base-singlecore+persist-conf+system-conf":
                    for i in range(1, max_fuzzers + 1):
                        graphdata.append({"execs_per_sec": execs_per_sec, "parallel_fuzzers": float(i), "afl_persistent_config": afl_persistent_config, "afl_system_config": afl_system_config, "label": "Singlecore: Non-persistent mode + kernel config"})
    return pd.DataFrame.from_records(graphdata).sort_values("label", ascending=False)


graphdf = build_graphdf_from_query(i7)
```
```python
import numpy as np

pd.options.plotting.backend = "plotly"

# Right now our table has absolute values of execs per sec, but it's more useful
# to show relative perf (vs 1.0x baseline)
pivotdf = graphdf.pivot(index="parallel_fuzzers", columns="label", values="execs_per_sec")
fig = pivotdf.plot(
    title="Fuzzer performance",
    labels={
        "label": "Configuration",
        "parallel_fuzzers": "Number of parallel fuzzers",
        "value": "Fuzz target executions per second",
    },
)
# Compute tick values and their labels for the primary Y-axis
tickvals = np.linspace(graphdf["execs_per_sec"].min(), graphdf["execs_per_sec"].max(), 6)
ticktext = [f"{val:.0f}x" for val in tickvals / graphdf["execs_per_sec"].min()]
# Update the primary Y-axis with custom tick labels
fig.update_yaxes(tickvals=tickvals, ticktext=ticktext)
fig.update_xaxes(tickvals=list(range(1, 36 + 1)))
fig.update_layout(width=1200, height=400)
fig.show("svg")
```
Here's what the table that produced this graph looks like:
```python
pivotdf
```
parallel_fuzzers | Multicore: Non-persistent mode + kernel config | Multicore: Persistent mode/shared memory + kernel config | Multicore: Persistent mode/shared memory without kernel config | Singlecore: Non-persistent mode + kernel config | Singlecore: Persistent mode/shared memory + kernel config |
---|---|---|---|---|---|
1.0 | 10714.79 | 120293.77 | 90641.62 | 9845.64 | 125682.73 |
2.0 | 20493.07 | 231429.96 | 178184.19 | 9845.64 | 125682.73 |
3.0 | 29660.06 | 346759.33 | 262652.86 | 9845.64 | 125682.73 |
4.0 | 37875.57 | 455340.06 | 339119.32 | 9845.64 | 125682.73 |
5.0 | 46326.75 | 568405.15 | 420239.94 | 9845.64 | 125682.73 |
6.0 | 54595.48 | 678030.96 | 498062.02 | 9845.64 | 125682.73 |
7.0 | 62720.98 | 782585.04 | 578495.44 | 9845.64 | 125682.73 |
8.0 | 70777.99 | 893618.35 | 661836.22 | 9845.64 | 125682.73 |
9.0 | 74236.02 | 956026.15 | 684808.49 | 9845.64 | 125682.73 |
10.0 | 78134.94 | 984942.13 | 707094.65 | 9845.64 | 125682.73 |
11.0 | 81886.33 | 1016758.62 | 732106.17 | 9845.64 | 125682.73 |
12.0 | 85923.44 | 1053087.90 | 752910.17 | 9845.64 | 125682.73 |
13.0 | 89696.95 | 1085797.87 | 776179.85 | 9845.64 | 125682.73 |
14.0 | 93540.52 | 1110640.20 | 797520.58 | 9845.64 | 125682.73 |
15.0 | 97641.51 | 1138984.22 | 822235.41 | 9845.64 | 125682.73 |
16.0 | 101692.65 | 1168943.19 | 843897.51 | 9845.64 | 125682.73 |
17.0 | 101236.75 | 1135093.91 | 843177.15 | 9845.64 | 125682.73 |
18.0 | 101006.28 | 1160430.45 | 844779.09 | 9845.64 | 125682.73 |
19.0 | 99952.26 | 1155769.97 | 846060.74 | 9845.64 | 125682.73 |
20.0 | 99798.64 | 1150156.26 | 847556.23 | 9845.64 | 125682.73 |
21.0 | 99018.86 | 1136873.58 | 844022.97 | 9845.64 | 125682.73 |
22.0 | 98600.87 | 1112404.25 | 845818.70 | 9845.64 | 125682.73 |
23.0 | 98634.02 | 1143131.72 | 844118.27 | 9845.64 | 125682.73 |
24.0 | 98352.90 | 1143931.38 | 837189.02 | 9845.64 | 125682.73 |
25.0 | 98118.63 | 1102090.61 | 834712.31 | 9845.64 | 125682.73 |
26.0 | 97752.45 | 1116518.70 | 836344.12 | 9845.64 | 125682.73 |
27.0 | 97864.07 | 1099224.19 | 827784.91 | 9845.64 | 125682.73 |
28.0 | 97821.80 | 1114945.37 | 828641.27 | 9845.64 | 125682.73 |
29.0 | 97564.87 | 1110889.91 | 826123.67 | 9845.64 | 125682.73 |
30.0 | 98508.10 | 1058548.28 | 817765.77 | 9845.64 | 125682.73 |
31.0 | 98238.96 | 1119804.85 | 816556.66 | 9845.64 | 125682.73 |
32.0 | 98363.93 | 1118828.99 | 812661.77 | 9845.64 | 125682.73 |
33.0 | 96758.69 | 1093426.61 | 805352.16 | 9845.64 | 125682.73 |
34.0 | 96327.00 | 1108123.59 | 815888.26 | 9845.64 | 125682.73 |
35.0 | 95913.98 | 1041486.52 | 812348.56 | 9845.64 | 125682.73 |
36.0 | 95871.39 | 1092395.61 | 817278.03 | 9845.64 | 125682.73 |
You can totally ignore the code cell directly below (unless you're curious). It's just preparing Markdown for the block below it to render. Jupyter Notebooks aren't able to use code variables inside Markdown blocks, so I have to do this instead.
```python
# (You can ignore reading this code cell.)
from IPython.display import Markdown as md

singlecore_base_execs = pivotdf.iloc[0]["Singlecore: Non-persistent mode + kernel config"]
singlecore_persist_execs = pivotdf.iloc[0]["Singlecore: Persistent mode/shared memory + kernel config"]
multicore_fuzzers_max_execs = int(pivotdf["Multicore: Persistent mode/shared memory + kernel config"].idxmax())
multicore_base_max_execs = pivotdf["Multicore: Non-persistent mode + kernel config"].max()
factor_for_execs = lambda execs: round(execs / singlecore_base_execs, 1)

multicore_persistent_without_mitigations_label = "Multicore: Persistent mode/shared memory + kernel config"
multicore_max_execs_mitigations_off = pivotdf[multicore_persistent_without_mitigations_label].max()
multicore_max_execs_mitigations_off_only_cores = pivotdf.loc[multicore_fuzzers_max_execs / 2][multicore_persistent_without_mitigations_label]
multicore_max_execs_mitigations_on = pivotdf["Multicore: Persistent mode/shared memory without kernel config"].max()
multicore_avg_gain_per_core = pivotdf.loc[pivotdf.index <= 8]["Multicore: Persistent mode/shared memory + kernel config"].diff().dropna().mean()
mitigations_off_increase = int(multicore_max_execs_mitigations_off - multicore_max_execs_mitigations_on)

md(f"""
### Line graph analysis

Here are a few things that jump out from the graph above. Let's start at the bottom of the graph.

#### test-instr vs. test-instr-persist-shmem

This graph is scaled so that the single-core, non-persistent-mode performance ({int(singlecore_base_execs)} execs per second) is represented as **1.0x**. If you build and run a fuzzer without creating a persistent mode harness for it, and without running fuzzers in parallel, this is the performance you get on this machine.

#### Multicore test-instr

By running as many parallel fuzzers as there are CPU threads, we can reach {int(multicore_base_max_execs)} execs per second, which is **{factor_for_execs(multicore_base_max_execs)}x** that base speed.

#### Persistent mode + shared memory

##### Singlecore

By modifying the harness to use persistent mode with shared memory as described [here](https://github.com/AFLplusplus/AFLplusplus/blob/stable/instrumentation/README.persistent_mode.md#4-persistent-mode), we end up with **{factor_for_execs(singlecore_persist_execs)}x** base speed. So -- perhaps counter-intuitively -- if you have a choice between switching to multiple cores or rewriting the harness to use persistent mode on a single core, it is better (at least on this machine) to use persistent mode on a single core than non-persistent mode on all cores.

##### Multicore

By scaling up that persistent mode with shared memory harness across cores, and with kernel mitigations still turned on (see next section), we get to **{factor_for_execs(multicore_max_execs_mitigations_on)}x** base speed.

#### Kernel config

By "kernel config", I'm referring to booting the Linux kernel with `mitigations=off`, a meta-parameter introduced in Linux v5.2 that disables *all* hardware vulnerability mitigations (for Spectre, Meltdown, Retbleed, and so on). Disabling these results in an `execs_per_sec` increase of {mitigations_off_increase} execs -- the difference between {factor_for_execs(multicore_max_execs_mitigations_off)}x (mitigations off) and {factor_for_execs(multicore_max_execs_mitigations_on)}x (mitigations on) base speed. Turning on mitigations reduced the overall performance by {abs(round(((multicore_max_execs_mitigations_on - multicore_max_execs_mitigations_off) / multicore_max_execs_mitigations_off) * 100))}%!

One way to think about this is that the mitigations turn this 16-thread CPU into a 7-thread CPU, since the number of execs reached with 16 threads and mitigations on is around the same number of execs reached with 7 threads and mitigations off.

Or if we want to think in terms of cores, then the average number of execs gained per core in the initial eight is {int(multicore_avg_gain_per_core)} execs per sec, but the loss due to mitigations is {mitigations_off_increase} execs per sec, which is the averaged performance of {round(mitigations_off_increase / multicore_avg_gain_per_core, 1)} cores.

With kernel mitigations turned off, we reach our highest available execs_per_sec speed on this machine, which is **{factor_for_execs(multicore_max_execs_mitigations_off)}x** the speed we started from.

#### How many parallel fuzzers should we use on this machine?

* Using >16 is worse than using 16. Makes sense.
* So, we should use the number of CPUs in /proc/cpuinfo (threads) to get the best performance. But if we did halve the number of fuzzers, we would surprisingly only lose {abs(int(((multicore_max_execs_mitigations_off_only_cores - multicore_max_execs_mitigations_off) / multicore_max_execs_mitigations_off) * 100))}% of performance. This could be a good tradeoff in terms of cost.
""")
```
### Line graph analysis
Here are a few things that jump out from the graph above. Let's start at the bottom of the graph.
#### test-instr vs. test-instr-persist-shmem
This graph is scaled so that the single-core, non-persistent-mode performance (9845 execs per second) is represented as **1.0x**. If you build and run a fuzzer without creating a persistent mode harness for it, and without running fuzzers in parallel, this is the performance you get on this machine.
#### Multicore test-instr
By running as many parallel fuzzers as there are CPU threads, we can reach 101692 execs per second, which is **10.3x** that base speed.
#### Persistent mode + shared memory
##### Singlecore
By modifying the harness to use persistent mode with shared memory as described [here](https://github.com/AFLplusplus/AFLplusplus/blob/stable/instrumentation/README.persistent_mode.md#4-persistent-mode), we end up with **12.8x** base speed. So -- perhaps counter-intuitively -- if you have a choice between switching to multiple cores or rewriting the harness to use persistent mode on a single core, it is better (at least on this machine) to use persistent mode on a single core than non-persistent mode on all cores.
##### Multicore
By scaling up that persistent mode with shared memory harness across cores, and with kernel mitigations still turned on (see next section), we get to **86.1x** base speed.
#### Kernel config
By "kernel config", I'm referring to booting the Linux kernel with mitigations=off
, which is a meta-parameter for disabling all hardware vulnerability meltdowns (such as Spectre,
Meltdown, Retbleed, etc) introduced in Linux v5.2. Disabling these results in a execs_per_sec
increase of 321386 execs -- the difference between
118.7x (mitigations off) and 86.1x (mitigations on) base speed. Turning on mitigations
reduced the overall performance by 27%!
One way to think about this is that the mitigations turn this 16-thread CPU into a 7-thread CPU, since the number of execs reached with 16 threads and mitigations on is around the same number of execs reached with 7 threads and mitigations off.
Or if we want to think in terms of cores, then the average number of execs gained per core in the initial eight is 110474 execs per sec, but the loss due to mitigations is 321386 execs per sec, which is the averaged performance of 2.9 cores.
With kernel mitigations turned off, we reach our highest available execs_per_sec speed on this machine, which is **118.7x** the speed we started from.
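(Aside: you can check whether a running kernel has these mitigations enabled without rebooting, because modern Linux exposes their status through sysfs. Here's a minimal illustrative check -- this cell is not part of the original benchmark code:)

```python
# Illustrative only (not part of the benchmark): print the kernel's mitigation
# status per vulnerability. With mitigations=off most entries read "Vulnerable";
# with default settings they read "Mitigation: ..." details instead.
from pathlib import Path

for entry in sorted(Path("/sys/devices/system/cpu/vulnerabilities").iterdir()):
    print(f"{entry.name}: {entry.read_text().strip()}")
```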
#### How many parallel fuzzers should we use on this machine?
- Using >16 is worse than using 16. Makes sense.
- So, we should use the number of CPUs in /proc/cpuinfo (threads) to get the best performance (see the sketch below). But if we did halve the number of fuzzers, we would surprisingly only lose 23% of performance. This could be a good tradeoff in terms of cost.
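To make the first bullet concrete, here's a minimal sketch of picking that default programmatically. It assumes `os.cpu_count()` is an acceptable stand-in for counting /proc/cpuinfo processor entries (it reports the same hardware-thread count), and it isn't part of the benchmark code:

```python
# Illustrative only: default the number of parallel fuzzers to the hardware
# thread count, the same number of processors that /proc/cpuinfo lists.
import os

def default_fuzzer_count() -> int:
    return os.cpu_count() or 1

print(default_fuzzer_count())  # prints 16 on the i9-9900K machine used above
```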
## Example with more cores
While there was some nuance here, the answer was pretty straightforward -- use the number of CPU threads you have access to. What if there were more threads? Here the experiment is repeated on an AWS EC2 "r6a.48xlarge" spot instance with 192 vCPUs:
```python
r6a = df.query("`config.comment` == 'AWS EC2 r6a.48xlarge spot instance'")
r6a.head(2).dropna(axis=1)
```
| | config.afl_persistent_config | config.afl_system_config | config.afl_version | config.comment | config.compiler | config.target_arch | hardware.cpu_fastest_core_mhz | hardware.cpu_model | hardware.cpu_threads | targets.test-instr-persist-shmem.multicore.execs_per_sec | targets.test-instr-persist-shmem.multicore.execs_total | targets.test-instr-persist-shmem.multicore.fuzzers_used |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
223 | True | True | ++4.09a | AWS EC2 r6a.48xlarge spot instance | clang version 15.0.7 (Amazon Linux 15.0.7-3.am... | x86_64-amazon-linux-gnu | 3514.326 | AMD EPYC 7R13 Processor | 192 | 119469.35 | 1194813.0 | 1.0 |
224 | True | True | ++4.09a | AWS EC2 r6a.48xlarge spot instance | clang version 15.0.7 (Amazon Linux 15.0.7-3.am... | x86_64-amazon-linux-gnu | 3599.748 | AMD EPYC 7R13 Processor | 192 | 237177.20 | 2372250.0 | 2.0 |
```python
r6a_graphdf = build_graphdf_from_query(r6a)
r6a_graphdf.head(2)
```
| | execs_per_sec | parallel_fuzzers | afl_persistent_config | afl_system_config | label |
|---|---|---|---|---|---|
0 | 119469.35 | 1.0 | True | True | Multicore: Persistent mode/shared memory + ker... |
1 | 237177.20 | 2.0 | True | True | Multicore: Persistent mode/shared memory + ker... |
```python
r6a_pivotdf = r6a_graphdf.pivot(index="parallel_fuzzers", columns="label", values="execs_per_sec")
r6a_fig = r6a_pivotdf.plot(
    title="Fuzzer performance",
    labels={
        "label": "Configuration",
        "parallel_fuzzers": "Number of parallel fuzzers",
        "value": "Fuzz target executions per second",
    },
)
# Compute tick values and their labels for the primary Y-axis.
# Note: the labels divide by graphdf (the i9-9900K dataset), not r6a_graphdf,
# so the "x" factors stay relative to the same 1.0x baseline as the first graph.
tickvals = np.linspace(r6a_graphdf["execs_per_sec"].min(), r6a_graphdf["execs_per_sec"].max(), 6)
ticktext = [f"{val:.0f}x" for val in tickvals / graphdf["execs_per_sec"].min()]
# Update the primary Y-axis with custom tick labels
r6a_fig.update_yaxes(tickvals=tickvals, ticktext=ticktext)
r6a_fig.update_xaxes(tickvals=list(range(0, 200 + 1, 4)))
r6a_fig.update_layout(width=1200, height=400)
r6a_fig.show("svg")
```
### Line graph analysis
This is a shocking result for a 192 vCPU machine -- our optimal number of parallel fuzzers was 16! Using 32 parallel fuzzers gives less performance than using 8 fuzzers. Using 192 parallel fuzzers (the physical number of threads in this machine) gives the same performance as using 4 fuzzers.
This is clearly a cautionary tale about measuring before simply using the number of hardware threads in your machine. But does this mean that AFL++ is a bad fuzzer, or that AWS tricked us and gave us a 16-thread machine instead of a 192-thread one?
No, probably not -- the most likely cause here (other than a horrible bug) is that we're already saturating the Linux kernel's ability to service system calls, although we're definitely hitting such a limit far earlier than I expected. A good way to test this theory would be to run more system-call servicers (read: kernels!) at once on this machine; one way to do that is to use hardware virtualization with KVM.
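Until then, the practical takeaway is to measure rather than assume. One cheap way to compare candidate fuzzer counts: afl-fuzz writes an `execs_per_sec` line into each instance's `fuzzer_stats` file, so summing those gives the aggregate throughput of a campaign. A rough sketch, assuming a conventional shared `-o out/` sync directory -- this helper is illustrative, not the project's actual benchmarking harness:

```python
# Illustrative helper, not part of the project's benchmark harness: sum the
# execs_per_sec reported in each afl-fuzz instance's fuzzer_stats file.
from pathlib import Path

def aggregate_execs_per_sec(output_dir: str) -> float:
    total = 0.0
    for stats_file in Path(output_dir).glob("*/fuzzer_stats"):
        for line in stats_file.read_text().splitlines():
            key, _, value = line.partition(":")
            if key.strip() == "execs_per_sec":
                total += float(value.strip())
    return total

# e.g. after running N fuzzers with afl-fuzz -o out -M main / -S s01 ...:
# print(aggregate_execs_per_sec("out"))
```

Running this after each sweep (4, 8, 16, 32, ... parallel fuzzers) would have revealed the 16-fuzzer plateau on this machine before committing 192 vCPUs to it.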