whisper.cpp/extra/bench-all.sh

#!/bin/bash

# Helper script to run the bench tool on all models and print the results in share-able format
#
# Usage: ./bench-all.sh [n_threads] [encoder-only]
#
#   n_threads     value passed to `./bench -t` (default: 4)
#   encoder-only  0 (default) also runs the memcpy and ggml_mul_mat benchmarks
#                 before the per-model table; any other integer skips them
#
# Expects `./bench` and `./models/ggml-<model>.bin` relative to the current directory.

printf "Usage: ./bench-all.sh [n_threads] [encoder-only]\n"

# Fall back to the defaults when an argument is missing or empty
n_threads=${1:-4}
encoder_only=${2:-0}

# Model names; each one is expanded to ./models/ggml-<name>.bin below
models=(
      "tiny"   "tiny-q4_0"   "tiny-q4_1"   "tiny-q5_0"   "tiny-q5_1"   "tiny-q8_0"
      "base"   "base-q4_0"   "base-q4_1"   "base-q5_0"   "base-q5_1"   "base-q8_0"
     "small"  "small-q4_0"  "small-q4_1"  "small-q5_0"  "small-q5_1"  "small-q8_0"
    "medium" "medium-q4_0" "medium-q4_1" "medium-q5_0" "medium-q5_1" "medium-q8_0"
     "large"  "large-q4_0"  "large-q4_1"  "large-q5_0"  "large-q5_1"  "large-q8_0"
)

if [ "$encoder_only" -eq 0 ]; then
    printf "\n"
    printf "Running memcpy benchmark\n"
    printf "\n"

    ./bench -w 1 -t "$n_threads" 2>&1

    printf "\n"
    # pass the value as an argument so that it is never interpreted as a format string
    printf "Running ggml_mul_mat benchmark with %s threads\n" "$n_threads"
    printf "\n"

    ./bench -w 2 -t "$n_threads" 2>&1

    printf "\n"
    printf "Running benchmark for all models\n"
    printf "This can take a while!\n"
    printf "\n"
fi

printf "| %6s | %6s | %16s | %11s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "CPU" "OS" "Config" "Model" "Th" "Enc." "Dec." "Bch5" "PP" "Commit"
printf "| %6s | %6s | %16s | %11s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "---" "---" "---" "---" "---" "---" "---" "---" "---" "---"

# the commit does not change while the script is running, so look it up only once
commit=$(git rev-parse --short HEAD)

for model in "${models[@]}"; do
    # actual run
    # store the combined stdout + stderr output in a variable in order to parse it later
    output=$(./bench -m "./models/ggml-$model.bin" -t "$n_threads" 2>&1)
    ret=$?

    # only successful runs produce a table row, so there is nothing to parse otherwise
    if [ "$ret" -ne 0 ]; then
        continue
    fi

    # parse the output:
    # the 11th whitespace-separated field of each matching line is reported
    # (presumably the per-run time - confirm against the output format of ./bench)
    encode_time=$(awk '/encode time/ { print $11 }' <<< "$output")
    decode_time=$(awk '/decode time/ { print $11 }' <<< "$output")
    batchd_time=$(awk '/batchd time/ { print $11 }' <<< "$output")
    prompt_time=$(awk '/prompt time/ { print $11 }' <<< "$output")
    system_info=$(grep "system_info" <<< "$output")

    # Thread count as reported by the tool (4th field of the system_info line).
    # It is kept separate from n_threads on purpose: overwriting n_threads would
    # pass an empty value to `-t` in the following iterations whenever the
    # system_info line is missing from the output.
    threads_used=$(awk '/system_info/ { print $4 }' <<< "$output")

    # floor to milliseconds
    #encode_time=${encode_time%.*}
    #decode_time=${decode_time%.*}
    #prompt_time=${prompt_time%.*}

    # collect every feature that the system_info line reports as "<NAME> = 1"
    config=""
    for flag in AVX2 NEON BLAS COREML CUDA METAL; do
        if [[ $system_info == *"$flag = 1"* ]]; then
            config="$config $flag"
        fi
    done

    # the CPU and OS columns are placeholders to be filled in by hand
    printf "| <todo> | <todo> | %16s | %11s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "$config" "$model" "$threads_used" "$encode_time" "$decode_time" "$batchd_time" "$prompt_time" "$commit"
done
Add helper script to benchmark all models Simply run: $ ./extra/bench-all.sh 2022-10-26 20:19:58 +00:00			`#!/bin/bash`

			`# Helper script to run the bench tool on all models and print the results in share-able format`

bench : improve benchmarks 2023-05-01 11:44:39 +00:00			`printf "Usage: ./bench.sh [n_threads] [encoder-only]\n"`
Add helper script to benchmark all models Simply run: $ ./extra/bench-all.sh 2022-10-26 20:19:58 +00:00
			`if [ -z "$1" ]; then`
			`n_threads=4`
			`else`
			`n_threads=$1`
			`fi`

bench : improve benchmarks 2023-05-01 11:44:39 +00:00			`encoder_only=0`
			`if [ -z "$2" ]; then`
			`encoder_only=0`
			`else`
			`encoder_only=$2`
			`fi`
Add helper script to benchmark all models Simply run: $ ./extra/bench-all.sh 2022-10-26 20:19:58 +00:00
bench : improve benchmarks 2023-05-01 11:44:39 +00:00			`models=( \`
whisper : add full CUDA and Metal offloading (#1472) * whisper : migrate to ggml-backend * whisper : fix logit reading * whisper : fix tensor allocation during load * whisper : fix beam-search with CUDA * whisper : free backends + fix compile warning * whisper : print when CUDA is enabled * whisper : fix CoreML * make : clean-up * talk : fix compile warning * whisper : support ggml_conv with CUDA and Metal (#1473) * ggml : add CUDA support for ggml_conv * whisper : remove ggml_repeat for conv bias + single backend * cuda : fix im2col kernel * metal : add im2col support + mul mat-vec f16 x f16 * bench-all : add q4 models * whisper : clean-up * quantize-all : fix * ggml : im2col opts * whisper : avoid whisper_model_data wrapper * whisper : add note that ggml_mul_mat_pad does not work with CUDA * whisper : factor out graph compute in common function * whisper : fixes * whisper : fix UB with measure buffers * whisper : try to fix the parallel whisper_state functionality (#1479) * whisper : try to fix the parallel whisper_state functionality * whisper : fix multi-state Metal * whisper : free backend instances in whisper_state 2023-11-12 13:31:08 +00:00			`"tiny" "tiny-q4_0" "tiny-q4_1" "tiny-q5_0" "tiny-q5_1" "tiny-q8_0" \`
			`"base" "base-q4_0" "base-q4_1" "base-q5_0" "base-q5_1" "base-q8_0" \`
			`"small" "small-q4_0" "small-q4_1" "small-q5_0" "small-q5_1" "small-q8_0" \`
			`"medium" "medium-q4_0" "medium-q4_1" "medium-q5_0" "medium-q5_1" "medium-q8_0" \`
			`"large" "large-q4_0" "large-q4_1" "large-q5_0" "large-q5_1" "large-q8_0" \`
bench : improve benchmarks 2023-05-01 11:44:39 +00:00			`)`
bench : add memcpy and ggml_mul_mat benchmarks 2023-01-18 18:31:46 +00:00
bench : improve benchmarks 2023-05-01 11:44:39 +00:00			`if [ "$encoder_only" -eq 0 ]; then`
			`printf "\n"`
			`printf "Running memcpy benchmark\n"`
			`printf "\n"`
bench : add memcpy and ggml_mul_mat benchmarks 2023-01-18 18:31:46 +00:00
bench : improve benchmarks 2023-05-01 11:44:39 +00:00			`./bench -w 1 -t $n_threads 2>&1`
bench : add memcpy and ggml_mul_mat benchmarks 2023-01-18 18:31:46 +00:00
bench : improve benchmarks 2023-05-01 11:44:39 +00:00			`printf "\n"`
			`printf "Running ggml_mul_mat benchmark with $n_threads threads\n"`
			`printf "\n"`
bench : add memcpy and ggml_mul_mat benchmarks 2023-01-18 18:31:46 +00:00
bench : improve benchmarks 2023-05-01 11:44:39 +00:00			`./bench -w 2 -t $n_threads 2>&1`

			`printf "\n"`
			`printf "Running benchmark for all models\n"`
			`printf "This can take a while!\n"`
			`printf "\n"`
			`fi`
Add helper script to benchmark all models Simply run: $ ./extra/bench-all.sh 2022-10-26 20:19:58 +00:00
whisper : add batched decoding (#1486) * whisper : add whisper_batch * whisper : move kv_self to whisper_state * whisper : full batched decoding support * whisper : fix memory leak in whisper_batch * whisper : fix mem leak again + remove oboslete function * whisper : clear kv cache when using whisper_decode API * whisper : speed-up sampling * whisper : fix decoders initializer * bench : add batch size 5 bench * whisper : add comment about the KV cache size * whisper : add check for max number of decoders * whisper : avoid starting sampling threads with bs=1 * whisper : enable beam-search by default * cuda : sync llama.cpp fixes 2023-11-15 14:12:52 +00:00			`printf "\| %6s \| %6s \| %16s \| %11s \| %3s \| %7s \| %7s \| %7s \| %7s \| %7s \|\n" "CPU" "OS" "Config" "Model" "Th" "Enc." "Dec." "Bch5" "PP" "Commit"`
			`printf "\| %6s \| %6s \| %16s \| %11s \| %3s \| %7s \| %7s \| %7s \| %7s \| %7s \|\n" "---" "---" "---" "---" "---" "---" "---" "---" "---" "---"`
Add helper script to benchmark all models Simply run: $ ./extra/bench-all.sh 2022-10-26 20:19:58 +00:00
			`for model in "${models[@]}"; do`
			`# actual run`
			`# store stderr output in a variable in order to parse it later`
			`output=$(./bench -m ./models/ggml-$model.bin -t $n_threads 2>&1)`
bench : improve benchmarks 2023-05-01 11:44:39 +00:00			`ret=$?`
Add helper script to benchmark all models Simply run: $ ./extra/bench-all.sh 2022-10-26 20:19:58 +00:00
			`# parse the output:`
whisper : Metal and ggml-alloc support (#1270) * metal : init * whisper : factor out graph builds * whisper : allocate encoder and decoder using ggml-alloc * whisper : ggml-alloc is now supported * whisper : CoreML support ggml-alloc * build : fix ggml-alloc * ios : update submodule * extra : update sync-ggml.sh script to also sync ggml-alloc * ci : see if this is causing the crash * whisper : refactor ggml-alloc init * whisper.android : try to fix build * whisper : initial Metal version * ci : try to debug vmem issue * metal : decoder works on GPU! * metal : add multi-decoder support * ggml : fix ggml_nbytes (probably temp solution) * metal : run "cross" step on the GPU * whisper : remove ggml_repeat in the encoder * whisper : offload the Encoder to Metal * ggml : use simpler ggml_bytes() implementation * ggml-alloc : try to make CI happy by reducing vram to 128GB * whisper : add whisper_allocr to wrap ggml_allocr * whisper : factor out alloc init in a function * cmake : update to support Metal build * whisper : add <functional> header * objc : fix build (no Metal yet) * ios : add Metal support * swiftui : fix build * metal : speed-up KQ multiplication * metal : sync latest llama.cpp kernels * readme : add Metal info * ios : update submodule * coreml : add code to toggle Core ML config (CPU, ANE, GPU) * bench : fix timings by running a pre-heat * bench : start benching the decoder * whisper : add ggml_mul_mat_pad * bench : fix uninitialized vars * whisper : add comment for disabling mul-mat padding * whisper : add description of ggml_mul_mat_pad * whisper : clean-up ggml_mul_mat_pad * metal : remove the "concurrent" flag * bench : variable n_past * ios : update SPM package 2023-09-15 09:18:18 +00:00			`encode_time=$(echo "$output" \| grep "encode time" \| awk '{print $11}')`
			`decode_time=$(echo "$output" \| grep "decode time" \| awk '{print $11}')`
whisper : add batched decoding (#1486) * whisper : add whisper_batch * whisper : move kv_self to whisper_state * whisper : full batched decoding support * whisper : fix memory leak in whisper_batch * whisper : fix mem leak again + remove oboslete function * whisper : clear kv cache when using whisper_decode API * whisper : speed-up sampling * whisper : fix decoders initializer * bench : add batch size 5 bench * whisper : add comment about the KV cache size * whisper : add check for max number of decoders * whisper : avoid starting sampling threads with bs=1 * whisper : enable beam-search by default * cuda : sync llama.cpp fixes 2023-11-15 14:12:52 +00:00			`batchd_time=$(echo "$output" \| grep "batchd time" \| awk '{print $11}')`
whisper : Metal and ggml-alloc support (#1270) * metal : init * whisper : factor out graph builds * whisper : allocate encoder and decoder using ggml-alloc * whisper : ggml-alloc is now supported * whisper : CoreML support ggml-alloc * build : fix ggml-alloc * ios : update submodule * extra : update sync-ggml.sh script to also sync ggml-alloc * ci : see if this is causing the crash * whisper : refactor ggml-alloc init * whisper.android : try to fix build * whisper : initial Metal version * ci : try to debug vmem issue * metal : decoder works on GPU! * metal : add multi-decoder support * ggml : fix ggml_nbytes (probably temp solution) * metal : run "cross" step on the GPU * whisper : remove ggml_repeat in the encoder * whisper : offload the Encoder to Metal * ggml : use simpler ggml_bytes() implementation * ggml-alloc : try to make CI happy by reducing vram to 128GB * whisper : add whisper_allocr to wrap ggml_allocr * whisper : factor out alloc init in a function * cmake : update to support Metal build * whisper : add <functional> header * objc : fix build (no Metal yet) * ios : add Metal support * swiftui : fix build * metal : speed-up KQ multiplication * metal : sync latest llama.cpp kernels * readme : add Metal info * ios : update submodule * coreml : add code to toggle Core ML config (CPU, ANE, GPU) * bench : fix timings by running a pre-heat * bench : start benching the decoder * whisper : add ggml_mul_mat_pad * bench : fix uninitialized vars * whisper : add comment for disabling mul-mat padding * whisper : add description of ggml_mul_mat_pad * whisper : clean-up ggml_mul_mat_pad * metal : remove the "concurrent" flag * bench : variable n_past * ios : update SPM package 2023-09-15 09:18:18 +00:00			`prompt_time=$(echo "$output" \| grep "prompt time" \| awk '{print $11}')`
Add helper script to benchmark all models Simply run: $ ./extra/bench-all.sh 2022-10-26 20:19:58 +00:00			`system_info=$(echo "$output" \| grep "system_info")`
			`n_threads=$(echo "$output" \| grep "system_info" \| awk '{print $4}')`

bench : more concise representation of the results (#89) 2022-12-11 09:56:13 +00:00			`# floor to milliseconds`
whisper : Metal and ggml-alloc support (#1270) * metal : init * whisper : factor out graph builds * whisper : allocate encoder and decoder using ggml-alloc * whisper : ggml-alloc is now supported * whisper : CoreML support ggml-alloc * build : fix ggml-alloc * ios : update submodule * extra : update sync-ggml.sh script to also sync ggml-alloc * ci : see if this is causing the crash * whisper : refactor ggml-alloc init * whisper.android : try to fix build * whisper : initial Metal version * ci : try to debug vmem issue * metal : decoder works on GPU! * metal : add multi-decoder support * ggml : fix ggml_nbytes (probably temp solution) * metal : run "cross" step on the GPU * whisper : remove ggml_repeat in the encoder * whisper : offload the Encoder to Metal * ggml : use simpler ggml_bytes() implementation * ggml-alloc : try to make CI happy by reducing vram to 128GB * whisper : add whisper_allocr to wrap ggml_allocr * whisper : factor out alloc init in a function * cmake : update to support Metal build * whisper : add <functional> header * objc : fix build (no Metal yet) * ios : add Metal support * swiftui : fix build * metal : speed-up KQ multiplication * metal : sync latest llama.cpp kernels * readme : add Metal info * ios : update submodule * coreml : add code to toggle Core ML config (CPU, ANE, GPU) * bench : fix timings by running a pre-heat * bench : start benching the decoder * whisper : add ggml_mul_mat_pad * bench : fix uninitialized vars * whisper : add comment for disabling mul-mat padding * whisper : add description of ggml_mul_mat_pad * whisper : clean-up ggml_mul_mat_pad * metal : remove the "concurrent" flag * bench : variable n_past * ios : update SPM package 2023-09-15 09:18:18 +00:00			`#encode_time=${encode_time%.*}`
			`#decode_time=${decode_time%.*}`
			`#prompt_time=${prompt_time%.*}`
bench : more concise representation of the results (#89) 2022-12-11 09:56:13 +00:00
Add helper script to benchmark all models Simply run: $ ./extra/bench-all.sh 2022-10-26 20:19:58 +00:00			`config=""`

			`if [[ $system_info == "AVX2 = 1" ]]; then`
			`config="$config AVX2"`
			`fi`

			`if [[ $system_info == "NEON = 1" ]]; then`
			`config="$config NEON"`
			`fi`

			`if [[ $system_info == "BLAS = 1" ]]; then`
			`config="$config BLAS"`
			`fi`

whisper : add Core ML support (#566) * coreml : use Core ML encoder inference * coreml : simlpify whisper_encode + log messages * whisper : resolve rebase conflicts * coreml : add scripts for CoreML model generation * bench-all : recognize COREML flag 2023-04-15 10:21:27 +00:00			`if [[ $system_info == "COREML = 1" ]]; then`
			`config="$config COREML"`
			`fi`

whisper : add full CUDA and Metal offloading (#1472) * whisper : migrate to ggml-backend * whisper : fix logit reading * whisper : fix tensor allocation during load * whisper : fix beam-search with CUDA * whisper : free backends + fix compile warning * whisper : print when CUDA is enabled * whisper : fix CoreML * make : clean-up * talk : fix compile warning * whisper : support ggml_conv with CUDA and Metal (#1473) * ggml : add CUDA support for ggml_conv * whisper : remove ggml_repeat for conv bias + single backend * cuda : fix im2col kernel * metal : add im2col support + mul mat-vec f16 x f16 * bench-all : add q4 models * whisper : clean-up * quantize-all : fix * ggml : im2col opts * whisper : avoid whisper_model_data wrapper * whisper : add note that ggml_mul_mat_pad does not work with CUDA * whisper : factor out graph compute in common function * whisper : fixes * whisper : fix UB with measure buffers * whisper : try to fix the parallel whisper_state functionality (#1479) * whisper : try to fix the parallel whisper_state functionality * whisper : fix multi-state Metal * whisper : free backend instances in whisper_state 2023-11-12 13:31:08 +00:00			`if [[ $system_info == "CUDA = 1" ]]; then`
			`config="$config CUDA"`
			`fi`

metal : add F32 support + update bench output 2023-09-15 10:56:08 +00:00			`if [[ $system_info == "METAL = 1" ]]; then`
			`config="$config METAL"`
			`fi`

bench : add commit hash to bench-all.sh results 2022-12-06 16:47:48 +00:00			`commit=$(git rev-parse --short HEAD)`

bench : improve benchmarks 2023-05-01 11:44:39 +00:00			`if [ $ret -eq 0 ]; then`
whisper : add batched decoding (#1486) * whisper : add whisper_batch * whisper : move kv_self to whisper_state * whisper : full batched decoding support * whisper : fix memory leak in whisper_batch * whisper : fix mem leak again + remove oboslete function * whisper : clear kv cache when using whisper_decode API * whisper : speed-up sampling * whisper : fix decoders initializer * bench : add batch size 5 bench * whisper : add comment about the KV cache size * whisper : add check for max number of decoders * whisper : avoid starting sampling threads with bs=1 * whisper : enable beam-search by default * cuda : sync llama.cpp fixes 2023-11-15 14:12:52 +00:00			`printf "\| <todo> \| <todo> \| %16s \| %11s \| %3s \| %7s \| %7s \| %7s \| %7s \| %7s \|\n" "$config" "$model" "$n_threads" "$encode_time" "$decode_time" "$batchd_time" "$prompt_time" "$commit"`
bench : improve benchmarks 2023-05-01 11:44:39 +00:00			`fi`
Add helper script to benchmark all models Simply run: $ ./extra/bench-all.sh 2022-10-26 20:19:58 +00:00			`done`