whisper.cpp/examples/bench/bench.cpp

#include "whisper.h"

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <string>
#include <thread>

// command-line parameters
struct whisper_params {
    // Number of threads to use: the hardware thread count capped at 4.
    // std::thread::hardware_concurrency() may return 0 when the count cannot be
    // determined, so the default is clamped to at least 1.
    int32_t n_threads = std::max<int32_t>(1, std::min<int32_t>(4, (int32_t) std::thread::hardware_concurrency()));

    // what to benchmark: 0 - whisper (encoder + decoder), 1 - memcpy, 2 - ggml_mul_mat
    int32_t what = 0;

    // path of the model file loaded by the whisper benchmark
    std::string model = "models/ggml-base.en.bin";
};

void whisper_print_usage(int argc, char ** argv, const whisper_params & params);

// Parses the command line (argv[1] .. argv[argc - 1]) into `params`.
//
// Recognized options: -h/--help, -t/--threads N, -m/--model FNAME, -w/--what N.
//
// Returns true when every argument was consumed successfully. Returns false,
// after printing an error and the usage text to stderr, when an option is
// missing its value or an integer value cannot be parsed.
// -h/--help and unknown arguments print the usage text and terminate the
// process via exit(0).
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        const std::string arg = argv[i];

        if (arg == "-h" || arg == "--help") {
            whisper_print_usage(argc, argv, params);
            exit(0);
        }

        const bool is_threads = arg == "-t" || arg == "--threads";
        const bool is_model   = arg == "-m" || arg == "--model";
        const bool is_what    = arg == "-w" || arg == "--what";

        if (!is_threads && !is_model && !is_what) {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
            exit(0); // exit status left unchanged for existing callers
        }

        // All remaining options take a value. Check that one is present before
        // reading it: argv[argc] is a null pointer and must not be dereferenced.
        if (i + 1 >= argc) {
            fprintf(stderr, "error: missing value for argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
            return false;
        }

        const std::string value = argv[++i];

        if (is_model) {
            params.model = value;
            continue;
        }

        // std::stoi throws on non-numeric or out-of-range input; report the
        // problem instead of letting the exception terminate the program.
        try {
            const int32_t number = std::stoi(value);
            if (is_threads) {
                params.n_threads = number;
            } else {
                params.what = number;
            }
        } catch (const std::exception &) {
            fprintf(stderr, "error: invalid value for argument %s: %s\n", arg.c_str(), value.c_str());
            whisper_print_usage(argc, argv, params);
            return false;
        }
    }

    return true;
}

// Prints the option summary to stderr. The bracketed column shows the value
// currently stored in `params` for each option; argv[0] is used as the program name.
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
    FILE * const out = stderr;

    const char * const program = argv[0];
    const char * const model   = params.model.c_str();
    const char * const blank   = ""; // fills the value column on continuation lines

    // header
    fprintf(out, "\n");
    fprintf(out, "usage: %s [options]\n", program);
    fprintf(out, "\n");

    // one line per option
    fprintf(out, "options:\n");
    fprintf(out, "  -h,       --help        [default] show this help message and exit\n");
    fprintf(out, "  -t N,     --threads N   [%-7d] number of threads to use during computation\n", params.n_threads);
    fprintf(out, "  -m FNAME, --model FNAME [%-7s] model path\n", model);
    fprintf(out, "  -w N,     --what N      [%-7d] what to benchmark:\n", params.what);

    // accepted values for --what
    fprintf(out, "                           %-7s  0 - whisper\n", blank);
    fprintf(out, "                           %-7s  1 - memcpy\n", blank);
    fprintf(out, "                           %-7s  2 - ggml_mul_mat\n", blank);
    fprintf(out, "\n");
}

// Runs the warm-up and the timed encoder/decoder passes on an already
// initialized context. Does not free `ctx`.
//
// Returns 0 on success, 3 if the mel input could not be set, and 4 if an
// encode or decode call reported an error.
static int whisper_bench_full_run(struct whisper_context * ctx, int32_t n_threads) {
    if (const int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {
        fprintf(stderr, "error: failed to set mel: %d\n", ret);
        return 3;
    }

    // heat encoder
    // NOTE: `ret` holds the call's own return value (not the result of a
    // comparison), so the real error code is reported.
    if (const int ret = whisper_encode(ctx, 0, n_threads)) {
        fprintf(stderr, "error: failed to encode model: %d\n", ret);
        return 4;
    }

    // zero-initialized token buffer used as decoder input
    whisper_token tokens[512] = {};

    // prompt heat
    if (const int ret = whisper_decode(ctx, tokens, 256, 0, n_threads)) {
        fprintf(stderr, "error: failed to decode: %d\n", ret);
        return 4;
    }

    // text-generation heat
    if (const int ret = whisper_decode(ctx, tokens, 1, 256, n_threads)) {
        fprintf(stderr, "error: failed to decode: %d\n", ret);
        return 4;
    }

    // discard the timings accumulated by the warm-up passes above
    whisper_reset_timings(ctx);

    // actual run
    if (const int ret = whisper_encode(ctx, 0, n_threads)) {
        fprintf(stderr, "error: failed to encode model: %d\n", ret);
        return 4;
    }

    for (int i = 0; i < 16; i++) {
        if (const int ret = whisper_decode(ctx, tokens, 256, 0, n_threads)) {
            fprintf(stderr, "error: failed to decode: %d\n", ret);
            return 4;
        }
    }

    for (int i = 0; i < 256; i++) {
        if (const int ret = whisper_decode(ctx, tokens, 1, i, n_threads)) {
            fprintf(stderr, "error: failed to decode: %d\n", ret);
            return 4;
        }
    }

    return 0;
}

// Benchmarks the whisper encoder and decoder using the model at params.model
// and params.n_threads threads, then prints the timings to stderr.
//
// Returns 0 on success, 2 if the context could not be created, 3 if the mel
// input could not be set, and 4 if an encode or decode call failed. The
// context is released on every path after it has been created.
int whisper_bench_full(const whisper_params & params) {
    // whisper init

    struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());

    {
        fprintf(stderr, "\n");
        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", params.n_threads, (int) std::thread::hardware_concurrency(), whisper_print_system_info());
    }

    if (ctx == nullptr) {
        fprintf(stderr, "error: failed to initialize whisper context\n");
        return 2;
    }

    const int status = whisper_bench_full_run(ctx, params.n_threads);

    if (status == 0) {
        whisper_print_timings(ctx);
    }

    // release the context on both the success and the failure path
    whisper_free(ctx);

    if (status != 0) {
        return status;
    }

    fprintf(stderr, "\n");
    fprintf(stderr, "If you wish, you can submit these results here:\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  https://github.com/ggerganov/whisper.cpp/issues/89\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "Please include the following information:\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  - CPU model\n");
    fprintf(stderr, "  - Operating system\n");
    fprintf(stderr, "  - Compiler\n");
    fprintf(stderr, "\n");

    return 0;
}

// Entry point: parses the command line and runs the benchmark selected by
// params.what. Returns 1 if argument parsing fails, -1 for an unknown
// benchmark id, and otherwise the selected benchmark's return value.
int main(int argc, char ** argv) {
    whisper_params params;

    if (!whisper_params_parse(argc, argv, params)) {
        return 1;
    }

    const int32_t selected = params.what;

    if (selected == 0) {
        return whisper_bench_full(params);
    }
    if (selected == 1) {
        return whisper_bench_memcpy(params.n_threads);
    }
    if (selected == 2) {
        return whisper_bench_ggml_mul_mat(params.n_threads);
    }

    fprintf(stderr, "error: unknown benchmark: %d\n", params.what);
    return -1;
}
refactoring : move main + stream in examples + other stuff 2022-10-25 16:13:08 +00:00			`#include "whisper.h"`

			`#include <cstdio>`
			`#include <string>`
			`#include <thread>`

			`// command-line parameters`
			`struct whisper_params {`
refactoring : more readable code 2022-11-25 17:08:51 +00:00			`int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());`
bench : add memcpy and ggml_mul_mat benchmarks 2023-01-18 18:31:46 +00:00			`int32_t what = 0; // what to benchmark: 0 - whisper encoder, 1 - memcpy, 2 - ggml_mul_mat`
refactoring : move main + stream in examples + other stuff 2022-10-25 16:13:08 +00:00
refactoring : more readable code 2022-11-25 17:08:51 +00:00			`std::string model = "models/ggml-base.en.bin";`
refactoring : move main + stream in examples + other stuff 2022-10-25 16:13:08 +00:00			`};`

			`void whisper_print_usage(int argc, char ** argv, const whisper_params & params);`

			`bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {`
			`for (int i = 1; i < argc; i++) {`
			`std::string arg = argv[i];`

refactoring : more readable code 2022-11-25 17:08:51 +00:00			`if (arg == "-h" \|\| arg == "--help") {`
refactoring : move main + stream in examples + other stuff 2022-10-25 16:13:08 +00:00			`whisper_print_usage(argc, argv, params);`
			`exit(0);`
refactoring : more readable code 2022-11-25 17:08:51 +00:00			`}`
			`else if (arg == "-t" \|\| arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }`
			`else if (arg == "-m" \|\| arg == "--model") { params.model = argv[++i]; }`
bench : add memcpy and ggml_mul_mat benchmarks 2023-01-18 18:31:46 +00:00			`else if (arg == "-w" \|\| arg == "--what") { params.what = atoi(argv[++i]); }`
refactoring : more readable code 2022-11-25 17:08:51 +00:00			`else {`
refactoring : move main + stream in examples + other stuff 2022-10-25 16:13:08 +00:00			`fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());`
			`whisper_print_usage(argc, argv, params);`
			`exit(0);`
			`}`
			`}`

			`return true;`
			`}`

cmake : enable and fix -Wall -Wextra -Wpedantic C++ warnings 2022-12-19 18:45:08 +00:00			`void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {`
refactoring : move main + stream in examples + other stuff 2022-10-25 16:13:08 +00:00			`fprintf(stderr, "\n");`
			`fprintf(stderr, "usage: %s [options]\n", argv[0]);`
			`fprintf(stderr, "\n");`
			`fprintf(stderr, "options:\n");`
refactoring : more readable code 2022-11-25 17:08:51 +00:00			`fprintf(stderr, " -h, --help [default] show this help message and exit\n");`
			`fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);`
			`fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());`
bench : add memcpy and ggml_mul_mat benchmarks 2023-01-18 18:31:46 +00:00			`fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);`
whisper : Metal and ggml-alloc support (#1270) * metal : init * whisper : factor out graph builds * whisper : allocate encoder and decoder using ggml-alloc * whisper : ggml-alloc is now supported * whisper : CoreML support ggml-alloc * build : fix ggml-alloc * ios : update submodule * extra : update sync-ggml.sh script to also sync ggml-alloc * ci : see if this is causing the crash * whisper : refactor ggml-alloc init * whisper.android : try to fix build * whisper : initial Metal version * ci : try to debug vmem issue * metal : decoder works on GPU! * metal : add multi-decoder support * ggml : fix ggml_nbytes (probably temp solution) * metal : run "cross" step on the GPU * whisper : remove ggml_repeat in the encoder * whisper : offload the Encoder to Metal * ggml : use simpler ggml_bytes() implementation * ggml-alloc : try to make CI happy by reducing vram to 128GB * whisper : add whisper_allocr to wrap ggml_allocr * whisper : factor out alloc init in a function * cmake : update to support Metal build * whisper : add <functional> header * objc : fix build (no Metal yet) * ios : add Metal support * swiftui : fix build * metal : speed-up KQ multiplication * metal : sync latest llama.cpp kernels * readme : add Metal info * ios : update submodule * coreml : add code to toggle Core ML config (CPU, ANE, GPU) * bench : fix timings by running a pre-heat * bench : start benching the decoder * whisper : add ggml_mul_mat_pad * bench : fix uninitialized vars * whisper : add comment for disabling mul-mat padding * whisper : add description of ggml_mul_mat_pad * whisper : clean-up ggml_mul_mat_pad * metal : remove the "concurrent" flag * bench : variable n_past * ios : update SPM package 2023-09-15 09:18:18 +00:00			`fprintf(stderr, " %-7s 0 - whisper\n", "");`
bench : add memcpy and ggml_mul_mat benchmarks 2023-01-18 18:31:46 +00:00			`fprintf(stderr, " %-7s 1 - memcpy\n", "");`
			`fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");`
refactoring : move main + stream in examples + other stuff 2022-10-25 16:13:08 +00:00			`fprintf(stderr, "\n");`
			`}`

whisper : Metal and ggml-alloc support (#1270) * metal : init * whisper : factor out graph builds * whisper : allocate encoder and decoder using ggml-alloc * whisper : ggml-alloc is now supported * whisper : CoreML support ggml-alloc * build : fix ggml-alloc * ios : update submodule * extra : update sync-ggml.sh script to also sync ggml-alloc * ci : see if this is causing the crash * whisper : refactor ggml-alloc init * whisper.android : try to fix build * whisper : initial Metal version * ci : try to debug vmem issue * metal : decoder works on GPU! * metal : add multi-decoder support * ggml : fix ggml_nbytes (probably temp solution) * metal : run "cross" step on the GPU * whisper : remove ggml_repeat in the encoder * whisper : offload the Encoder to Metal * ggml : use simpler ggml_bytes() implementation * ggml-alloc : try to make CI happy by reducing vram to 128GB * whisper : add whisper_allocr to wrap ggml_allocr * whisper : factor out alloc init in a function * cmake : update to support Metal build * whisper : add <functional> header * objc : fix build (no Metal yet) * ios : add Metal support * swiftui : fix build * metal : speed-up KQ multiplication * metal : sync latest llama.cpp kernels * readme : add Metal info * ios : update submodule * coreml : add code to toggle Core ML config (CPU, ANE, GPU) * bench : fix timings by running a pre-heat * bench : start benching the decoder * whisper : add ggml_mul_mat_pad * bench : fix uninitialized vars * whisper : add comment for disabling mul-mat padding * whisper : add description of ggml_mul_mat_pad * whisper : clean-up ggml_mul_mat_pad * metal : remove the "concurrent" flag * bench : variable n_past * ios : update SPM package 2023-09-15 09:18:18 +00:00			`int whisper_bench_full(const whisper_params & params) {`
refactoring : move main + stream in examples + other stuff 2022-10-25 16:13:08 +00:00			`// whisper init`

whisper : add loader class to allow loading from buffer and others (#353) * whisper : add loader to allow loading from other than file * whisper : rename whisper_init to whisper_init_from_file * whisper : add whisper_init_from_buffer * android : Delete local.properties * android : load models directly from assets * whisper : adding <stddef.h> needed for size_t + code style Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-01-08 11:03:33 +00:00			`struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());`
refactoring : move main + stream in examples + other stuff 2022-10-25 16:13:08 +00:00
Print system info at start of program 2022-10-27 14:22:10 +00:00			`{`
			`fprintf(stderr, "\n");`
			`fprintf(stderr, "system_info: n_threads = %d / %d \| %s\n", params.n_threads, std::thread::hardware_concurrency(), whisper_print_system_info());`
			`}`

refactoring : move main + stream in examples + other stuff 2022-10-25 16:13:08 +00:00			`if (ctx == nullptr) {`
			`fprintf(stderr, "error: failed to initialize whisper context\n");`
			`return 2;`
			`}`

			`if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {`
			`fprintf(stderr, "error: failed to set mel: %d\n", ret);`
			`return 3;`
			`}`
whisper : Metal and ggml-alloc support (#1270) * metal : init * whisper : factor out graph builds * whisper : allocate encoder and decoder using ggml-alloc * whisper : ggml-alloc is now supported * whisper : CoreML support ggml-alloc * build : fix ggml-alloc * ios : update submodule * extra : update sync-ggml.sh script to also sync ggml-alloc * ci : see if this is causing the crash * whisper : refactor ggml-alloc init * whisper.android : try to fix build * whisper : initial Metal version * ci : try to debug vmem issue * metal : decoder works on GPU! * metal : add multi-decoder support * ggml : fix ggml_nbytes (probably temp solution) * metal : run "cross" step on the GPU * whisper : remove ggml_repeat in the encoder * whisper : offload the Encoder to Metal * ggml : use simpler ggml_bytes() implementation * ggml-alloc : try to make CI happy by reducing vram to 128GB * whisper : add whisper_allocr to wrap ggml_allocr * whisper : factor out alloc init in a function * cmake : update to support Metal build * whisper : add <functional> header * objc : fix build (no Metal yet) * ios : add Metal support * swiftui : fix build * metal : speed-up KQ multiplication * metal : sync latest llama.cpp kernels * readme : add Metal info * ios : update submodule * coreml : add code to toggle Core ML config (CPU, ANE, GPU) * bench : fix timings by running a pre-heat * bench : start benching the decoder * whisper : add ggml_mul_mat_pad * bench : fix uninitialized vars * whisper : add comment for disabling mul-mat padding * whisper : add description of ggml_mul_mat_pad * whisper : clean-up ggml_mul_mat_pad * metal : remove the "concurrent" flag * bench : variable n_past * ios : update SPM package 2023-09-15 09:18:18 +00:00			`// heat encoder`
			`if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {`
			`fprintf(stderr, "error: failed to encode model: %d\n", ret);`
			`return 4;`
			`}`

			`whisper_token tokens[512];`
			`memset(tokens, 0, sizeof(tokens));`

			`// prompt heat`
			`if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {`
			`fprintf(stderr, "error: failed to encode model: %d\n", ret);`
			`return 4;`
			`}`

			`// text-generation heat`
			`if (int ret = whisper_decode(ctx, tokens, 1, 256, params.n_threads) != 0) {`
			`fprintf(stderr, "error: failed to encode model: %d\n", ret);`
			`return 4;`
			`}`
refactoring : move main + stream in examples + other stuff 2022-10-25 16:13:08 +00:00
whisper : Metal and ggml-alloc support (#1270) * metal : init * whisper : factor out graph builds * whisper : allocate encoder and decoder using ggml-alloc * whisper : ggml-alloc is now supported * whisper : CoreML support ggml-alloc * build : fix ggml-alloc * ios : update submodule * extra : update sync-ggml.sh script to also sync ggml-alloc * ci : see if this is causing the crash * whisper : refactor ggml-alloc init * whisper.android : try to fix build * whisper : initial Metal version * ci : try to debug vmem issue * metal : decoder works on GPU! * metal : add multi-decoder support * ggml : fix ggml_nbytes (probably temp solution) * metal : run "cross" step on the GPU * whisper : remove ggml_repeat in the encoder * whisper : offload the Encoder to Metal * ggml : use simpler ggml_bytes() implementation * ggml-alloc : try to make CI happy by reducing vram to 128GB * whisper : add whisper_allocr to wrap ggml_allocr * whisper : factor out alloc init in a function * cmake : update to support Metal build * whisper : add <functional> header * objc : fix build (no Metal yet) * ios : add Metal support * swiftui : fix build * metal : speed-up KQ multiplication * metal : sync latest llama.cpp kernels * readme : add Metal info * ios : update submodule * coreml : add code to toggle Core ML config (CPU, ANE, GPU) * bench : fix timings by running a pre-heat * bench : start benching the decoder * whisper : add ggml_mul_mat_pad * bench : fix uninitialized vars * whisper : add comment for disabling mul-mat padding * whisper : add description of ggml_mul_mat_pad * whisper : clean-up ggml_mul_mat_pad * metal : remove the "concurrent" flag * bench : variable n_past * ios : update SPM package 2023-09-15 09:18:18 +00:00			`whisper_reset_timings(ctx);`

			`// actual run`
refactoring : move main + stream in examples + other stuff 2022-10-25 16:13:08 +00:00			`if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {`
			`fprintf(stderr, "error: failed to encode model: %d\n", ret);`
			`return 4;`
			`}`

whisper : Metal and ggml-alloc support (#1270) * metal : init * whisper : factor out graph builds * whisper : allocate encoder and decoder using ggml-alloc * whisper : ggml-alloc is now supported * whisper : CoreML support ggml-alloc * build : fix ggml-alloc * ios : update submodule * extra : update sync-ggml.sh script to also sync ggml-alloc * ci : see if this is causing the crash * whisper : refactor ggml-alloc init * whisper.android : try to fix build * whisper : initial Metal version * ci : try to debug vmem issue * metal : decoder works on GPU! * metal : add multi-decoder support * ggml : fix ggml_nbytes (probably temp solution) * metal : run "cross" step on the GPU * whisper : remove ggml_repeat in the encoder * whisper : offload the Encoder to Metal * ggml : use simpler ggml_bytes() implementation * ggml-alloc : try to make CI happy by reducing vram to 128GB * whisper : add whisper_allocr to wrap ggml_allocr * whisper : factor out alloc init in a function * cmake : update to support Metal build * whisper : add <functional> header * objc : fix build (no Metal yet) * ios : add Metal support * swiftui : fix build * metal : speed-up KQ multiplication * metal : sync latest llama.cpp kernels * readme : add Metal info * ios : update submodule * coreml : add code to toggle Core ML config (CPU, ANE, GPU) * bench : fix timings by running a pre-heat * bench : start benching the decoder * whisper : add ggml_mul_mat_pad * bench : fix uninitialized vars * whisper : add comment for disabling mul-mat padding * whisper : add description of ggml_mul_mat_pad * whisper : clean-up ggml_mul_mat_pad * metal : remove the "concurrent" flag * bench : variable n_past * ios : update SPM package 2023-09-15 09:18:18 +00:00			`for (int i = 0; i < 16; i++) {`
			`if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {`
			`fprintf(stderr, "error: failed to encode model: %d\n", ret);`
			`return 4;`
			`}`
			`}`

			`for (int i = 0; i < 256; i++) {`
			`if (int ret = whisper_decode(ctx, tokens, 1, i, params.n_threads) != 0) {`
			`fprintf(stderr, "error: failed to encode model: %d\n", ret);`
			`return 4;`
			`}`
			`}`

refactoring : move main + stream in examples + other stuff 2022-10-25 16:13:08 +00:00			`whisper_print_timings(ctx);`
			`whisper_free(ctx);`

ggml : add system info functions 2022-10-25 17:18:26 +00:00			`fprintf(stderr, "\n");`
			`fprintf(stderr, "If you wish, you can submit these results here:\n");`
			`fprintf(stderr, "\n");`
			`fprintf(stderr, " https://github.com/ggerganov/whisper.cpp/issues/89\n");`
			`fprintf(stderr, "\n");`
			`fprintf(stderr, "Please include the following information:\n");`
			`fprintf(stderr, "\n");`
			`fprintf(stderr, " - CPU model\n");`
			`fprintf(stderr, " - Operating system\n");`
			`fprintf(stderr, " - Compiler\n");`
			`fprintf(stderr, "\n");`

refactoring : move main + stream in examples + other stuff 2022-10-25 16:13:08 +00:00			`return 0;`
			`}`
bench : add memcpy and ggml_mul_mat benchmarks 2023-01-18 18:31:46 +00:00
			`int main(int argc, char ** argv) {`
			`whisper_params params;`

			`if (whisper_params_parse(argc, argv, params) == false) {`
			`return 1;`
			`}`

			`int ret = -1;`

			`switch (params.what) {`
whisper : Metal and ggml-alloc support (#1270) * metal : init * whisper : factor out graph builds * whisper : allocate encoder and decoder using ggml-alloc * whisper : ggml-alloc is now supported * whisper : CoreML support ggml-alloc * build : fix ggml-alloc * ios : update submodule * extra : update sync-ggml.sh script to also sync ggml-alloc * ci : see if this is causing the crash * whisper : refactor ggml-alloc init * whisper.android : try to fix build * whisper : initial Metal version * ci : try to debug vmem issue * metal : decoder works on GPU! * metal : add multi-decoder support * ggml : fix ggml_nbytes (probably temp solution) * metal : run "cross" step on the GPU * whisper : remove ggml_repeat in the encoder * whisper : offload the Encoder to Metal * ggml : use simpler ggml_bytes() implementation * ggml-alloc : try to make CI happy by reducing vram to 128GB * whisper : add whisper_allocr to wrap ggml_allocr * whisper : factor out alloc init in a function * cmake : update to support Metal build * whisper : add <functional> header * objc : fix build (no Metal yet) * ios : add Metal support * swiftui : fix build * metal : speed-up KQ multiplication * metal : sync latest llama.cpp kernels * readme : add Metal info * ios : update submodule * coreml : add code to toggle Core ML config (CPU, ANE, GPU) * bench : fix timings by running a pre-heat * bench : start benching the decoder * whisper : add ggml_mul_mat_pad * bench : fix uninitialized vars * whisper : add comment for disabling mul-mat padding * whisper : add description of ggml_mul_mat_pad * whisper : clean-up ggml_mul_mat_pad * metal : remove the "concurrent" flag * bench : variable n_past * ios : update SPM package 2023-09-15 09:18:18 +00:00			`case 0: ret = whisper_bench_full(params); break;`
bench : fix Windows linkage by moving ggml benches in whisper lib .. 2023-01-18 19:00:41 +00:00			`case 1: ret = whisper_bench_memcpy(params.n_threads); break;`
			`case 2: ret = whisper_bench_ggml_mul_mat(params.n_threads); break;`
bench : add memcpy and ggml_mul_mat benchmarks 2023-01-18 18:31:46 +00:00			`default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;`
			`}`

			`return ret;`
			`}`