whisper : reduce memory usage during inference (#431)

* ggml : add "scratch" buffer support * ggml : support for scratch ring-buffer * ggml : bug fix in ggml_repeat() * ggml : error on scratch buffer overflow * whisper : use scratch buffers during inference (base model only) * whisper : update memory usage for all models * whisper : fix encoder memory usage * whisper : use whisper_context functions instead of macros * whisper : fix FF + remove it from README * ggml : reuse ggml_new_i32 * ggml : refactor the scratch buffer storage * whisper : reorder scratch buffers in the decoder * main : add option to disable temp fallback * Update README.md
2025-06-17 14:28:07 +00:00 · 2023-02-04 09:45:52 +02:00
parent c306a7fd89
commit f3ee4a9673
7 changed files with 702 additions and 472 deletions
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -9,25 +9,35 @@ It can be used as a reference for using the `whisper.cpp` library in other proje
 usage: ./main [options] file0.wav file1.wav ...

 options:
-  -h,       --help          [default] show this help message and exit
-  -t N,     --threads N     [4      ] number of threads to use during computation
-  -p N,     --processors N  [1      ] number of processors to use during computation
-  -ot N,    --offset-t N    [0      ] time offset in milliseconds
-  -on N,    --offset-n N    [0      ] segment index offset
-  -d  N,    --duration N    [0      ] duration of audio to process in milliseconds
-  -mc N,    --max-context N [-1     ] maximum number of text context tokens to store
-  -ml N,    --max-len N     [0      ] maximum segment length in characters
-  -wt N,    --word-thold N  [0.01   ] word timestamp probability threshold
-  -su,      --speed-up      [false  ] speed up audio by x2 (reduced accuracy)
-  -tr,      --translate     [false  ] translate from source language to english
-  -otxt,    --output-txt    [false  ] output result in a text file
-  -ovtt,    --output-vtt    [false  ] output result in a vtt file
-  -osrt,    --output-srt    [false  ] output result in a srt file
-  -owts,    --output-words  [false  ] output script for generating karaoke video
-  -ps,      --print-special [false  ] print special tokens
-  -pc,      --print-colors  [false  ] print colors
-  -nt,      --no-timestamps [true   ] do not print timestamps
-  -l LANG,  --language LANG [en     ] spoken language
-  -m FNAME, --model FNAME   [models/ggml-base.en.bin] model path
-  -f FNAME, --file FNAME    [       ] input WAV file path
+  -h,        --help              [default] show this help message and exit
+  -t N,      --threads N         [4      ] number of threads to use during computation
+  -p N,      --processors N      [1      ] number of processors to use during computation
+  -ot N,     --offset-t N        [0      ] time offset in milliseconds
+  -on N,     --offset-n N        [0      ] segment index offset
+  -d  N,     --duration N        [0      ] duration of audio to process in milliseconds
+  -mc N,     --max-context N     [-1     ] maximum number of text context tokens to store
+  -ml N,     --max-len N         [0      ] maximum segment length in characters
+  -bo N,     --best-of N         [5      ] number of best candidates to keep
+  -bs N,     --beam-size N       [-1     ] beam size for beam search
+  -wt N,     --word-thold N      [0.01   ] word timestamp probability threshold
+  -et N,     --entropy-thold N   [2.40   ] entropy threshold for decoder fail
+  -lpt N,    --logprob-thold N   [-1.00  ] log probability threshold for decoder fail
+  -su,       --speed-up          [false  ] speed up audio by x2 (reduced accuracy)
+  -tr,       --translate         [false  ] translate from source language to english
+  -di,       --diarize           [false  ] stereo audio diarization
+  -nf,       --no-fallback       [false  ] do not use temperature fallback while decoding
+  -otxt,     --output-txt        [false  ] output result in a text file
+  -ovtt,     --output-vtt        [false  ] output result in a vtt file
+  -osrt,     --output-srt        [false  ] output result in a srt file
+  -owts,     --output-words      [false  ] output script for generating karaoke video
+  -ocsv,     --output-csv        [false  ] output result in a CSV file
+  -of FNAME, --output-file FNAME [       ] output file path (without file extension)
+  -ps,       --print-special     [false  ] print special tokens
+  -pc,       --print-colors      [false  ] print colors
+  -pp,       --print-progress    [false  ] print progress
+  -nt,       --no-timestamps     [true   ] do not print timestamps
+  -l LANG,   --language LANG     [en     ] spoken language ('auto' for auto-detect)
+             --prompt PROMPT     [       ] initial prompt
+  -m FNAME,  --model FNAME       [models/ggml-base.en.bin] model path
+  -f FNAME,  --file FNAME        [       ] input WAV file path
 ```
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -53,22 +53,23 @@ void replace_all(std::string & s, const std::string & search, const std::string
 // command-line parameters
 struct whisper_params {
    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_processors = 1;
-    int32_t offset_t_ms  = 0;
-    int32_t offset_n     = 0;
-    int32_t duration_ms  = 0;
+    int32_t n_processors =  1;
+    int32_t offset_t_ms  =  0;
+    int32_t offset_n     =  0;
+    int32_t duration_ms  =  0;
    int32_t max_context  = -1;
-    int32_t max_len      = 0;
-    int32_t best_of      = 5;
+    int32_t max_len      =  0;
+    int32_t best_of      =  5;
    int32_t beam_size    = -1;

-    float word_thold    = 0.01f;
-    float entropy_thold = 2.4f;
-    float logprob_thold = -1.0f;
+    float word_thold    =  0.01f;
+    float entropy_thold =  2.40f;
+    float logprob_thold = -1.00f;

    bool speed_up       = false;
    bool translate      = false;
    bool diarize        = false;
+    bool no_fallback    = false;
    bool output_txt     = false;
    bool output_vtt     = false;
    bool output_srt     = false;
@ -117,6 +118,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-su"   || arg == "--speed-up")       { params.speed_up       = true; }
        else if (arg == "-tr"   || arg == "--translate")      { params.translate      = true; }
        else if (arg == "-di"   || arg == "--diarize")        { params.diarize        = true; }
+        else if (arg == "-nf"   || arg == "--no-fallback")    { params.no_fallback    = true; }
        else if (arg == "-otxt" || arg == "--output-txt")     { params.output_txt     = true; }
        else if (arg == "-ovtt" || arg == "--output-vtt")     { params.output_vtt     = true; }
        else if (arg == "-osrt" || arg == "--output-srt")     { params.output_srt     = true; }
@ -162,6 +164,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -su,       --speed-up          [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
    fprintf(stderr, "  -tr,       --translate         [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
    fprintf(stderr, "  -di,       --diarize           [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
+    fprintf(stderr, "  -nf,       --no-fallback       [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
    fprintf(stderr, "  -otxt,     --output-txt        [%-7s] output result in a text file\n",                   params.output_txt ? "true" : "false");
    fprintf(stderr, "  -ovtt,     --output-vtt        [%-7s] output result in a vtt file\n",                    params.output_vtt ? "true" : "false");
    fprintf(stderr, "  -osrt,     --output-srt        [%-7s] output result in a srt file\n",                    params.output_srt ? "true" : "false");
@ -514,7 +517,7 @@ int main(int argc, char ** argv) {

    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
        const auto fname_inp = params.fname_inp[f];
-		const auto fname_outp = f < params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];
+		const auto fname_outp = f < (int) params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];

        std::vector<float> pcmf32; // mono-channel F32 PCM
        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
@ -647,17 +650,19 @@ int main(int argc, char ** argv) {

            wparams.token_timestamps = params.output_wts || params.max_len > 0;
            wparams.thold_pt         = params.word_thold;
-            wparams.entropy_thold    = params.entropy_thold;
-            wparams.logprob_thold    = params.logprob_thold;
            wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;

            wparams.speed_up         = params.speed_up;

+            wparams.prompt_tokens     = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
+            wparams.prompt_n_tokens   = prompt_tokens.empty() ? 0       : prompt_tokens.size();
+
            wparams.greedy.best_of        = params.best_of;
            wparams.beam_search.beam_size = params.beam_size;

-            wparams.prompt_tokens     = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
-            wparams.prompt_n_tokens   = prompt_tokens.empty() ? 0       : prompt_tokens.size();
+            wparams.temperature_inc  = params.no_fallback ? 0.0f : wparams.temperature_inc;
+            wparams.entropy_thold    = params.entropy_thold;
+            wparams.logprob_thold    = params.logprob_thold;

            whisper_print_user_data user_data = { &params, &pcmf32s };