whisper : add option to speed up the audio tempo by x2

This uses a Phase Vocoder to speed up the audio tempo by scaling down the frequencies in the frequency domain. This reduces the computation in the Encoder by a factor of 2. The transcription accuracy is degraded, but for slow-to-normal speech it still seems to be very good. I think this can find application in real-time transcription, e.g. the "stream" example.
2025-06-12 20:18:08 +00:00 · 2022-11-12 18:03:49 +02:00
parent 41b48ab7f1
commit 83c742f1a7
4 changed files with 64 additions and 10 deletions
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -59,6 +59,7 @@ struct whisper_params {

    float word_thold = 0.01f;

+    bool speed_up             = false;
    bool verbose              = false;
    bool translate            = false;
    bool output_txt           = false;
@@ -104,6 +105,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
            params.max_len = std::stoi(argv[++i]);
        } else if (arg == "-wt" || arg == "--word-thold") {
            params.word_thold = std::stof(argv[++i]);
+        } else if (arg == "-su" || arg == "--speed-up") {
+            params.speed_up = true;
        } else if (arg == "-v" || arg == "--verbose") {
            params.verbose = true;
        } else if (arg == "--translate") {
@@ -161,6 +164,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
    fprintf(stderr, "  -mc N,    --max-context N  maximum number of text context tokens to store (default: max)\n");
    fprintf(stderr, "  -ml N,    --max-len N      maximum segment length in characters (default: %d)\n", params.max_len);
    fprintf(stderr, "  -wt N,    --word-thold N   word timestamp probability threshold (default: %f)\n", params.word_thold);
+    fprintf(stderr, "  -su,      --speed-up       speed up audio by factor of 2 (faster processing, reduced accuracy, default: %s)\n", params.speed_up ? "true" : "false");
    fprintf(stderr, "  -v,       --verbose        verbose output\n");
    fprintf(stderr, "            --translate      translate from source language to english\n");
    fprintf(stderr, "  -otxt,    --output-txt     output result in a text file\n");
@@ -454,7 +458,7 @@ int main(int argc, char ** argv) {
        std::vector<float> pcmf32;
        {
            drwav wav;
-            
+
            if (fname_inp == "-") {
                std::vector<uint8_t> wav_data;
                {
@@ -563,6 +567,8 @@ int main(int argc, char ** argv) {
            wparams.thold_pt             = params.word_thold;
            wparams.max_len              = params.output_wts && params.max_len == 0 ? 60 : params.max_len;

+            wparams.speed_up             = params.speed_up;
+
            // this callback is called on each new segment
            if (!wparams.print_realtime) {
                wparams.new_segment_callback           = whisper_print_segment_callback;