refactoring : move main + stream in examples + other stuff

2025-06-18 06:48:08 +00:00 · 2022-10-25 19:13:08 +03:00
parent 4c68f4cac0
commit c6710efde2
18 changed files with 205 additions and 102 deletions
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -4,11 +4,24 @@ find_package(Threads REQUIRED)

 # third-party

-#add_subdirectory(third-party)
+# Locate SDL2 only when SDL2 support was requested; the resulting
+# SDL2_INCLUDE_DIRS / SDL2_LIBRARIES variables are consumed by the
+# stream example's CMakeLists.txt.
+if (WHISPER_SUPPORT_SDL2)
+    # SDL2
+    find_package(SDL2 REQUIRED)
+
+    # remove leading/trailing whitespace from the library list
+    # (presumably some SDL2 config files emit it - TODO confirm)
+    string(STRIP "${SDL2_LIBRARIES}" SDL2_LIBRARIES)
+
+    # print what was found to ease debugging of the configure step
+    message(STATUS "SDL2_INCLUDE_DIRS = ${SDL2_INCLUDE_DIRS}")
+    message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
+endif()

 # examples

+# make headers that live directly in examples/ (e.g. dr_wav.h) visible
+# to the example programs in the sub-directories
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
 if (EMSCRIPTEN)
    add_subdirectory(whisper.wasm)
 else()
+    add_subdirectory(main)
+    add_subdirectory(stream)
+    add_subdirectory(bench)
 endif()
--- a/examples/bench/CMakeLists.txt
+++ b/examples/bench/CMakeLists.txt
@ -0,0 +1,3 @@
+# bench: single-source executable linked against the whisper library
+# and the platform thread library
+set(TARGET bench)
+add_executable(${TARGET} bench.cpp)
+target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})
--- a/examples/bench/README.md
+++ b/examples/bench/README.md
@ -0,0 +1,3 @@
+# bench
+
+TODO
--- a/examples/bench/bench.cpp
+++ b/examples/bench/bench.cpp
@ -0,0 +1,78 @@
+#include "whisper.h"
+
+#include <cstdio>
+#include <string>
+#include <thread>
+
+// command-line parameters
+struct whisper_params {
+    // Number of threads used for the computation: at most 4, at least 1.
+    // std::thread::hardware_concurrency() may return 0 when the value is not
+    // computable, so the result is clamped to avoid a default of 0 threads.
+    int32_t n_threads   = std::max(1, std::min(4, (int32_t) std::thread::hardware_concurrency()));
+
+    // path of the model file passed to whisper_init()
+    std::string model     = "models/ggml-base.en.bin";
+};
+
+void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
+
+// Parse the command line into `params`.
+//
+// Returns false (after printing the usage) when an option that expects a
+// value is the last argument. -h/--help and unknown arguments print the
+// usage and terminate the process with exit(0); otherwise returns true.
+bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
+    for (int i = 1; i < argc; i++) {
+        const std::string arg = argv[i];
+
+        // options that consume the following argument as their value
+        const bool takes_value =
+            arg == "-t" || arg == "--threads" ||
+            arg == "-m" || arg == "--model";
+
+        // guard against reading argv[argc] (a null pointer) below
+        if (takes_value && i + 1 >= argc) {
+            fprintf(stderr, "error: missing value for argument: %s\n", arg.c_str());
+            whisper_print_usage(argc, argv, params);
+            return false;
+        }
+
+        if (arg == "-t" || arg == "--threads") {
+            params.n_threads = std::stoi(argv[++i]);
+        } else if (arg == "-m" || arg == "--model") {
+            params.model = argv[++i];
+        } else if (arg == "-h" || arg == "--help") {
+            whisper_print_usage(argc, argv, params);
+            exit(0);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            whisper_print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+
+    return true;
+}
+
+// Write the help text to stderr. The defaults shown are taken from `params`;
+// argv[0] is used as the program name.
+void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
+    // header
+    fprintf(stderr,
+            "\n"
+            "usage: %s [options]\n"
+            "\n"
+            "options:\n",
+            argv[0]);
+
+    // option list followed by a trailing blank line
+    fprintf(stderr,
+            "  -h,       --help           show this help message and exit\n"
+            "  -t N,     --threads N      number of threads to use during computation (default: %d)\n"
+            "  -m FNAME, --model FNAME    model path (default: %s)\n"
+            "\n",
+            params.n_threads,
+            params.model.c_str());
+}
+
+// Benchmark entry point: loads the model, sets a null mel buffer of length 0,
+// runs the encoder once and prints the timings reported by whisper.
+//
+// Exit codes: 0 success, 1 argument parsing failed, 2 model could not be
+// loaded, 3 whisper_set_mel() failed, 4 whisper_encode() failed.
+int main(int argc, char ** argv) {
+    whisper_params params;
+
+    if (whisper_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    // whisper init
+
+    struct whisper_context * ctx = whisper_init(params.model.c_str());
+
+    if (ctx == nullptr) {
+        fprintf(stderr, "error: failed to initialize whisper context\n");
+        return 2;
+    }
+
+    const int ret_mel = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL);
+    if (ret_mel != 0) {
+        fprintf(stderr, "error: failed to set mel: %d\n", ret_mel);
+        whisper_free(ctx);
+        return 3;
+    }
+
+    // keep the return code in its own variable: `int ret = f() != 0` would
+    // store the result of the comparison instead of the error code
+    const int ret_encode = whisper_encode(ctx, 0, params.n_threads);
+    if (ret_encode != 0) {
+        fprintf(stderr, "error: failed to encode model: %d\n", ret_encode);
+        whisper_free(ctx);
+        return 4;
+    }
+
+    whisper_print_timings(ctx);
+    whisper_free(ctx);
+
+    return 0;
+}
--- a/examples/dr_wav.h
+++ b/examples/dr_wav.h
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@ -0,0 +1,3 @@
+# main: single-source executable linked against the whisper library
+# and the platform thread library
+set(TARGET main)
+add_executable(${TARGET} main.cpp)
+target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})
--- a/examples/main/README.md
+++ b/examples/main/README.md
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -0,0 +1,416 @@
+#include "whisper.h"
+
+// third-party utilities
+// use your favorite implementations
+#define DR_WAV_IMPLEMENTATION
+#include "dr_wav.h"
+
+#include <cmath>
+#include <fstream>
+#include <cstdio>
+#include <string>
+#include <thread>
+#include <vector>
+
+// Terminal color map: 10 escape sequences, one per 0.1-wide range starting
+// at 0.0, 0.1, ..., 0.9. The first entry is red, the middle entries are
+// yellow and the last entry is green.
+const std::vector<std::string> k_colors = {
+    "\033[38;5;196m",
+    "\033[38;5;202m",
+    "\033[38;5;208m",
+    "\033[38;5;214m",
+    "\033[38;5;220m",
+    "\033[38;5;226m",
+    "\033[38;5;190m",
+    "\033[38;5;154m",
+    "\033[38;5;118m",
+    "\033[38;5;82m",
+};
+
+// Format a time given in units of 10 ms as "HH:MM:SS.mmm"
+// (or "HH:MM:SS,mmm" when `comma` is true).
+//
+//  500 -> 00:00:05.000
+// 6000 -> 00:01:00.000
+std::string to_timestamp(int64_t t, bool comma = false) {
+    const int64_t total_ms = t * 10;
+
+    const int64_t hours   =  total_ms / (1000 * 60 * 60);
+    const int64_t minutes = (total_ms / (1000 * 60)) % 60;
+    const int64_t seconds = (total_ms / 1000) % 60;
+    const int64_t millis  =  total_ms % 1000;
+
+    const char * separator = comma ? "," : ".";
+
+    char text[32];
+    snprintf(text, sizeof(text), "%02d:%02d:%02d%s%03d",
+             (int) hours, (int) minutes, (int) seconds, separator, (int) millis);
+
+    return std::string(text);
+}
+
+// command-line parameters
+struct whisper_params {
+    int32_t seed        = -1; // RNG seed, not used currently
+    // Number of threads used for the computation: at most 4, at least 1.
+    // std::thread::hardware_concurrency() may return 0 when the value is not
+    // computable, so the result is clamped to avoid a default of 0 threads.
+    int32_t n_threads   = std::max(1, std::min(4, (int32_t) std::thread::hardware_concurrency()));
+    int32_t offset_t_ms = 0; // time offset in milliseconds
+    int32_t offset_n    = 0; // segment index offset
+
+    bool verbose              = false;
+    bool translate            = false; // translate from source language to english
+    bool output_txt           = false; // also write <input>.txt
+    bool output_vtt           = false; // also write <input>.vtt
+    bool output_srt           = false; // also write <input>.srt
+    bool print_special_tokens = false;
+    bool print_colors         = false;
+    bool no_timestamps        = false;
+
+    std::string language  = "en";                      // spoken language
+    std::string model     = "models/ggml-base.en.bin"; // model path
+
+    // input WAV files (positional arguments and -f/--file values)
+    std::vector<std::string> fname_inp = {};
+};
+
+void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
+
+// Parse the command line into `params`.
+//
+// Arguments that do not start with '-' are collected as input file names.
+// Returns false (after printing the usage) when an option that expects a
+// value is the last argument. -h/--help, an unknown language and unknown
+// arguments print the usage and terminate the process with exit(0);
+// otherwise returns true.
+bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+
+        if (arg[0] != '-') {
+            params.fname_inp.push_back(arg);
+            continue;
+        }
+
+        // options that consume the following argument as their value
+        const bool takes_value =
+            arg == "-s"  || arg == "--seed"     ||
+            arg == "-t"  || arg == "--threads"  ||
+            arg == "-ot" || arg == "--offset-t" ||
+            arg == "-on" || arg == "--offset-n" ||
+            arg == "-l"  || arg == "--language" ||
+            arg == "-m"  || arg == "--model"    ||
+            arg == "-f"  || arg == "--file";
+
+        // guard against reading argv[argc] (a null pointer) below
+        if (takes_value && i + 1 >= argc) {
+            fprintf(stderr, "error: missing value for argument: %s\n", arg.c_str());
+            whisper_print_usage(argc, argv, params);
+            return false;
+        }
+
+        if (arg == "-s" || arg == "--seed") {
+            params.seed = std::stoi(argv[++i]);
+        } else if (arg == "-t" || arg == "--threads") {
+            params.n_threads = std::stoi(argv[++i]);
+        } else if (arg == "-ot" || arg == "--offset-t") {
+            params.offset_t_ms = std::stoi(argv[++i]);
+        } else if (arg == "-on" || arg == "--offset-n") {
+            params.offset_n = std::stoi(argv[++i]);
+        } else if (arg == "-v" || arg == "--verbose") {
+            params.verbose = true;
+        } else if (arg == "--translate") {
+            params.translate = true;
+        } else if (arg == "-l" || arg == "--language") {
+            params.language = argv[++i];
+            if (whisper_lang_id(params.language.c_str()) == -1) {
+                fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
+                whisper_print_usage(argc, argv, params);
+                exit(0);
+            }
+        } else if (arg == "-otxt" || arg == "--output-txt") {
+            params.output_txt = true;
+        } else if (arg == "-ovtt" || arg == "--output-vtt") {
+            params.output_vtt = true;
+        } else if (arg == "-osrt" || arg == "--output-srt") {
+            params.output_srt = true;
+        } else if (arg == "-ps" || arg == "--print_special") {
+            params.print_special_tokens = true;
+        } else if (arg == "-pc" || arg == "--print_colors") {
+            params.print_colors = true;
+        } else if (arg == "-nt" || arg == "--no_timestamps") {
+            params.no_timestamps = true;
+        } else if (arg == "-m" || arg == "--model") {
+            params.model = argv[++i];
+        } else if (arg == "-f" || arg == "--file") {
+            params.fname_inp.push_back(argv[++i]);
+        } else if (arg == "-h" || arg == "--help") {
+            whisper_print_usage(argc, argv, params);
+            exit(0);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            whisper_print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+
+    return true;
+}
+
+// Write the help text to stderr. The defaults shown are taken from `params`;
+// argv[0] is used as the program name.
+void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
+    // header
+    fprintf(stderr,
+            "\n"
+            "usage: %s [options] file0.wav file1.wav ...\n"
+            "\n"
+            "options:\n",
+            argv[0]);
+
+    // options with numeric defaults
+    fprintf(stderr,
+            "  -h,       --help           show this help message and exit\n"
+            "  -s SEED,  --seed SEED      RNG seed (default: -1)\n"
+            "  -t N,     --threads N      number of threads to use during computation (default: %d)\n"
+            "  -ot N,    --offset-t N     time offset in milliseconds (default: %d)\n"
+            "  -on N,    --offset-n N     segment index offset (default: %d)\n",
+            params.n_threads,
+            params.offset_t_ms,
+            params.offset_n);
+
+    // plain switches
+    fprintf(stderr,
+            "  -v,       --verbose        verbose output\n"
+            "            --translate      translate from source language to english\n"
+            "  -otxt,    --output-txt     output result in a text file\n"
+            "  -ovtt,    --output-vtt     output result in a vtt file\n"
+            "  -osrt,    --output-srt     output result in a srt file\n"
+            "  -ps,      --print_special  print special tokens\n"
+            "  -pc,      --print_colors   print colors\n"
+            "  -nt,      --no_timestamps  do not print timestamps\n");
+
+    // options with string defaults, followed by a trailing blank line
+    fprintf(stderr,
+            "  -l LANG,  --language LANG  spoken language (default: %s)\n"
+            "  -m FNAME, --model FNAME    model path (default: %s)\n"
+            "  -f FNAME, --file FNAME     input WAV file path\n"
+            "\n",
+            params.language.c_str(),
+            params.model.c_str());
+}
+
+// Print every token of segment `i_segment`, wrapped in the k_colors entry
+// selected by the cube of its probability. Tokens whose id is
+// >= whisper_token_eot() are skipped unless params.print_special_tokens is set.
+static void whisper_print_colored_tokens(struct whisper_context * ctx, int i_segment, const whisper_params & params) {
+    const int n_colors = (int) k_colors.size();
+    const int n_tokens = whisper_full_n_tokens(ctx, i_segment);
+
+    for (int j = 0; j < n_tokens; ++j) {
+        if (params.print_special_tokens == false) {
+            const whisper_token id = whisper_full_get_token_id(ctx, i_segment, j);
+            if (id >= whisper_token_eot(ctx)) {
+                continue;
+            }
+        }
+
+        const char * text = whisper_full_get_token_text(ctx, i_segment, j);
+        const float  p    = whisper_full_get_token_p   (ctx, i_segment, j);
+
+        // clamp to the last valid index: p == 1.0 would otherwise yield
+        // n_colors, which is one past the end of k_colors
+        const int col = std::max(0, std::min(n_colors - 1, (int) (std::pow(p, 3)*float(n_colors))));
+
+        printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
+    }
+}
+
+// New-segment callback: prints the most recent segment to stdout.
+//
+// `user_data` must point to the whisper_params of the run. Depending on the
+// parameters the segment is printed with or without a "[t0 --> t1]" prefix
+// and either as plain text or token by token in color.
+void whisper_print_segment_callback(struct whisper_context * ctx, void * user_data) {
+    const whisper_params & params = *(whisper_params *) user_data;
+
+    const int n_segments = whisper_full_n_segments(ctx);
+
+    // print the last segment
+    const int i = n_segments - 1;
+    if (i == 0) {
+        printf("\n");
+    }
+
+    if (params.no_timestamps) {
+        if (params.print_colors) {
+            whisper_print_colored_tokens(ctx, i, params);
+        } else {
+            const char * text = whisper_full_get_segment_text(ctx, i);
+            printf("%s", text);
+        }
+        // no newline is printed in this mode, so flush explicitly
+        fflush(stdout);
+    } else {
+        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+
+        if (params.print_colors) {
+            printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
+            whisper_print_colored_tokens(ctx, i, params);
+            printf("\n");
+        } else {
+            const char * text = whisper_full_get_segment_text(ctx, i);
+
+            printf("[%s --> %s]  %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
+        }
+    }
+}
+
+// Write the text of all segments, back to back, to the file `fname`.
+// Returns false when the file cannot be opened for writing, true otherwise.
+bool output_txt(struct whisper_context * ctx, const char * fname) {
+    std::ofstream out_file(fname);
+    if (out_file.is_open() == false) {
+        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
+        return false;
+    }
+
+    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
+
+    const int segment_count = whisper_full_n_segments(ctx);
+    int i_segment = 0;
+    while (i_segment < segment_count) {
+        out_file << whisper_full_get_segment_text(ctx, i_segment);
+        ++i_segment;
+    }
+
+    return true;
+}
+
+// Write all segments to the file `fname` in WebVTT form: a "WEBVTT" header
+// followed by one "t0 --> t1" line and the segment text per segment.
+// Returns false when the file cannot be opened for writing, true otherwise.
+bool output_vtt(struct whisper_context * ctx, const char * fname) {
+    std::ofstream fout(fname);
+    if (!fout.is_open()) {
+        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
+        // a non-zero integer here would convert to true and report success
+        return false;
+    }
+
+    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
+
+    fout << "WEBVTT\n\n";
+
+    const int n_segments = whisper_full_n_segments(ctx);
+    for (int i = 0; i < n_segments; ++i) {
+        const char * text = whisper_full_get_segment_text(ctx, i);
+        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+
+        fout << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
+        fout << text << "\n\n";
+    }
+
+    return true;
+}
+
+// Write all segments to the file `fname` in SubRip form: a running index
+// (starting at 1 + params.offset_n), a "t0 --> t1" line using ',' as the
+// millisecond separator, and the segment text.
+// Returns false when the file cannot be opened for writing, true otherwise.
+bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
+    std::ofstream out_file(fname);
+    if (out_file.is_open() == false) {
+        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
+        return false;
+    }
+
+    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
+
+    const int segment_count = whisper_full_n_segments(ctx);
+    int i_segment = 0;
+    while (i_segment < segment_count) {
+        const char * segment_text = whisper_full_get_segment_text(ctx, i_segment);
+        const int64_t t_begin = whisper_full_get_segment_t0(ctx, i_segment);
+        const int64_t t_end   = whisper_full_get_segment_t1(ctx, i_segment);
+
+        const int entry_index = i_segment + 1 + params.offset_n;
+
+        out_file << entry_index << "\n"
+                 << to_timestamp(t_begin, true) << " --> " << to_timestamp(t_end, true) << "\n"
+                 << segment_text << "\n\n";
+
+        ++i_segment;
+    }
+
+    return true;
+}
+
+// Entry point: transcribes every input WAV file with the selected model,
+// printing each segment as it is produced, and optionally writes
+// <input>.txt / <input>.vtt / <input>.srt files.
+//
+// Exit codes: 0 success, 1 argument parsing failed, 2 no input files,
+// 3 model could not be loaded, 4 WAV file could not be opened,
+// 5 unsupported channel count, 6 unsupported sample rate,
+// 7 unsupported bit depth, 8 whisper_full() failed.
+int main(int argc, char ** argv) {
+    whisper_params params;
+
+    if (whisper_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    if (params.seed < 0) {
+        params.seed = time(NULL);
+    }
+
+    if (params.fname_inp.empty()) {
+        fprintf(stderr, "error: no input files specified\n");
+        whisper_print_usage(argc, argv, params);
+        return 2;
+    }
+
+    // whisper init
+
+    struct whisper_context * ctx = whisper_init(params.model.c_str());
+
+    if (ctx == nullptr) {
+        fprintf(stderr, "error: failed to initialize whisper context\n");
+        return 3;
+    }
+
+    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
+        const auto fname_inp = params.fname_inp[f];
+
+        // WAV input
+        std::vector<float> pcmf32;
+        {
+            drwav wav;
+            if (!drwav_init_file(&wav, fname_inp.c_str(), NULL)) {
+                fprintf(stderr, "%s: failed to open WAV file '%s' - check your input\n", argv[0], fname_inp.c_str());
+                whisper_print_usage(argc, argv, {});
+                whisper_free(ctx);
+                return 4;
+            }
+
+            // from here on the WAV handle is open: release it together with
+            // the whisper context on every early return
+            if (wav.channels != 1 && wav.channels != 2) {
+                fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
+                drwav_uninit(&wav);
+                whisper_free(ctx);
+                return 5;
+            }
+
+            if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
+                fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], fname_inp.c_str());
+                drwav_uninit(&wav);
+                whisper_free(ctx);
+                return 6;
+            }
+
+            if (wav.bitsPerSample != 16) {
+                fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
+                drwav_uninit(&wav);
+                whisper_free(ctx);
+                return 7;
+            }
+
+            int n = wav.totalPCMFrameCount;
+
+            std::vector<int16_t> pcm16;
+            pcm16.resize(n*wav.channels);
+            drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
+            drwav_uninit(&wav);
+
+            // convert to mono, float
+            pcmf32.resize(n);
+            if (wav.channels == 1) {
+                for (int i = 0; i < n; i++) {
+                    pcmf32[i] = float(pcm16[i])/32768.0f;
+                }
+            } else {
+                // average the two samples of each frame
+                for (int i = 0; i < n; i++) {
+                    pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
+                }
+            }
+        }
+
+        // print some info about the processing
+        {
+            fprintf(stderr, "\n");
+            if (!whisper_is_multilingual(ctx)) {
+                if (params.language != "en" || params.translate) {
+                    params.language = "en";
+                    params.translate = false;
+                    fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
+                }
+            }
+            fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
+                    __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE, params.n_threads,
+                    params.language.c_str(),
+                    params.translate ? "translate" : "transcribe",
+                    params.no_timestamps ? 0 : 1);
+
+            fprintf(stderr, "\n");
+        }
+
+
+        // run the inference
+        {
+            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+
+            wparams.print_realtime       = false;
+            wparams.print_progress       = false;
+            wparams.print_timestamps     = !params.no_timestamps;
+            wparams.print_special_tokens = params.print_special_tokens;
+            wparams.translate            = params.translate;
+            wparams.language             = params.language.c_str();
+            wparams.n_threads            = params.n_threads;
+            wparams.offset_ms            = params.offset_t_ms;
+
+            // this callback is called on each new segment
+            if (!wparams.print_realtime) {
+                wparams.new_segment_callback           = whisper_print_segment_callback;
+                wparams.new_segment_callback_user_data = &params;
+            }
+
+            if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
+                fprintf(stderr, "%s: failed to process audio\n", argv[0]);
+                whisper_free(ctx);
+                return 8;
+            }
+
+            printf("\n");
+
+            // output to text file
+            if (params.output_txt) {
+                const auto fname_txt = fname_inp + ".txt";
+                output_txt(ctx, fname_txt.c_str());
+            }
+
+            // output to VTT file
+            if (params.output_vtt) {
+                const auto fname_vtt = fname_inp + ".vtt";
+                output_vtt(ctx, fname_vtt.c_str());
+            }
+
+            // output to SRT file
+            if (params.output_srt) {
+                const auto fname_srt = fname_inp + ".srt";
+                output_srt(ctx, fname_srt.c_str(), params);
+            }
+        }
+    }
+
+    whisper_print_timings(ctx);
+    whisper_free(ctx);
+
+    return 0;
+}
--- a/examples/stream/CMakeLists.txt
+++ b/examples/stream/CMakeLists.txt
@ -0,0 +1,7 @@
+# The stream example includes the SDL2 headers, so it is only built when
+# SDL2 support is enabled.
+if (WHISPER_SUPPORT_SDL2)
+    # stream
+    set(TARGET stream)
+    add_executable(${TARGET} stream.cpp)
+    # SDL2_INCLUDE_DIRS / SDL2_LIBRARIES are set where find_package(SDL2) ran
+    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
+    target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+endif ()
--- a/examples/stream/README.md
+++ b/examples/stream/README.md
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@ -0,0 +1,345 @@
+// Real-time speech recognition of input from a microphone
+//
+// A very quick-n-dirty implementation serving mainly as a proof of concept.
+
+#include "whisper.h"
+
+// third-party utilities
+// use your favorite implementations
+#define DR_WAV_IMPLEMENTATION
+#include "dr_wav.h"
+
+#include <SDL.h>
+#include <SDL_audio.h>
+
+#include <cassert>
+#include <cstdio>
+#include <string>
+#include <thread>
+#include <vector>
+
+// Format a time given in units of 10 ms as "MM:SS.mmm".
+//
+//  500 -> 00:05.000
+//  550 -> 00:05.500
+// 6000 -> 01:00.000
+std::string to_timestamp(int64_t t) {
+    int64_t sec = t/100;
+    // the remainder is in units of 10 ms - scale it to milliseconds
+    int64_t msec = (t - sec*100)*10;
+    int64_t min = sec/60;
+    sec = sec - min*60;
+
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);
+
+    return std::string(buf);
+}
+
+// command-line parameters
+struct whisper_params {
+    int32_t seed      = -1; // RNG seed, not used currently
+    // Number of threads used for the computation: at most 4, at least 1.
+    // std::thread::hardware_concurrency() may return 0 when the value is not
+    // computable, so the result is clamped to avoid a default of 0 threads.
+    int32_t n_threads = std::max(1, std::min(4, (int32_t) std::thread::hardware_concurrency()));
+    int32_t step_ms   = 3000;  // audio step size in milliseconds
+    int32_t length_ms = 10000; // audio length in milliseconds
+
+    bool verbose              = false;
+    bool translate            = false; // translate from source language to english
+    bool no_context           = true;  // cleared by -kc/--keep-context
+    bool print_special_tokens = false;
+    bool no_timestamps        = true;
+
+    std::string language  = "en";                      // spoken language
+    std::string model     = "models/ggml-base.en.bin"; // model path
+    std::string fname_inp = "samples/jfk.wav";         // value of -f/--file
+};
+
+void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
+
+// Parse the command line into `params`.
+//
+// Returns false (after printing the usage) when an option that expects a
+// value is the last argument. -h/--help, an unknown language and unknown
+// arguments print the usage and terminate the process with exit(0);
+// otherwise returns true.
+bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+
+        // options that consume the following argument as their value
+        const bool takes_value =
+            arg == "-s" || arg == "--seed"     ||
+            arg == "-t" || arg == "--threads"  ||
+            arg == "--step"                    ||
+            arg == "--length"                  ||
+            arg == "-l" || arg == "--language" ||
+            arg == "-m" || arg == "--model"    ||
+            arg == "-f" || arg == "--file";
+
+        // guard against reading argv[argc] (a null pointer) below
+        if (takes_value && i + 1 >= argc) {
+            fprintf(stderr, "error: missing value for argument: %s\n", arg.c_str());
+            whisper_print_usage(argc, argv, params);
+            return false;
+        }
+
+        if (arg == "-s" || arg == "--seed") {
+            params.seed = std::stoi(argv[++i]);
+        } else if (arg == "-t" || arg == "--threads") {
+            params.n_threads = std::stoi(argv[++i]);
+        } else if (arg == "--step") {
+            params.step_ms = std::stoi(argv[++i]);
+        } else if (arg == "--length") {
+            params.length_ms = std::stoi(argv[++i]);
+        } else if (arg == "-v" || arg == "--verbose") {
+            params.verbose = true;
+        } else if (arg == "--translate") {
+            params.translate = true;
+        } else if (arg == "-kc" || arg == "--keep-context") {
+            params.no_context = false;
+        } else if (arg == "-l" || arg == "--language") {
+            params.language = argv[++i];
+            if (whisper_lang_id(params.language.c_str()) == -1) {
+                fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
+                whisper_print_usage(argc, argv, params);
+                exit(0);
+            }
+        } else if (arg == "-ps" || arg == "--print_special") {
+            params.print_special_tokens = true;
+        } else if (arg == "-nt" || arg == "--no_timestamps") {
+            // NOTE(review): no_timestamps already defaults to true in this
+            // example's whisper_params, so this flag currently has no effect
+            params.no_timestamps = true;
+        } else if (arg == "-m" || arg == "--model") {
+            params.model = argv[++i];
+        } else if (arg == "-f" || arg == "--file") {
+            params.fname_inp = argv[++i];
+        } else if (arg == "-h" || arg == "--help") {
+            whisper_print_usage(argc, argv, params);
+            exit(0);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            whisper_print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+
+    return true;
+}
+
+// Write the help text to stderr. The defaults shown are taken from `params`;
+// argv[0] is used as the program name.
+void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
+    // header
+    fprintf(stderr,
+            "\n"
+            "usage: %s [options]\n"
+            "\n"
+            "options:\n",
+            argv[0]);
+
+    // options with numeric defaults
+    fprintf(stderr,
+            "  -h,       --help           show this help message and exit\n"
+            "  -s SEED,  --seed SEED      RNG seed (default: -1)\n"
+            "  -t N,     --threads N      number of threads to use during computation (default: %d)\n"
+            "            --step N         audio step size in milliseconds (default: %d)\n"
+            "            --length N       audio length in milliseconds (default: %d)\n",
+            params.n_threads,
+            params.step_ms,
+            params.length_ms);
+
+    // plain switches
+    fprintf(stderr,
+            "  -v,       --verbose        verbose output\n"
+            "            --translate      translate from source language to english\n"
+            "  -kc,      --keep-context   keep text context from earlier audio (default: false)\n"
+            "  -ps,      --print_special  print special tokens\n"
+            "  -nt,      --no_timestamps  do not print timestamps\n");
+
+    // options with string defaults, followed by a trailing blank line
+    fprintf(stderr,
+            "  -l LANG,  --language LANG  spoken language (default: %s)\n"
+            "  -m FNAME, --model FNAME    model path (default: %s)\n"
+            "  -f FNAME, --file FNAME     input WAV file path (default: %s)\n"
+            "\n",
+            params.language.c_str(),
+            params.model.c_str(),
+            params.fname_inp.c_str());
+}
+
+//
+// SDL Audio capture
+//
+
+// id of the opened capture device, 0 while no device is open
+SDL_AudioDeviceID g_dev_id_in = 0;
+
+// Initialize the SDL audio subsystem and open a capture device requesting
+// WHISPER_SAMPLE_RATE Hz, mono, 32-bit float samples.
+//
+// capture_id: index of the capture device to open, or a negative value to
+//             open the default capture device.
+//
+// Returns true when a device was opened (its id is stored in g_dev_id_in).
+// Returns false when a device is already open, when SDL cannot be
+// initialized or when the device cannot be opened.
+bool audio_sdl_init(const int capture_id) {
+    if (g_dev_id_in) {
+        fprintf(stderr, "%s: already initialized\n", __func__);
+        return false;
+    }
+
+    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
+
+    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
+        // a non-zero integer here would convert to true and report success
+        return false;
+    }
+
+    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
+
+    // list the available capture devices
+    {
+        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
+        printf("%s: found %d capture devices:\n", __func__, nDevices);
+        for (int i = 0; i < nDevices; i++) {
+            printf("%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
+        }
+    }
+
+    SDL_AudioSpec capture_spec_requested;
+    SDL_AudioSpec capture_spec_obtained;
+
+    SDL_zero(capture_spec_requested);
+    SDL_zero(capture_spec_obtained);
+
+    capture_spec_requested.freq     = WHISPER_SAMPLE_RATE;
+    capture_spec_requested.format   = AUDIO_F32;
+    capture_spec_requested.channels = 1;
+    capture_spec_requested.samples  = 1024;
+
+    if (capture_id >= 0) {
+        printf("%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
+        g_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+    } else {
+        printf("%s: attempt to open default capture device ...\n", __func__);
+        g_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+    }
+
+    if (!g_dev_id_in) {
+        printf("%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
+        g_dev_id_in = 0;
+        // report the failure so the caller does not go on to use device id 0
+        return false;
+    }
+
+    printf("%s: obtained spec for input device (SDL Id = %d):\n", __func__, g_dev_id_in);
+    printf("%s:     - sample rate:       %d\n", __func__, capture_spec_obtained.freq);
+    printf("%s:     - format:            %d (required: %d)\n", __func__, capture_spec_obtained.format, capture_spec_requested.format);
+    printf("%s:     - channels:          %d (required: %d)\n", __func__, capture_spec_obtained.channels, capture_spec_requested.channels);
+    printf("%s:     - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
+
+    return true;
+}
+
+///////////////////////////
+
+// Entry point: captures audio from the default capture device and repeatedly
+// runs whisper on the newly captured samples plus up to params.length_ms of
+// audio kept from the previous iteration, printing the result each time.
+//
+// Exit codes: 0 success, 1 bad arguments or audio initialization failure,
+// 2 model could not be loaded, 6 whisper_full() failed.
+int main(int argc, char ** argv) {
+    whisper_params params;
+
+    if (whisper_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    if (params.seed < 0) {
+        params.seed = time(NULL);
+    }
+
+    // step_ms is used as a divisor below
+    if (params.step_ms <= 0) {
+        fprintf(stderr, "%s: --step must be a positive number of milliseconds\n", __func__);
+        return 1;
+    }
+
+    // init audio
+
+    if (!audio_sdl_init(-1)) {
+        fprintf(stderr, "%s: audio_sdl_init() failed!\n", __func__);
+        return 1;
+    }
+
+    // whisper init
+
+    struct whisper_context * ctx = whisper_init(params.model.c_str());
+
+    if (ctx == nullptr) {
+        fprintf(stderr, "error: failed to initialize whisper context\n");
+        return 2;
+    }
+
+    const int n_samples = (params.step_ms/1000.0)*WHISPER_SAMPLE_RATE;
+    const int n_samples_len = (params.length_ms/1000.0)*WHISPER_SAMPLE_RATE;
+    const int n_samples_30s = 30*WHISPER_SAMPLE_RATE;
+    std::vector<float> pcmf32(n_samples_30s, 0.0f);
+    std::vector<float> pcmf32_old;
+
+    // number of iterations after which a new line is started; kept >= 1
+    // because it is used as the divisor of a modulo below
+    const int n_new_line = std::max(1, params.length_ms / params.step_ms - 1);
+
+    // print some info about the processing
+    {
+        printf("\n");
+        if (!whisper_is_multilingual(ctx)) {
+            if (params.language != "en" || params.translate) {
+                params.language = "en";
+                params.translate = false;
+                printf("%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
+            }
+        }
+        printf("%s: processing %d samples (step = %.1f sec / len = %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
+                __func__,
+                n_samples,
+                float(n_samples)/WHISPER_SAMPLE_RATE,
+                float(n_samples_len)/WHISPER_SAMPLE_RATE,
+                params.n_threads,
+                params.language.c_str(),
+                params.translate ? "translate" : "transcribe",
+                params.no_timestamps ? 0 : 1);
+
+        printf("%s: n_new_line = %d\n", __func__, n_new_line);
+        printf("\n");
+    }
+
+    // un-pause the capture device
+    SDL_PauseAudioDevice(g_dev_id_in, 0);
+
+    int n_iter = 0;
+    bool is_running = true;
+
+    // main audio loop
+    while (is_running) {
+        // process SDL events:
+        SDL_Event event;
+        while (SDL_PollEvent(&event)) {
+            switch (event.type) {
+                case SDL_QUIT:
+                    is_running = false;
+                    break;
+                default:
+                    break;
+            }
+        }
+
+        // process new audio
+        if (n_iter > 0 && SDL_GetQueuedAudioSize(g_dev_id_in) > 2*n_samples*sizeof(float)) {
+            fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
+            SDL_ClearQueuedAudio(g_dev_id_in);
+        }
+
+        // wait until at least one step worth of audio is queued
+        while (SDL_GetQueuedAudioSize(g_dev_id_in) < n_samples*sizeof(float)) {
+            SDL_Delay(1);
+        }
+
+        const int n_samples_new = SDL_GetQueuedAudioSize(g_dev_id_in)/sizeof(float);
+
+        // take one second from previous iteration
+        //const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_30s/30 - n_samples_new));
+
+        // take up to params.length_ms audio from previous iteration
+        const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_len - n_samples_new));
+
+        //printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
+
+        pcmf32.resize(n_samples_new + n_samples_take);
+
+        // the tail of the previous buffer goes first ...
+        for (int i = 0; i < n_samples_take; i++) {
+            pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
+        }
+
+        // ... followed by the newly captured samples
+        SDL_DequeueAudio(g_dev_id_in, pcmf32.data() + n_samples_take, n_samples_new*sizeof(float));
+
+        pcmf32_old = pcmf32;
+
+        // run the inference
+        {
+            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+
+            wparams.print_progress       = false;
+            wparams.print_special_tokens = params.print_special_tokens;
+            wparams.print_realtime       = false;
+            wparams.print_timestamps     = !params.no_timestamps;
+            wparams.translate            = params.translate;
+            wparams.no_context           = params.no_context;
+            wparams.language             = params.language.c_str();
+            wparams.n_threads            = params.n_threads;
+
+            if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
+                fprintf(stderr, "%s: failed to process audio\n", argv[0]);
+                whisper_free(ctx);
+                return 6;
+            }
+
+            // print result;
+            {
+                // erase the current terminal line and return to its start
+                printf("\33[2K\r");
+
+                const int n_segments = whisper_full_n_segments(ctx);
+                for (int i = 0; i < n_segments; ++i) {
+                    const char * text = whisper_full_get_segment_text(ctx, i);
+
+                    if (params.no_timestamps) {
+                        printf ("%s", text);
+                        fflush(stdout);
+                    } else {
+                        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+                        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+
+                        printf ("[%s --> %s]  %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
+                    }
+                }
+            }
+
+            ++n_iter;
+
+            // every n_new_line iterations: start a new output line and
+            // drop the audio kept from earlier iterations
+            if ((n_iter % n_new_line) == 0) {
+                printf("\n");
+
+                pcmf32_old.clear();
+            }
+        }
+    }
+
+    whisper_print_timings(ctx);
+    whisper_free(ctx);
+
+    return 0;
+}