mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-05-21 09:47:52 +00:00
examples : add VAD speech segments example (#3147)
This commit adds an example that demonstrates how to use a VAD (Voice Activity Detection) model to segment an audio file into speech segments. Resolves: https://github.com/ggml-org/whisper.cpp/issues/3144
This commit is contained in:
parent
b2513a6208
commit
fbad8058c4
@ -105,6 +105,7 @@ else()
|
||||
add_subdirectory(bench)
|
||||
add_subdirectory(server)
|
||||
add_subdirectory(quantize)
|
||||
add_subdirectory(vad-speech-segments)
|
||||
if (WHISPER_SDL2)
|
||||
add_subdirectory(stream)
|
||||
add_subdirectory(command)
|
||||
|
8
examples/vad-speech-segments/CMakeLists.txt
Normal file
8
examples/vad-speech-segments/CMakeLists.txt
Normal file
@ -0,0 +1,8 @@
|
||||
# Build configuration for the VAD speech-segments example executable.
set(TARGET vad-speech-segments)
add_executable(${TARGET} speech.cpp)

# Shared compiler/linker defaults used by all whisper.cpp examples.
include(DefaultTargetOptions)

# Link against the example helpers (common), the whisper library, and the
# optional ffmpeg / threading libraries resolved by the top-level build.
target_link_libraries(${TARGET} PRIVATE common whisper ${FFMPEG_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})

install(TARGETS ${TARGET} RUNTIME)
|
52
examples/vad-speech-segments/README.md
Normal file
52
examples/vad-speech-segments/README.md
Normal file
@ -0,0 +1,52 @@
|
||||
# whisper.cpp/examples/vad-speech-segments
|
||||
|
||||
This example demonstrates how to use a VAD (Voice Activity Detection) model to
|
||||
segment an audio file into speech segments.
|
||||
|
||||
### Building the example
|
||||
The example can be built using the following command:
|
||||
```console
|
||||
cmake -S . -B build
|
||||
cmake --build build -j8 --target vad-speech-segments
|
||||
```
|
||||
|
||||
### Running the example
|
||||
The example can be run using the following command, which uses a model
|
||||
that we use internally for testing:
|
||||
```console
|
||||
./build/bin/vad-speech-segments \
|
||||
-vad-model models/for-tests-silero-v5.1.2-ggml.bin \
|
||||
--file samples/jfk.wav \
|
||||
--no-prints
|
||||
|
||||
Detected 5 speech segments:
|
||||
Speech segment 0: start = 0.29, end = 2.21
|
||||
Speech segment 1: start = 3.30, end = 3.77
|
||||
Speech segment 2: start = 4.00, end = 4.35
|
||||
Speech segment 3: start = 5.38, end = 7.65
|
||||
Speech segment 4: start = 8.16, end = 10.59
|
||||
```
|
||||
To see more output from whisper.cpp remove the `--no-prints` argument.
|
||||
|
||||
|
||||
### Command line options
|
||||
```console
|
||||
./build/bin/vad-speech-segments --help
|
||||
|
||||
usage: ./build/bin/vad-speech-segments [options] file
|
||||
supported audio formats: flac, mp3, ogg, wav
|
||||
|
||||
options:
|
||||
-h, --help [default] show this help message and exit
|
||||
-f FNAME, --file FNAME [ ] input audio file path
|
||||
-t N, --threads N [4 ] number of threads to use during computation
|
||||
-ug, --use-gpu [true ] use GPU
|
||||
-vm FNAME, --vad-model FNAME [ ] VAD model path
|
||||
-vt N, --vad-threshold N [0.50 ] VAD threshold for speech recognition
|
||||
-vspd N, --vad-min-speech-duration-ms N [250 ] VAD min speech duration in ms
|
||||
-vsd N, --vad-min-silence-duration-ms N [100 ] VAD min silence duration (to split segments)
|
||||
-vmsd N, --vad-max-speech-duration-s N [FLT_MAX] VAD max speech duration (auto-split longer)
|
||||
-vp N, --vad-speech-pad-ms N [30 ] VAD speech padding (extend segments)
|
||||
-vo N, --vad-samples-overlap N [0.10 ] VAD samples overlap (seconds between segments)
|
||||
-np, --no-prints [false ] do not print anything other than the results
|
||||
```
|
143
examples/vad-speech-segments/speech.cpp
Normal file
143
examples/vad-speech-segments/speech.cpp
Normal file
@ -0,0 +1,143 @@
|
||||
#include "common.h"
#include "common-whisper.h"

#include "whisper.h"

#include <algorithm>
#include <cfloat>
#include <cstdint>
#include <cstdio>
#include <string>
#include <thread>
#include <vector>
||||
|
||||
// Command-line parameters for the VAD speech-segments example, with their
// compiled-in defaults (these are also what vad_print_usage reports).
struct cli_params {
    // Worker threads: hardware concurrency, capped at 4.
    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());

    std::string vad_model                   = "";       // path to the VAD model file (ggml format)
    float       vad_threshold               = 0.5f;     // speech probability threshold
    int         vad_min_speech_duration_ms  = 250;      // discard speech segments shorter than this
    int         vad_min_silence_duration_ms = 100;      // minimum silence gap used to split segments
    float       vad_max_speech_duration_s   = FLT_MAX;  // auto-split segments longer than this
    int         vad_speech_pad_ms           = 30;       // padding added around detected segments
    float       vad_samples_overlap         = 0.1f;     // overlap (seconds) between adjacent segments
    bool        use_gpu                     = false;    // offload the VAD model to the GPU
    std::string fname_inp                   = {};       // input audio file path
    bool        no_prints                   = false;    // suppress everything except the results
};
|
||||
|
||||
static void vad_print_usage(int /*argc*/, char ** argv, const cli_params & params) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "usage: %s [options] file\n", argv[0]);
|
||||
fprintf(stderr, "supported audio formats: flac, mp3, ogg, wav\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
||||
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input audio file path\n", "");
|
||||
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||
fprintf(stderr, " -ug, --use-gpu [%-7s] use GPU\n", params.use_gpu ? "true" : "false");
|
||||
fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n", params.vad_model.c_str());
|
||||
fprintf(stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold);
|
||||
fprintf(stderr, " -vspd N, --vad-min-speech-duration-ms N [%-7d] VAD min speech duration (0.0-1.0)\n", params.vad_min_speech_duration_ms);
|
||||
fprintf(stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms);
|
||||
fprintf(stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ?
|
||||
std::string("FLT_MAX").c_str() :
|
||||
std::to_string(params.vad_max_speech_duration_s).c_str());
|
||||
fprintf(stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms);
|
||||
fprintf(stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
|
||||
fprintf(stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
// Report a missing option value and terminate the process. Declared to return
// char * so it can appear as one arm of the ARGV_NEXT ternary; it never
// actually returns.
static char * requires_value_error(const std::string & arg) {
    fprintf(stderr, "error: argument %s requires value\n", arg.c_str());
    // BUG FIX: previously exit(0) — a missing value is a usage error, so exit
    // with a failure status.
    exit(1);
}
|
||||
|
||||
static bool vad_params_parse(int argc, char ** argv, cli_params & params) {
|
||||
for (int i = 1; i < argc; i++) {
|
||||
std::string arg = argv[i];
|
||||
|
||||
if (arg == "-h" || arg == "--help") {
|
||||
vad_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
#define ARGV_NEXT (((i + 1) < argc) ? argv[++i] : requires_value_error(arg))
|
||||
else if (arg == "-f" || arg == "--file") { params.fname_inp = ARGV_NEXT; }
|
||||
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-ug" || arg == "--use-gpu") { params.use_gpu = true; }
|
||||
else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = ARGV_NEXT; }
|
||||
else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(ARGV_NEXT); }
|
||||
else if (arg == "-vsd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-vsd" || arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(ARGV_NEXT); }
|
||||
else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(ARGV_NEXT); }
|
||||
else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(ARGV_NEXT); }
|
||||
else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; }
|
||||
else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
vad_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void cb_log_disable(enum ggml_log_level , const char * , void * ) { }
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
cli_params cli_params;
|
||||
|
||||
if (!vad_params_parse(argc, argv, cli_params)) {
|
||||
vad_print_usage(argc, argv, cli_params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (cli_params.no_prints) {
|
||||
whisper_log_set(cb_log_disable, NULL);
|
||||
}
|
||||
|
||||
// Load the input sample audio file.
|
||||
std::vector<float> pcmf32;
|
||||
std::vector<std::vector<float>> pcmf32s;
|
||||
if (!read_audio_data(cli_params.fname_inp.c_str(), pcmf32, pcmf32s, false)) {
|
||||
fprintf(stderr, "error: failed to read audio data from %s\n", cli_params.fname_inp.c_str());
|
||||
return 2;
|
||||
}
|
||||
|
||||
// Initialize the context which loads the VAD model.
|
||||
struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
|
||||
ctx_params.n_threads = cli_params.n_threads;
|
||||
ctx_params.use_gpu = cli_params.use_gpu;
|
||||
struct whisper_vad_context * vctx = whisper_vad_init_from_file_with_params(
|
||||
cli_params.vad_model.c_str(),
|
||||
ctx_params);
|
||||
|
||||
// Detect speech in the input audio file.
|
||||
if (!whisper_vad_detect_speech(vctx, pcmf32.data(), pcmf32.size())) {
|
||||
fprintf(stderr, "error: failed to detect speech\n");
|
||||
return 3;
|
||||
}
|
||||
|
||||
// Get the the vad segements using the probabilities that have been computed
|
||||
// previously and stored in the whisper_vad_context.
|
||||
struct whisper_vad_params params = whisper_vad_default_params();
|
||||
params.threshold = cli_params.vad_threshold;
|
||||
params.min_speech_duration_ms = cli_params.vad_min_speech_duration_ms;
|
||||
params.min_silence_duration_ms = cli_params.vad_min_silence_duration_ms;
|
||||
params.max_speech_duration_s = cli_params.vad_max_speech_duration_s;
|
||||
params.speech_pad_ms = cli_params.vad_speech_pad_ms;
|
||||
params.samples_overlap = cli_params.vad_samples_overlap;
|
||||
struct whisper_vad_segments * segments = whisper_vad_segments_from_probs(vctx, params);
|
||||
|
||||
printf("\n");
|
||||
printf("Detected %d speech segments:\n", whisper_vad_segments_n_segments(segments));
|
||||
for (int i = 0; i < whisper_vad_segments_n_segments(segments); ++i) {
|
||||
printf("Speech segment %d: start = %.2f, end = %.2f\n", i,
|
||||
whisper_vad_segments_get_segment_t0(segments, i),
|
||||
whisper_vad_segments_get_segment_t1(segments, i));
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
whisper_vad_free_segments(segments);
|
||||
whisper_vad_free(vctx);
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user