vad : add initial Voice Activity Detection (VAD) support (#3065)

* vad : add initial Voice Activity Detection (VAD) support This commit add support for Voice Activity Detection (VAD). When enabled this feature will process the audio input and detect speech segments. This information is then used to reduce the number of samples that need to be processed by whisper_full. Resolves: https://github.com/ggml-org/whisper.cpp/issues/3003 --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-06-19 23:33:43 +00:00 · 2025-05-12 16:10:11 +02:00
parent e39ba750cd
commit e41bc5c61a
11 changed files with 1972 additions and 11 deletions
--- a/tests/test-vad.cpp
+++ b/tests/test-vad.cpp
@ -0,0 +1,83 @@
+#include "whisper.h"
+#include "common-whisper.h"
+
+#include <cstdio>
+#include <string>
+
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+#include <cassert>
+
+void assert_default_params(const struct whisper_vad_params & params) {
+    assert(params.threshold == 0.5);
+    assert(params.min_speech_duration_ms == 250);
+    assert(params.min_silence_duration_ms == 100);
+    assert(params.samples_overlap == 0.1f);
+}
+
+void assert_default_context_params(const struct whisper_vad_context_params & params) {
+    assert(params.n_threads == 4);
+    assert(params.use_gpu == false);
+    assert(params.gpu_device == 0);
+}
+
+void test_detect_speech(
+        struct whisper_vad_context * vctx,
+        struct whisper_vad_params params,
+        const float * pcmf32,
+        int n_samples) {
+    assert(whisper_vad_detect_speech(vctx, pcmf32, n_samples));
+    assert(whisper_vad_n_probs(vctx) == 344);
+    assert(whisper_vad_probs(vctx) != nullptr);
+}
+
+struct whisper_vad_segments * test_detect_timestamps(
+        struct whisper_vad_context * vctx,
+        struct whisper_vad_params params) {
+    struct whisper_vad_segments * timestamps = whisper_vad_segments_from_probs(vctx, params);
+    assert(whisper_vad_segments_n_segments(timestamps) == 5);
+
+    for (int i = 0; i < whisper_vad_segments_n_segments(timestamps); ++i) {
+        printf("VAD segment %d: start = %.2f, end = %.2f\n", i,
+               whisper_vad_segments_get_segment_t0(timestamps, i),
+               whisper_vad_segments_get_segment_t1(timestamps, i));
+    }
+
+    return timestamps;
+}
+
+int main() {
+    std::string vad_model_path = "../../models/for-tests-silero-v5.1.2-ggml.bin";
+    std::string sample_path    = "../../samples/jfk.wav";
+
+    // Load the sample audio file
+    std::vector<float> pcmf32;
+    std::vector<std::vector<float>> pcmf32s;
+    assert(read_audio_data(sample_path.c_str(), pcmf32, pcmf32s, false));
+    assert(pcmf32.size() > 0);
+    assert(pcmf32s.size() == 0); // no stereo vector
+
+    // Load the VAD model
+    struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
+    assert_default_context_params(ctx_params);
+
+    struct whisper_vad_context * vctx = whisper_vad_init_from_file_with_params(
+            vad_model_path.c_str(),
+            ctx_params);
+    assert(vctx != nullptr);
+
+    struct whisper_vad_params params = whisper_vad_default_params();
+    assert_default_params(params);
+
+    // Test speech probabilites
+    test_detect_speech(vctx, params, pcmf32.data(), pcmf32.size());
+
+    // Test speech timestamps (uses speech probabilities from above)
+    struct whisper_vad_segments * timestamps = test_detect_timestamps(vctx, params);
+
+    whisper_vad_free_segments(timestamps);
+    whisper_vad_free(vctx);
+
+    return 0;
+}