From fbad8058c4c6ba60592a62f2eae3d9fb2c141635 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 13 May 2025 12:31:00 +0200 Subject: [PATCH] examples : add VAD speech segments example (#3147) This commit adds an example that demonstrates how to use a VAD (Voice Activity Detection) model to segment an audio file into speech segments. Resolves: https://github.com/ggml-org/whisper.cpp/issues/3144 --- examples/CMakeLists.txt | 1 + examples/vad-speech-segments/CMakeLists.txt | 8 ++ examples/vad-speech-segments/README.md | 52 +++++++ examples/vad-speech-segments/speech.cpp | 143 ++++++++++++++++++++ 4 files changed, 204 insertions(+) create mode 100644 examples/vad-speech-segments/CMakeLists.txt create mode 100644 examples/vad-speech-segments/README.md create mode 100644 examples/vad-speech-segments/speech.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e4265aff..c37a2e6d 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -105,6 +105,7 @@ else() add_subdirectory(bench) add_subdirectory(server) add_subdirectory(quantize) + add_subdirectory(vad-speech-segments) if (WHISPER_SDL2) add_subdirectory(stream) add_subdirectory(command) diff --git a/examples/vad-speech-segments/CMakeLists.txt b/examples/vad-speech-segments/CMakeLists.txt new file mode 100644 index 00000000..da685244 --- /dev/null +++ b/examples/vad-speech-segments/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET vad-speech-segments) +add_executable(${TARGET} speech.cpp) + +include(DefaultTargetOptions) + +target_link_libraries(${TARGET} PRIVATE common whisper ${FFMPEG_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) + +install(TARGETS ${TARGET} RUNTIME) diff --git a/examples/vad-speech-segments/README.md b/examples/vad-speech-segments/README.md new file mode 100644 index 00000000..d9c3e74b --- /dev/null +++ b/examples/vad-speech-segments/README.md @@ -0,0 +1,52 @@ +# whisper.cpp/examples/vad-speech-segments + +This example demonstrates how to use a VAD (Voice Activity Detection)
 model to +segment an audio file into speech segments. + +### Building the example +The example can be built using the following command: +```console +cmake -S . -B build +cmake --build build -j8 --target vad-speech-segments +``` + +### Running the example +The example can be run using the following command, which uses a model +that we use internally for testing: +```console +./build/bin/vad-speech-segments \ + --vad-model models/for-tests-silero-v5.1.2-ggml.bin \ + --file samples/jfk.wav \ + --no-prints + +Detected 5 speech segments: +Speech segment 0: start = 0.29, end = 2.21 +Speech segment 1: start = 3.30, end = 3.77 +Speech segment 2: start = 4.00, end = 4.35 +Speech segment 3: start = 5.38, end = 7.65 +Speech segment 4: start = 8.16, end = 10.59 +``` +To see more output from whisper.cpp remove the `--no-prints` argument. + + +### Command line options +```console +./build/bin/vad-speech-segments --help + +usage: ./build/bin/vad-speech-segments [options] file +supported audio formats: flac, mp3, ogg, wav + +options: + -h, --help [default] show this help message and exit + -f FNAME, --file FNAME [ ] input audio file path + -t N, --threads N [4 ] number of threads to use during computation + -ug, --use-gpu [true ] use GPU + -vm FNAME, --vad-model FNAME [ ] VAD model path + -vt N, --vad-threshold N [0.50 ] VAD threshold for speech recognition + -vspd N, --vad-min-speech-duration-ms N [250 ] VAD min speech duration (in ms) + -vsd N, --vad-min-silence-duration-ms N [100 ] VAD min silence duration (to split segments) + -vmsd N, --vad-max-speech-duration-s N [FLT_MAX] VAD max speech duration (auto-split longer) + -vp N, --vad-speech-pad-ms N [30 ] VAD speech padding (extend segments) + -vo N, --vad-samples-overlap N [0.10 ] VAD samples overlap (seconds between segments) + -np, --no-prints [false ] do not print anything other than the results +``` diff --git a/examples/vad-speech-segments/speech.cpp b/examples/vad-speech-segments/speech.cpp new file mode 100644 index
00000000..287933bb --- /dev/null +++ b/examples/vad-speech-segments/speech.cpp @@ -0,0 +1,143 @@ +#include "common.h" +#include "common-whisper.h" + +#include "whisper.h" + +#include +#include +#include + +// command-line parameters +struct cli_params { + int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + std::string vad_model = ""; + float vad_threshold = 0.5f; + int vad_min_speech_duration_ms = 250; + int vad_min_silence_duration_ms = 100; + float vad_max_speech_duration_s = FLT_MAX; + int vad_speech_pad_ms = 30; + float vad_samples_overlap = 0.1f; + bool use_gpu = false; + std::string fname_inp = {}; + bool no_prints = false; +}; + +static void vad_print_usage(int /*argc*/, char ** argv, const cli_params & params) { + fprintf(stderr, "\n"); + fprintf(stderr, "usage: %s [options] file\n", argv[0]); + fprintf(stderr, "supported audio formats: flac, mp3, ogg, wav\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help [default] show this help message and exit\n"); + fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input audio file path\n", ""); + fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads); + fprintf(stderr, " -ug, --use-gpu [%-7s] use GPU\n", params.use_gpu ? 
"true" : "false"); + fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n", params.vad_model.c_str()); + fprintf(stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold); + fprintf(stderr, " -vspd N, --vad-min-speech-duration-ms N [%-7d] VAD min speech duration (0.0-1.0)\n", params.vad_min_speech_duration_ms); + fprintf(stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms); + fprintf(stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ? + std::string("FLT_MAX").c_str() : + std::to_string(params.vad_max_speech_duration_s).c_str()); + fprintf(stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms); + fprintf(stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap); + fprintf(stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false"); + fprintf(stderr, "\n"); +} + +static char * requires_value_error(const std::string & arg) { + fprintf(stderr, "error: argument %s requires value\n", arg.c_str()); + exit(0); +} + +static bool vad_params_parse(int argc, char ** argv, cli_params & params) { + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + + if (arg == "-h" || arg == "--help") { + vad_print_usage(argc, argv, params); + exit(0); + } + #define ARGV_NEXT (((i + 1) < argc) ? 
argv[++i] : requires_value_error(arg)) + else if (arg == "-f" || arg == "--file") { params.fname_inp = ARGV_NEXT; } + else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(ARGV_NEXT); } + else if (arg == "-ug" || arg == "--use-gpu") { params.use_gpu = true; } + else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = ARGV_NEXT; } + else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(ARGV_NEXT); } + else if (arg == "-vsd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); } + else if (arg == "-vsd" || arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); } + else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(ARGV_NEXT); } + else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(ARGV_NEXT); } + else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(ARGV_NEXT); } + else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; } + else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + vad_print_usage(argc, argv, params); + exit(0); + } + } + + return true; +} + +static void cb_log_disable(enum ggml_log_level , const char * , void * ) { } + +int main(int argc, char ** argv) { + cli_params cli_params; + + if (!vad_params_parse(argc, argv, cli_params)) { + vad_print_usage(argc, argv, cli_params); + return 1; + } + + if (cli_params.no_prints) { + whisper_log_set(cb_log_disable, NULL); + } + + // Load the input sample audio file. + std::vector pcmf32; + std::vector> pcmf32s; + if (!read_audio_data(cli_params.fname_inp.c_str(), pcmf32, pcmf32s, false)) { + fprintf(stderr, "error: failed to read audio data from %s\n", cli_params.fname_inp.c_str()); + return 2; + } + + // Initialize the context which loads the VAD model. 
+ struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params(); + ctx_params.n_threads = cli_params.n_threads; + ctx_params.use_gpu = cli_params.use_gpu; + struct whisper_vad_context * vctx = whisper_vad_init_from_file_with_params( + cli_params.vad_model.c_str(), + ctx_params); + + // Detect speech in the input audio file. + if (!whisper_vad_detect_speech(vctx, pcmf32.data(), pcmf32.size())) { + fprintf(stderr, "error: failed to detect speech\n"); + return 3; + } + + // Get the the vad segements using the probabilities that have been computed + // previously and stored in the whisper_vad_context. + struct whisper_vad_params params = whisper_vad_default_params(); + params.threshold = cli_params.vad_threshold; + params.min_speech_duration_ms = cli_params.vad_min_speech_duration_ms; + params.min_silence_duration_ms = cli_params.vad_min_silence_duration_ms; + params.max_speech_duration_s = cli_params.vad_max_speech_duration_s; + params.speech_pad_ms = cli_params.vad_speech_pad_ms; + params.samples_overlap = cli_params.vad_samples_overlap; + struct whisper_vad_segments * segments = whisper_vad_segments_from_probs(vctx, params); + + printf("\n"); + printf("Detected %d speech segments:\n", whisper_vad_segments_n_segments(segments)); + for (int i = 0; i < whisper_vad_segments_n_segments(segments); ++i) { + printf("Speech segment %d: start = %.2f, end = %.2f\n", i, + whisper_vad_segments_get_segment_t0(segments, i), + whisper_vad_segments_get_segment_t1(segments, i)); + } + printf("\n"); + + whisper_vad_free_segments(segments); + whisper_vad_free(vctx); + + return 0; +}