mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-06-16 22:08:07 +00:00
server : add Voice Activity Detection (VAD) support (#3246)
* server : add Voice Activity Detection (VAD) support This commit adds support for Voice Activity Detection (VAD) in the server example. The motivation for this is to enable VAD processing when using whisper-server. Resolves: https://github.com/ggml-org/whisper.cpp/issues/3089 * server : add VAD parameters to usage in README.md [no ci] This commit also adds a few missing parameters. * server : fix conflicting short options [no ci]
This commit is contained in:
@ -23,6 +23,7 @@ options:
|
|||||||
-sow, --split-on-word [false ] split on word rather than on token
|
-sow, --split-on-word [false ] split on word rather than on token
|
||||||
-bo N, --best-of N [2 ] number of best candidates to keep
|
-bo N, --best-of N [2 ] number of best candidates to keep
|
||||||
-bs N, --beam-size N [-1 ] beam size for beam search
|
-bs N, --beam-size N [-1 ] beam size for beam search
|
||||||
|
-ac N, --audio-ctx N [0 ] audio context size (0 - all)
|
||||||
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
|
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
|
||||||
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
|
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
|
||||||
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
|
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
|
||||||
@ -41,9 +42,28 @@ options:
|
|||||||
--prompt PROMPT [ ] initial prompt
|
--prompt PROMPT [ ] initial prompt
|
||||||
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
|
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
|
||||||
-oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
|
-oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
|
||||||
|
-dtw MODEL --dtw MODEL [ ] compute token-level timestamps
|
||||||
--host HOST, [127.0.0.1] Hostname/ip-address for the server
|
--host HOST, [127.0.0.1] Hostname/ip-address for the server
|
||||||
--port PORT, [8080 ] Port number for the server
|
--port PORT, [8080 ] Port number for the server
|
||||||
|
--public PATH, [examples/server/public] Path to the public folder
|
||||||
|
--request-path PATH, [ ] Request path for all requests
|
||||||
|
--inference-path PATH, [/inference] Inference path for all requests
|
||||||
--convert, [false ] Convert audio to WAV, requires ffmpeg on the server
|
--convert, [false ] Convert audio to WAV, requires ffmpeg on the server
|
||||||
|
-sns, --suppress-nst [false ] suppress non-speech tokens
|
||||||
|
-nth N, --no-speech-thold N [0.60 ] no speech threshold
|
||||||
|
-nc, --no-context [false ] do not use previous audio context
|
||||||
|
-ng, --no-gpu [false ] do not use gpu
|
||||||
|
-fa, --flash-attn [false ] flash attention
|
||||||
|
|
||||||
|
Voice Activity Detection (VAD) options:
|
||||||
|
--vad [false ] enable Voice Activity Detection (VAD)
|
||||||
|
-vm FNAME, --vad-model FNAME [ ] VAD model path
|
||||||
|
-vt N, --vad-threshold N [0.50 ] VAD threshold for speech recognition
|
||||||
|
-vspd N, --vad-min-speech-duration-ms N [250 ] VAD min speech duration in ms
|
||||||
|
-vsd N, --vad-min-silence-duration-ms N [100 ] VAD min silence duration (to split segments)
|
||||||
|
-vmsd N, --vad-max-speech-duration-s N [FLT_MAX] VAD max speech duration (auto-split longer)
|
||||||
|
-vp N, --vad-speech-pad-ms N [30 ] VAD speech padding (extend segments)
|
||||||
|
-vo N, --vad-samples-overlap N [0.10 ] VAD samples overlap (seconds between segments)
|
||||||
```
|
```
|
||||||
|
|
||||||
> [!WARNING]
|
> [!WARNING]
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
#include "httplib.h"
|
#include "httplib.h"
|
||||||
#include "json.hpp"
|
#include "json.hpp"
|
||||||
|
|
||||||
|
#include <cfloat>
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
@ -90,6 +91,16 @@ struct whisper_params {
|
|||||||
std::string openvino_encode_device = "CPU";
|
std::string openvino_encode_device = "CPU";
|
||||||
|
|
||||||
std::string dtw = "";
|
std::string dtw = "";
|
||||||
|
|
||||||
|
// Voice Activity Detection (VAD) parameters
|
||||||
|
bool vad = false;
|
||||||
|
std::string vad_model = "";
|
||||||
|
float vad_threshold = 0.5f;
|
||||||
|
int vad_min_speech_duration_ms = 250;
|
||||||
|
int vad_min_silence_duration_ms = 100;
|
||||||
|
float vad_max_speech_duration_s = FLT_MAX;
|
||||||
|
int vad_speech_pad_ms = 30;
|
||||||
|
float vad_samples_overlap = 0.1f;
|
||||||
};
|
};
|
||||||
|
|
||||||
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
|
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
|
||||||
@ -140,6 +151,18 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|||||||
fprintf(stderr, " -nc, --no-context [%-7s] do not use previous audio context\n", params.no_context ? "true" : "false");
|
fprintf(stderr, " -nc, --no-context [%-7s] do not use previous audio context\n", params.no_context ? "true" : "false");
|
||||||
fprintf(stderr, " -ng, --no-gpu [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true");
|
fprintf(stderr, " -ng, --no-gpu [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true");
|
||||||
fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
|
fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
|
||||||
|
// Voice Activity Detection (VAD) parameters
|
||||||
|
fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
|
||||||
|
fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
|
||||||
|
fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n", params.vad_model.c_str());
|
||||||
|
fprintf(stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold);
|
||||||
|
fprintf(stderr, "    -vspd N,   --vad-min-speech-duration-ms N   [%-7d] VAD min speech duration in ms\n", params.vad_min_speech_duration_ms);
|
||||||
|
fprintf(stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms);
|
||||||
|
fprintf(stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ?
|
||||||
|
std::string("FLT_MAX").c_str() :
|
||||||
|
std::to_string(params.vad_max_speech_duration_s).c_str());
|
||||||
|
fprintf(stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms);
|
||||||
|
fprintf(stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -195,6 +218,16 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
|
|||||||
else if ( arg == "--request-path") { sparams.request_path = argv[++i]; }
|
else if ( arg == "--request-path") { sparams.request_path = argv[++i]; }
|
||||||
else if ( arg == "--inference-path") { sparams.inference_path = argv[++i]; }
|
else if ( arg == "--inference-path") { sparams.inference_path = argv[++i]; }
|
||||||
else if ( arg == "--convert") { sparams.ffmpeg_converter = true; }
|
else if ( arg == "--convert") { sparams.ffmpeg_converter = true; }
|
||||||
|
|
||||||
|
// Voice Activity Detection (VAD)
|
||||||
|
else if ( arg == "--vad") { params.vad = true; }
|
||||||
|
else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = argv[++i]; }
|
||||||
|
else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(argv[++i]); }
|
||||||
|
else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
|
||||||
|
else if (arg == "-vsd"  || arg == "--vad-min-silence-duration-ms") { params.vad_min_silence_duration_ms = std::stoi(argv[++i]); }
|
||||||
|
else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(argv[++i]); }
|
||||||
|
else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(argv[++i]); }
|
||||||
|
else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(argv[++i]); }
|
||||||
else {
|
else {
|
||||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||||
whisper_print_usage(argc, argv, params, sparams);
|
whisper_print_usage(argc, argv, params, sparams);
|
||||||
@ -511,6 +544,34 @@ void get_req_parameters(const Request & req, whisper_params & params)
|
|||||||
{
|
{
|
||||||
params.no_context = parse_str_to_bool(req.get_file_value("no_context").content);
|
params.no_context = parse_str_to_bool(req.get_file_value("no_context").content);
|
||||||
}
|
}
|
||||||
|
if (req.has_file("vad"))
|
||||||
|
{
|
||||||
|
params.vad = parse_str_to_bool(req.get_file_value("vad").content);
|
||||||
|
}
|
||||||
|
if (req.has_file("vad_threshold"))
|
||||||
|
{
|
||||||
|
params.vad_threshold = std::stof(req.get_file_value("vad_threshold").content);
|
||||||
|
}
|
||||||
|
if (req.has_file("vad_min_speech_duration_ms"))
|
||||||
|
{
|
||||||
|
params.vad_min_speech_duration_ms = std::stoi(req.get_file_value("vad_min_speech_duration_ms").content);
|
||||||
|
}
|
||||||
|
if (req.has_file("vad_min_silence_duration_ms"))
|
||||||
|
{
|
||||||
|
params.vad_min_silence_duration_ms = std::stoi(req.get_file_value("vad_min_silence_duration_ms").content);
|
||||||
|
}
|
||||||
|
if (req.has_file("vad_max_speech_duration_s"))
|
||||||
|
{
|
||||||
|
params.vad_max_speech_duration_s = std::stof(req.get_file_value("vad_max_speech_duration_s").content);
|
||||||
|
}
|
||||||
|
if (req.has_file("vad_speech_pad_ms"))
|
||||||
|
{
|
||||||
|
params.vad_speech_pad_ms = std::stoi(req.get_file_value("vad_speech_pad_ms").content);
|
||||||
|
}
|
||||||
|
if (req.has_file("vad_samples_overlap"))
|
||||||
|
{
|
||||||
|
params.vad_samples_overlap = std::stof(req.get_file_value("vad_samples_overlap").content);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
@ -829,6 +890,16 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
wparams.suppress_nst = params.suppress_nst;
|
wparams.suppress_nst = params.suppress_nst;
|
||||||
|
|
||||||
|
wparams.vad = params.vad;
|
||||||
|
wparams.vad_model_path = params.vad_model.c_str();
|
||||||
|
|
||||||
|
wparams.vad_params.threshold = params.vad_threshold;
|
||||||
|
wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms;
|
||||||
|
wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
|
||||||
|
wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s;
|
||||||
|
wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms;
|
||||||
|
wparams.vad_params.samples_overlap = params.vad_samples_overlap;
|
||||||
|
|
||||||
whisper_print_user_data user_data = { ¶ms, &pcmf32s, 0 };
|
whisper_print_user_data user_data = { ¶ms, &pcmf32s, 0 };
|
||||||
|
|
||||||
// this callback is called on each new segment
|
// this callback is called on each new segment
|
||||||
|
Reference in New Issue
Block a user