diff --git a/examples/server/README.md b/examples/server/README.md
index c8e2f714..ffba5f4e 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -23,6 +23,7 @@ options:
   -sow,      --split-on-word         [false  ] split on word rather than on token
   -bo N,     --best-of N             [2      ] number of best candidates to keep
   -bs N,     --beam-size N           [-1     ] beam size for beam search
+  -ac N,     --audio-ctx N           [0      ] audio context size (0 - all)
   -wt N,     --word-thold N          [0.01   ] word timestamp probability threshold
   -et N,     --entropy-thold N       [2.40   ] entropy threshold for decoder fail
   -lpt N,    --logprob-thold N       [-1.00  ] log probability threshold for decoder fail
@@ -41,9 +42,28 @@ options:
              --prompt PROMPT         [       ] initial prompt
   -m FNAME,  --model FNAME           [models/ggml-base.en.bin] model path
   -oved D,   --ov-e-device DNAME     [CPU    ] the OpenVINO device used for encode inference
+  -dtw MODEL --dtw MODEL             [       ] compute token-level timestamps
   --host HOST,                       [127.0.0.1] Hostname/ip-adress for the server
   --port PORT,                       [8080   ] Port number for the server
+  --public PATH,                     [examples/server/public] Path to the public folder
+  --request-path PATH,               [       ] Request path for all requests
+  --inference-path PATH,             [/inference] Inference path for all requests
   --convert,                         [false  ] Convert audio to WAV, requires ffmpeg on the server
+  -sns,      --suppress-nst          [false  ] suppress non-speech tokens
+  -nth N,    --no-speech-thold N     [0.60   ] no speech threshold
+  -nc,       --no-context            [false  ] do not use previous audio context
+  -ng,       --no-gpu                [false  ] do not use gpu
+  -fa,       --flash-attn            [false  ] flash attention
+
+Voice Activity Detection (VAD) options:
+             --vad                            [false  ] enable Voice Activity Detection (VAD)
+  -vm FNAME, --vad-model FNAME                [       ] VAD model path
+  -vt N,     --vad-threshold N                [0.50   ] VAD threshold for speech recognition
+  -vspd N,   --vad-min-speech-duration-ms  N  [250    ] VAD min speech duration in ms
+  -vsd N,    --vad-min-silence-duration-ms N  [100    ] VAD min silence duration (to split segments)
+  -vmsd N,   --vad-max-speech-duration-s   N  [FLT_MAX] VAD max speech duration (auto-split longer)
+  -vp N,     --vad-speech-pad-ms N            [30     ] VAD speech padding (extend segments)
+  -vo N,     --vad-samples-overlap N          [0.10   ] VAD samples overlap (seconds between segments)
 ```

 > [!WARNING]
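For a quick check of the new options, a VAD-enabled launch of the server looks roughly like the sketch below. The binary path, the transcription model, and the Silero VAD model file are placeholders rather than part of this patch; substitute whatever your build and model downloads provide.

```sh
# Start the server with VAD enabled.
# All paths and the VAD model name below are examples only.
./build/bin/whisper-server \
    -m models/ggml-base.en.bin \
    --vad \
    -vm models/ggml-silero-v5.1.2.bin \
    -vt 0.5 \
    --host 127.0.0.1 --port 8080
```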
"true" : "false"); + // Voice Activity Detection (VAD) parameters + fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n"); + fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false"); + fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n", params.vad_model.c_str()); + fprintf(stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold); + fprintf(stderr, " -vspd N, --vad-min-speech-duration-ms N [%-7d] VAD min speech duration (0.0-1.0)\n", params.vad_min_speech_duration_ms); + fprintf(stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms); + fprintf(stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ? + std::string("FLT_MAX").c_str() : + std::to_string(params.vad_max_speech_duration_s).c_str()); + fprintf(stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms); + fprintf(stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap); fprintf(stderr, "\n"); } @@ -195,6 +218,16 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve else if ( arg == "--request-path") { sparams.request_path = argv[++i]; } else if ( arg == "--inference-path") { sparams.inference_path = argv[++i]; } else if ( arg == "--convert") { sparams.ffmpeg_converter = true; } + + // Voice Activity Detection (VAD) + else if ( arg == "--vad") { params.vad = true; } + else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = argv[++i]; } + else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(argv[++i]); } + else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); } + else if (arg == "-vsd" || arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); } + else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(argv[++i]); } + else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(argv[++i]); } + else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(argv[++i]); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); whisper_print_usage(argc, argv, params, sparams); @@ -511,6 +544,34 @@ void get_req_parameters(const Request & req, whisper_params & params) { params.no_context = parse_str_to_bool(req.get_file_value("no_context").content); } + if (req.has_file("vad")) + { + params.vad = parse_str_to_bool(req.get_file_value("vad").content); + } + if (req.has_file("vad_threshold")) + { + params.vad_threshold = std::stof(req.get_file_value("vad_threshold").content); + } + if (req.has_file("vad_min_speech_duration_ms")) + { + params.vad_min_speech_duration_ms = std::stof(req.get_file_value("vad_min_speech_duration_ms").content); + } + if (req.has_file("vad_min_silence_duration_ms")) + { + params.vad_min_silence_duration_ms = std::stof(req.get_file_value("vad_min_silence_duration_ms").content); + } + if (req.has_file("vad_max_speech_duration_s")) + { + params.vad_max_speech_duration_s = std::stof(req.get_file_value("vad_max_speech_duration_s").content); + } + if 
@@ -829,6 +890,16 @@ int main(int argc, char ** argv) {

             wparams.suppress_nst = params.suppress_nst;

+            wparams.vad            = params.vad;
+            wparams.vad_model_path = params.vad_model.c_str();
+
+            wparams.vad_params.threshold               = params.vad_threshold;
+            wparams.vad_params.min_speech_duration_ms  = params.vad_min_speech_duration_ms;
+            wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
+            wparams.vad_params.max_speech_duration_s   = params.vad_max_speech_duration_s;
+            wparams.vad_params.speech_pad_ms           = params.vad_speech_pad_ms;
+            wparams.vad_params.samples_overlap         = params.vad_samples_overlap;
+
             whisper_print_user_data user_data = { &params, &pcmf32s, 0 };

             // this callback is called on each new segment
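Putting both halves together, an end-to-end smoke test of the patch could look like the following sketch; the binary and model paths are assumptions carried over from the examples above, and the fixed `sleep` is a crude stand-in for polling the port until the model has loaded.

```sh
# Launch with VAD on by default, then send one request that overrides
# the threshold; only detected speech segments should be transcribed.
./build/bin/whisper-server -m models/ggml-base.en.bin \
    --vad -vm models/ggml-silero-v5.1.2.bin &
SERVER_PID=$!
sleep 5  # crude wait for model load

curl 127.0.0.1:8080/inference \
    -F file="@samples/jfk.wav" \
    -F vad_threshold="0.6"

kill $SERVER_PID
```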