mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-06-12 20:18:08 +00:00
whisper : token-level timestamps with DTW (#1485)
* whisper.cpp: impl dtw algo * WIP: producing and placing DTW timestamps on tokens * Fix compile and assertion errors. Attempt to DTW timestamp with single_segment=false. * Fix mistake causing incorrect alignment of dtw timestamps * implement N_TOP_MOST and CUSTOM alignment heads setting * whisper: fix typo on alignment heads enum * Fix issues related to changes in whisper.cpp * Fixed excessive memory use when using DTW timestamps. Other minor fixes to DTW timestamping function * decoder: save cross QKs only if requested * Calling median filter with ggml_map_custom1 * Reimpl aheads n_top_most and custom. Sanity checks on chosen aheads * Copying cross QKs from decoder backend correctly * dtw: cleanup * Fix incorrect n_frames passed to dtw when near end of audio * Fix aheads_masks_init for backend != CPU * whisper : minor style * main : add dtw (wip) * whisper: fix invalid memory access in aheads_masks_init * main : add dtw (cont) * whisper : minor --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
@ -26,17 +26,17 @@ void replace_all(std::string & s, const std::string & search, const std::string
|
||||
|
||||
// command-line parameters
|
||||
struct whisper_params {
|
||||
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||
int32_t n_processors = 1;
|
||||
int32_t offset_t_ms = 0;
|
||||
int32_t offset_n = 0;
|
||||
int32_t duration_ms = 0;
|
||||
int32_t progress_step = 5;
|
||||
int32_t max_context = -1;
|
||||
int32_t max_len = 0;
|
||||
int32_t best_of = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
|
||||
int32_t beam_size = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
|
||||
int32_t audio_ctx = 0;
|
||||
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||
int32_t n_processors = 1;
|
||||
int32_t offset_t_ms = 0;
|
||||
int32_t offset_n = 0;
|
||||
int32_t duration_ms = 0;
|
||||
int32_t progress_step = 5;
|
||||
int32_t max_context = -1;
|
||||
int32_t max_len = 0;
|
||||
int32_t best_of = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
|
||||
int32_t beam_size = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
|
||||
int32_t audio_ctx = 0;
|
||||
|
||||
float word_thold = 0.01f;
|
||||
float entropy_thold = 2.40f;
|
||||
@ -76,6 +76,8 @@ struct whisper_params {
|
||||
|
||||
std::string openvino_encode_device = "CPU";
|
||||
|
||||
std::string dtw = "";
|
||||
|
||||
std::vector<std::string> fname_inp = {};
|
||||
std::vector<std::string> fname_out = {};
|
||||
};
|
||||
@ -149,6 +151,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
||||
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
|
||||
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
|
||||
else if (arg == "-dtw" || arg == "--dtw") { params.dtw = argv[++i]; }
|
||||
else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
|
||||
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
|
||||
else {
|
||||
@ -208,6 +211,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
|
||||
fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
|
||||
fprintf(stderr, " -dtw MODEL --dtw MODEL [%-7s] compute token-level timestamps\n", params.dtw.c_str());
|
||||
fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false");
|
||||
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
|
||||
fprintf(stderr, "\n");
|
||||
@ -649,7 +653,8 @@ bool output_json(
|
||||
times_o(token.t0, token.t1, false);
|
||||
}
|
||||
value_i("id", token.id, false);
|
||||
value_f("p", token.p, true);
|
||||
value_f("p", token.p, false);
|
||||
value_f("t_dtw", token.t_dtw, true);
|
||||
end_obj(j == (n - 1));
|
||||
}
|
||||
end_arr(!params.diarize && !params.tinydiarize);
|
||||
@ -889,6 +894,28 @@ int main(int argc, char ** argv) {
|
||||
struct whisper_context_params cparams = whisper_context_default_params();
|
||||
cparams.use_gpu = params.use_gpu;
|
||||
|
||||
if (!params.dtw.empty()) {
|
||||
cparams.dtw_token_timestamps = true;
|
||||
cparams.dtw_aheads_preset = WHISPER_AHEADS_NONE;
|
||||
|
||||
if (params.dtw == "tiny") cparams.dtw_aheads_preset = WHISPER_AHEADS_TINY;
|
||||
if (params.dtw == "tiny.en") cparams.dtw_aheads_preset = WHISPER_AHEADS_TINY_EN;
|
||||
if (params.dtw == "base") cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE;
|
||||
if (params.dtw == "base.en") cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE_EN;
|
||||
if (params.dtw == "small") cparams.dtw_aheads_preset = WHISPER_AHEADS_SMALL;
|
||||
if (params.dtw == "small.en") cparams.dtw_aheads_preset = WHISPER_AHEADS_SMALL_EN;
|
||||
if (params.dtw == "medium") cparams.dtw_aheads_preset = WHISPER_AHEADS_MEDIUM;
|
||||
if (params.dtw == "medium.en") cparams.dtw_aheads_preset = WHISPER_AHEADS_MEDIUM_EN;
|
||||
if (params.dtw == "large.v1") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V1;
|
||||
if (params.dtw == "large.v2") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V2;
|
||||
if (params.dtw == "large.v3") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3;
|
||||
|
||||
if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) {
|
||||
fprintf(stderr, "error: unknown DTW preset '%s'\n", params.dtw.c_str());
|
||||
return 3;
|
||||
}
|
||||
}
|
||||
|
||||
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
|
||||
|
||||
if (ctx == nullptr) {
|
||||
|
Reference in New Issue
Block a user