whisper : reduce delta_min from 1000ms to 100ms

ggml-ci
This commit is contained in:
Georgi Gerganov 2025-04-10 12:25:29 +03:00
parent 43f5030aeb
commit becd0c888e
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735

View File

@@ -5527,11 +5527,13 @@ int whisper_full_with_state(
const int seek_start = params.offset_ms/10; const int seek_start = params.offset_ms/10;
const int seek_end = params.duration_ms == 0 ? whisper_n_len_from_state(state) : seek_start + params.duration_ms/10; const int seek_end = params.duration_ms == 0 ? whisper_n_len_from_state(state) : seek_start + params.duration_ms/10;
// if length of spectrogram is less than 1.0s (100 frames), then return // if length of spectrogram is less than 100ms (10 frames), then return
// basically don't process anything that is less than 1.0s // basically don't process anything that is less than 100ms
// see issue #39: https://github.com/ggml-org/whisper.cpp/issues/39 // ref: https://github.com/ggml-org/whisper.cpp/issues/2065
if (seek_end < seek_start + 100) { const int delta_min = 10;
WHISPER_LOG_WARN("%s: input is too short - %d ms < 1000 ms. consider padding the input audio with silence\n", __func__, (seek_end - seek_start)*10);
if (seek_end < seek_start + delta_min) {
WHISPER_LOG_WARN("%s: input is too short - %d ms < 100 ms. consider padding the input audio with silence\n", __func__, (seek_end - seek_start)*10);
return 0; return 0;
} }
@@ -5675,8 +5677,8 @@ int whisper_full_with_state(
ctx, state, progress_cur, params.progress_callback_user_data); ctx, state, progress_cur, params.progress_callback_user_data);
} }
// if only 1 second left, then stop // if only 100ms left, then stop
if (seek + 100 >= seek_end) { if (seek + delta_min >= seek_end) {
break; break;
} }
@@ -6023,10 +6025,10 @@ int whisper_full_with_state(
// end of segment // end of segment
if (token.id == whisper_token_eot(ctx) || // end of text token if (token.id == whisper_token_eot(ctx) || // end of text token
(params.max_tokens > 0 && i >= params.max_tokens) || // max tokens per segment reached (params.max_tokens > 0 && i >= params.max_tokens) || // max tokens per segment reached
(has_ts && seek + seek_delta + 100 >= seek_end) // end of audio reached (has_ts && seek + seek_delta + delta_min >= seek_end) // end of audio reached (100ms)
) { ) {
if (result_len == 0 && !params.no_timestamps) { if (result_len == 0 && !params.no_timestamps) {
if (seek + seek_delta + 100 >= seek_end) { if (seek + seek_delta + delta_min >= seek_end) {
result_len = i + 1; result_len = i + 1;
} else { } else {
WHISPER_LOG_DEBUG("%s: decoder %d failed (result_len = 0)\n", __func__, j); WHISPER_LOG_DEBUG("%s: decoder %d failed (result_len = 0)\n", __func__, j);