whisper : add single-timestamp logic (#2629)

* Fix hallucinations during silence

When the predicted tokens end with a single timestamp, the entire 30-second segment should be considered done, to avoid hallucinations in the remaining part of the segment.
This behaviour matches OpenAI's Whisper. Refer to the logic around `single_timestamp_ending` in https://github.com/openai/whisper/blob/main/whisper/transcribe.py

* Accept review comments related to formatting.

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Karthick 2024-12-17 22:37:08 +05:30 committed by GitHub
parent 09a1b61218
commit 2f2841bfce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -6060,7 +6060,7 @@ int whisper_full_with_state(
{
const auto & best_decoder = state->decoders[best_decoder_id];
const auto seek_delta = best_decoder.seek_delta;
auto seek_delta = best_decoder.seek_delta;
const auto result_len = best_decoder.sequence.result_len;
const auto & tokens_cur = best_decoder.sequence.tokens;
@ -6201,6 +6201,15 @@ int whisper_full_with_state(
}
}
// ref: https://github.com/ggerganov/whisper.cpp/pull/2629
const bool single_timestamp_ending = tokens_cur.size() > 1 &&
tokens_cur[tokens_cur.size() - 2].id < whisper_token_beg(ctx) &&
tokens_cur[tokens_cur.size() - 1].id > whisper_token_beg(ctx);
if (single_timestamp_ending) {
WHISPER_LOG_DEBUG("single timestamp ending - skip entire chunk\n");
seek_delta = std::min(seek_end - seek, WHISPER_CHUNK_SIZE * 100);
}
// update audio window
seek += seek_delta;