whisper : add single-timestamp logic (#2629)

* Fix hallucinations during silence. When the predicted tokens end with a single timestamp, the entire 30-second segment should be considered done, to avoid hallucinations for the remaining part of the segment. This behaviour is on par with OpenAI's whisper. Refer to the logic related to `single_timestamp_ending` in https://github.com/openai/whisper/blob/main/whisper/transcribe.py * Accept review comments related to formatting. Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-06-18 14:58:09 +00:00 · 2024-12-17 22:37:08 +05:30
parent 09a1b61218
commit 2f2841bfce
1 changed files with 10 additions and 1 deletions
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -6060,7 +6060,7 @@ int whisper_full_with_state(
        {
            const auto & best_decoder = state->decoders[best_decoder_id];

-            const auto seek_delta = best_decoder.seek_delta;
+            auto seek_delta = best_decoder.seek_delta;
            const auto result_len = best_decoder.sequence.result_len;

            const auto & tokens_cur = best_decoder.sequence.tokens;
@@ -6201,6 +6201,15 @@ int whisper_full_with_state(
                }
            }

+            // ref: https://github.com/ggerganov/whisper.cpp/pull/2629
+            const bool single_timestamp_ending = tokens_cur.size() > 1 &&
+                tokens_cur[tokens_cur.size() - 2].id < whisper_token_beg(ctx) &&
+                tokens_cur[tokens_cur.size() - 1].id > whisper_token_beg(ctx);
+            if (single_timestamp_ending) {
+                WHISPER_LOG_DEBUG("single timestamp ending - skip entire chunk\n");
+                seek_delta = std::min(seek_end - seek, WHISPER_CHUNK_SIZE * 100);
+            }
+
            // update audio window
            seek += seek_delta;