From 6a7c82501e3794724ba80bfb9a983810af036803 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 16 Dec 2022 18:31:17 +0200 Subject: [PATCH] whisper : improve decoding strategy (#244) - Clear past prompt when there is very short audio left for processing. My observation is that in these cases the decoding tends to repeat and hallucinate stuff and I think this is induced by the existing prompt. - When we fail to sample a timestamp token, retry by clearing the past prompt. If it fails again, then we advance the window by 1 second --- whisper.cpp | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index 1bc79967..da35456a 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2650,10 +2650,17 @@ int whisper_full( } } + // if only 1 second left, then stop if (seek + 100 >= seek_end) { break; } + // if there is a very short audio segment left to process, we remove any past prompt since it tends + // to confuse the decoder and often make it repeat or hallucinate stuff + if (seek > seek_start && seek + 500 >= seek_end) { + prompt_past.clear(); + } + if (params.encoder_begin_callback) { if (params.encoder_begin_callback(ctx, params.encoder_begin_callback_user_data) == false) { fprintf(stderr, "%s: encoder_begin_callback returned false - aborting\n", __func__); @@ -2780,8 +2787,14 @@ int whisper_full( } if (failed) { - fprintf(stderr, "\n%s: failed to generate timestamp token - using fallback strategy\n\n", __func__); - seek += 100; + // when we fail to sample a timestamp token, retry by clearing the past prompt + // if it fails again, then we advance the window by 1 second + if (prompt_past.size() > 0) { + prompt_past.clear(); + } else { + fprintf(stderr, "\n%s: failed to generate timestamp token - skipping one second\n\n", __func__); + seek += 100; + } continue; }