From d14823582d0ff89c219ad5c76d163fce55551d81 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 17 Oct 2022 23:52:24 +0300 Subject: [PATCH] Try to improve the sampling strategy a bit It still fails sometimes when it does not sample a timestamp token for the entire segment. We now print a message in such cases. --- whisper.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index fab7071f..98852781 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2425,7 +2425,7 @@ int whisper_full( whisper_token id = 0; whisper_token tid = whisper_token_beg(ctx); - id = whisper_sample_best(ctx, result_len == 0); + id = whisper_sample_best(ctx, result_len == 0 || i > 32); if (i > 0) { tid = whisper_sample_timestamp(ctx); } @@ -2445,7 +2445,9 @@ int whisper_full( // end of text token if (id == whisper_token_eot(ctx)) { if (result_len == 0) { - result_len = i + 1; + // TODO: figure out how to resolve this + fprintf(stderr, "\n%s: failed to generate timestamp token - this should not happen\n\n", __func__); + //result_len = i + 1; } break; }