server : add no-speech threshold parameter and functionality (#2654)

2025-06-18 14:58:09 +00:00 · 2024-12-21 16:00:08 +01:00
parent f4668169a0
commit 4183517076
3 changed files with 15 additions and 3 deletions
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -428,6 +428,7 @@ struct whisper_segment {
    int64_t t1;

    std::string text;
+    float no_speech_prob;

    std::vector<whisper_token_data> tokens;

@@ -6147,7 +6148,7 @@ int whisper_full_with_state(

                            //printf("tt0 = %d, tt1 = %d, text = %s, token = %s, token_id = %d, tid = %d\n", tt0, tt1, text.c_str(), ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].id, tokens_cur[i].tid);

-                            result_all.push_back({ tt0, tt1, text, {}, speaker_turn_next });
+                            result_all.push_back({ tt0, tt1, text, state->no_speech_prob, {}, speaker_turn_next });
                            for (int j = i0; j <= i; j++) {
                                result_all.back().tokens.push_back(tokens_cur[j]);
                            }
@@ -6192,7 +6193,7 @@ int whisper_full_with_state(
                        }
                    }

-                    result_all.push_back({ tt0, tt1, text, {} , speaker_turn_next });
+                    result_all.push_back({ tt0, tt1, text, state->no_speech_prob, {}, speaker_turn_next });
                    for (int j = i0; j < (int) tokens_cur.size(); j++) {
                        result_all.back().tokens.push_back(tokens_cur[j]);
                    }
@@ -6459,6 +6460,10 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
    return ctx->state->result_all[i_segment].tokens[i_token].p;
 }

+float whisper_full_get_segment_no_speech_prob(struct whisper_context * ctx, int i_segment) { // getter: no_speech_prob stored on segment i_segment of ctx's state
+    return ctx->state->result_all[i_segment].no_speech_prob; // i_segment indexes result_all as-is (no range check here); field is filled from state->no_speech_prob where whisper_full_with_state pushes segments
+}
+
 // =================================================================================================

 //