mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-06-13 04:28:07 +00:00
whisper : support speaker segmentation (local diarization) of mono audio via tinydiarize (#1058)
* add HuggingFace mirror to download ggml model * support tdrz via simple hack overriding solm tokens * fix incorrect translate/transcribe token_ids that are not static const * add apollo 13 sample for tdrz demo * render [SPEAKER TURN] consistently in all terminal output using vocab.id_to_token * extend whisper_segment with speaker_turn_next field and save in json output * fix failing go build * slipped in some python syntax whoops * whisper : finalize tinydiarize support (add flag + fixes) * whisper : tdrz support for word-level timestamps (respect max_len) * java : try to fix tests after adding tdrz_enable flag * main : remove TODO leftover * java : fix params order list after adding "tdrz_enable" * whisper : fix solm and add nosp token * main : print tinydiarize help --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
@ -224,8 +224,8 @@ public interface WhisperCppJnaLibrary extends Library {
|
||||
int whisper_token_lang(Pointer ctx, int lang_id);
|
||||
|
||||
// Task tokens
|
||||
int whisper_token_translate();
|
||||
int whisper_token_transcribe();
|
||||
int whisper_token_translate (Pointer ctx);
|
||||
int whisper_token_transcribe(Pointer ctx);
|
||||
|
||||
// Performance information from the default state.
|
||||
void whisper_print_timings(Pointer ctx);
|
||||
|
@ -137,6 +137,14 @@ public class WhisperFullParams extends Structure {
|
||||
/** Overwrite the audio context size (0 = use default). */
|
||||
public int audio_ctx;
|
||||
|
||||
/** Enable tinydiarize (default = false) */
|
||||
public CBool tdrz_enable;
|
||||
|
||||
/** Enable tinydiarize (default = false) */
|
||||
public void tdrzEnable(boolean enable) {
|
||||
tdrz_enable = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Tokens to provide to the whisper decoder as an initial prompt.
|
||||
* These are prepended to any existing text context from a previous call. */
|
||||
public String initial_prompt;
|
||||
@ -302,7 +310,7 @@ public class WhisperFullParams extends Structure {
|
||||
"no_context", "single_segment",
|
||||
"print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps",
|
||||
"thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "speed_up", "audio_ctx",
|
||||
"initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
|
||||
"tdrz_enable", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
|
||||
"suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",
|
||||
"temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
|
||||
"new_segment_callback", "new_segment_callback_user_data",
|
||||
|
Reference in New Issue
Block a user