2022-10-04 20:35:01 +03:00
# ifndef WHISPER_H
# define WHISPER_H
2023-11-12 15:31:08 +02:00
# include "ggml.h"
2023-01-08 19:03:33 +08:00
# include <stddef.h>
2022-10-04 22:43:37 +03:00
# include <stdint.h>
2022-10-10 08:11:18 +03:00
# include <stdbool.h>
2022-10-04 22:43:37 +03:00
2023-11-06 17:04:24 +08:00
// WHISPER_DEPRECATED(func, hint): wraps the declaration `func` so that supporting
// compilers attach a deprecation marker carrying the message `hint`.
//   - __GNUC__ defined : trailing __attribute__((deprecated(hint)))
//   - _MSC_VER defined : leading  __declspec(deprecated(hint))
//   - otherwise        : `func` is emitted unchanged (no diagnostic)
# ifdef __GNUC__
# define WHISPER_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
# elif defined(_MSC_VER)
# define WHISPER_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
# else
# define WHISPER_DEPRECATED(func, hint) func
# endif
2022-10-04 20:35:01 +03:00
// WHISPER_API: decoration placed in front of the public function declarations.
//   - WHISPER_SHARED + _WIN32 : __declspec(dllexport) if WHISPER_BUILD is defined,
//                               __declspec(dllimport) otherwise
//                               (presumably WHISPER_BUILD is set only while building the library itself)
//   - WHISPER_SHARED elsewhere: __attribute__((visibility("default")))
//   - WHISPER_SHARED undefined: expands to nothing
# ifdef WHISPER_SHARED
# ifdef _WIN32
# ifdef WHISPER_BUILD
# define WHISPER_API __declspec(dllexport)
# else
# define WHISPER_API __declspec(dllimport)
# endif
# else
# define WHISPER_API __attribute__ ((visibility ("default")))
# endif
# else
# define WHISPER_API
# endif
2022-10-04 22:43:37 +03:00
// Audio front-end constants.
// NOTE(review): the units (Hz / samples / seconds) are inferred from the names and
// from how the derived macros combine them - confirm against the implementation.
#define WHISPER_SAMPLE_RATE 16000
#define WHISPER_N_FFT       400
#define WHISPER_N_FFT_HALF  (WHISPER_N_FFT / 2 + 1)                      // 201
#define WHISPER_HOP_LENGTH  160
#define WHISPER_CHUNK_SIZE  30
#define WHISPER_N_SAMPLES   (WHISPER_SAMPLE_RATE * WHISPER_CHUNK_SIZE)   // 480000
2022-10-04 22:43:37 +03:00
2022-10-04 20:35:01 +03:00
// Give every declaration that follows C linkage when this header is included from C++.
// The linkage string must be exactly "C" (no surrounding spaces).
#ifdef __cplusplus
extern "C" {
#endif
//
// C interface
//
2022-10-18 18:27:57 +03:00
// The following interface is thread-safe as long as the same whisper_context is not used by multiple threads
// concurrently.
2022-10-08 18:09:56 +03:00
//
// Basic usage:
//
// #include "whisper.h"
//
// ...
//
2023-11-20 20:52:27 +02:00
// whisper_context_params cparams = whisper_context_default_params();
2023-11-24 13:13:12 +02:00
//
2023-11-20 20:52:27 +02:00
// struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
2022-10-08 18:09:56 +03:00
//
// if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
// fprintf(stderr, "failed to process audio\n");
// return 7;
// }
//
// const int n_segments = whisper_full_n_segments(ctx);
// for (int i = 0; i < n_segments; ++i) {
// const char * text = whisper_full_get_segment_text(ctx, i);
// printf("%s", text);
// }
//
// whisper_free(ctx);
//
// ...
//
// This is a demonstration of the most straightforward usage of the library.
// "pcmf32" contains the RAW audio data in 32-bit floating point format.
//
// The interface also allows for more fine-grained control over the computation, but it requires a deeper
// understanding of how the model works.
//
2022-10-04 20:35:01 +03:00
// Forward declarations so the types can be referred to through pointers
// before (or without) a full definition being visible.
struct whisper_context;
struct whisper_state;
struct whisper_full_params;
2022-10-04 20:35:01 +03:00
2023-11-15 16:12:52 +02:00
// Integer aliases used by the API; all three are 32-bit signed integers.
typedef int32_t whisper_pos ;
typedef int32_t whisper_token ; // token id (e.g. whisper_token_data::id)
typedef int32_t whisper_seq_id ;
2022-10-04 20:35:01 +03:00
2024-03-20 13:25:26 -03:00
// Selects the set of alignment heads used for the [EXPERIMENTAL] DTW token-level
// timestamps (see whisper_context_params::dtw_aheads_preset).
// NOTE(review): the *_TINY .. *_LARGE_V3 values presumably name per-model presets,
// and CUSTOM presumably means "use whisper_context_params::dtw_aheads" - confirm.
enum whisper_alignment_heads_preset {
WHISPER_AHEADS_NONE ,
WHISPER_AHEADS_N_TOP_MOST , // All heads from the N-top-most text-layers
WHISPER_AHEADS_CUSTOM ,
WHISPER_AHEADS_TINY_EN ,
WHISPER_AHEADS_TINY ,
WHISPER_AHEADS_BASE_EN ,
WHISPER_AHEADS_BASE ,
WHISPER_AHEADS_SMALL_EN ,
WHISPER_AHEADS_SMALL ,
WHISPER_AHEADS_MEDIUM_EN ,
WHISPER_AHEADS_MEDIUM ,
WHISPER_AHEADS_LARGE_V1 ,
WHISPER_AHEADS_LARGE_V2 ,
WHISPER_AHEADS_LARGE_V3 ,
} ;
// A single alignment head.
// NOTE(review): the two fields look like (text layer index, head index within that
// layer) rather than counts - confirm against the implementation.
typedef struct whisper_ahead {
int n_text_layer ;
int n_head ;
} whisper_ahead ;
// A list of alignment heads: `heads` points at `n_heads` whisper_ahead entries.
// NOTE(review): the array is const and presumably owned by the caller - confirm.
typedef struct whisper_aheads {
size_t n_heads ;
const whisper_ahead * heads ;
} whisper_aheads ;
2023-11-06 17:04:24 +08:00
struct whisper_context_params {
bool use_gpu ;
2024-05-15 09:38:19 +03:00
bool flash_attn ;
2024-02-09 17:27:47 +02:00
int gpu_device ; // CUDA device
2024-03-20 13:25:26 -03:00
// [EXPERIMENTAL] Token-level timestamps with DTW
bool dtw_token_timestamps ;
enum whisper_alignment_heads_preset dtw_aheads_preset ;
int dtw_n_top ;
struct whisper_aheads dtw_aheads ;
size_t dtw_mem_size ; // TODO: remove
2023-11-06 17:04:24 +08:00
} ;
2022-11-02 21:18:20 +02:00
typedef struct whisper_token_data {
2022-10-29 09:42:14 +03:00
whisper_token id ; // token id
whisper_token tid ; // forced timestamp token id
2022-11-26 17:28:28 +02:00
float p ; // probability of the token
2023-01-15 11:29:57 +02:00
float plog ; // log probability of the token
2022-11-26 17:28:28 +02:00
float pt ; // probability of the timestamp token
float ptsum ; // sum of probabilities of all timestamp tokens
2022-11-02 21:18:20 +02:00
// token-level timestamp data
// do not use if you haven't computed token-level timestamps
2022-11-26 17:28:28 +02:00
int64_t t0 ; // start time of the token
int64_t t1 ; // end time of the token
2022-11-02 21:18:20 +02:00
2024-03-20 13:25:26 -03:00
// [EXPERIMENTAL] Token-level timestamps with DTW
// do not use if you haven't computed token-level timestamps with dtw
// Roughly corresponds to the moment in audio in which the token was output
int64_t t_dtw ;
2022-11-26 17:28:28 +02:00
float vlen ; // voice length of the token
2022-11-02 21:18:20 +02:00
} whisper_token_data ;
2022-10-29 09:42:14 +03:00
2023-01-08 19:03:33 +08:00
// Caller-supplied callbacks that let the whisper_init*() functions taking a
// `struct whisper_model_loader *` obtain model data from an arbitrary source.
// NOTE(review): `context` is presumably the value handed back as `ctx` to each
// callback, and `read` presumably returns the number of bytes written to `output` - confirm.
typedef struct whisper_model_loader {
void * context ;
size_t ( * read ) ( void * ctx , void * output , size_t read_size ) ;
bool ( * eof ) ( void * ctx ) ;
void ( * close ) ( void * ctx ) ;
} whisper_model_loader ;
2023-11-13 03:51:34 -05:00
// grammar element type
enum whisper_gretype {
// end of rule definition
WHISPER_GRETYPE_END = 0 ,
// start of alternate definition for rule
WHISPER_GRETYPE_ALT = 1 ,
// non-terminal element: reference to rule
WHISPER_GRETYPE_RULE_REF = 2 ,
// terminal element: character (code point)
WHISPER_GRETYPE_CHAR = 3 ,
// inverse char(s) ([^a], [^a-b], [^abc])
WHISPER_GRETYPE_CHAR_NOT = 4 ,
// modifies a preceding WHISPER_GRETYPE_CHAR or WHISPER_GRETYPE_CHAR_ALT to
// be an inclusive range ([a-z])
WHISPER_GRETYPE_CHAR_RNG_UPPER = 5 ,
// modifies a preceding WHISPER_GRETYPE_CHAR or
// WHISPER_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
WHISPER_GRETYPE_CHAR_ALT = 6 ,
} ;
// One element of a grammar rule: a type tag plus the payload it applies to.
typedef struct whisper_grammar_element {
enum whisper_gretype type ;
uint32_t value ; // Unicode code point or rule ID
} whisper_grammar_element ;
2023-01-08 12:35:56 +02:00
// Various functions for loading a ggml whisper model.
// Allocate (almost) all memory needed for the model.
// Return NULL on failure
WHISPER_API struct whisper_context * whisper_init_from_file_with_params  (const char * path_model, struct whisper_context_params params);
WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params);
WHISPER_API struct whisper_context * whisper_init_with_params            (struct whisper_model_loader * loader, struct whisper_context_params params);
2022-10-08 18:09:56 +03:00
2023-03-05 20:42:19 +01:00
// These are the same as the above, but the internal state of the context is not allocated automatically
// It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state  (const char * path_model, struct whisper_context_params params);
WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params);
WHISPER_API struct whisper_context * whisper_init_with_params_no_state            (struct whisper_model_loader * loader, struct whisper_context_params params);
2023-11-06 17:04:24 +08:00
// Deprecated initializers that take no whisper_context_params argument.
// Each deprecation hint names the *_with_params function that replaces it.
WHISPER_DEPRECATED (
WHISPER_API struct whisper_context * whisper_init_from_file ( const char * path_model ) ,
" use whisper_init_from_file_with_params instead "
) ;
WHISPER_DEPRECATED (
WHISPER_API struct whisper_context * whisper_init_from_buffer ( void * buffer , size_t buffer_size ) ,
" use whisper_init_from_buffer_with_params instead "
) ;
WHISPER_DEPRECATED (
WHISPER_API struct whisper_context * whisper_init ( struct whisper_model_loader * loader ) ,
" use whisper_init_with_params instead "
) ;
WHISPER_DEPRECATED (
WHISPER_API struct whisper_context * whisper_init_from_file_no_state ( const char * path_model ) ,
" use whisper_init_from_file_with_params_no_state instead "
) ;
WHISPER_DEPRECATED (
WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state ( void * buffer , size_t buffer_size ) ,
" use whisper_init_from_buffer_with_params_no_state instead "
) ;
WHISPER_DEPRECATED (
WHISPER_API struct whisper_context * whisper_init_no_state ( struct whisper_model_loader * loader ) ,
" use whisper_init_with_params_no_state instead "
) ;
2023-03-05 20:42:19 +01:00
// Allocate a whisper_state for the given context. Needed when the context was
// created with one of the *_no_state initializers.
// NOTE(review): presumably released with whisper_free_state() - confirm.
WHISPER_API struct whisper_state * whisper_init_state ( struct whisper_context * ctx ) ;
2023-07-04 08:56:11 -04:00
// Given a context, enable use of OpenVINO for encode inference.
// model_path: Optional path to OpenVINO encoder IR model. If set to nullptr,
// the path will be generated from the ggml model path that was passed
// in to whisper_init_from_file. For example, if 'path_model' was
// "/path/to/ggml-base.en.bin", then OpenVINO IR model path will be
// assumed to be "/path/to/ggml-base.en-encoder-openvino.xml".
// device: OpenVINO device to run inference on ("CPU", "GPU", etc.)
// cache_dir: Optional cache directory that can speed up init time, especially for
// GPU, by caching compiled 'blobs' there.
// Set to nullptr if not used.
2023-07-04 20:28:27 +03:00
// Returns 0 on success. If OpenVINO is not enabled in build, this simply returns 1.
2023-07-04 08:56:11 -04:00
WHISPER_API int whisper_ctx_init_openvino_encoder (
struct whisper_context * ctx ,
const char * model_path ,
const char * device ,
const char * cache_dir ) ;
2023-03-05 20:42:19 +01:00
// Frees all allocated memory
WHISPER_API void whisper_free               (struct whisper_context * ctx);
WHISPER_API void whisper_free_state         (struct whisper_state * state);
WHISPER_API void whisper_free_params        (struct whisper_full_params * params);
WHISPER_API void whisper_free_context_params(struct whisper_context_params * params);
2022-10-04 20:35:01 +03:00
2022-10-08 18:09:56 +03:00
// Convert RAW PCM audio to log mel spectrogram.
2023-03-05 20:42:19 +01:00
// The resulting spectrogram is stored inside the default state of the provided whisper context.
2022-10-08 18:09:56 +03:00
// Returns 0 on success
2022-10-04 20:35:01 +03:00
WHISPER_API int whisper_pcm_to_mel (
struct whisper_context * ctx ,
2022-11-26 17:28:28 +02:00
const float * samples ,
int n_samples ,
int n_threads ) ;
2022-10-04 20:35:01 +03:00
2023-03-05 20:42:19 +01:00
WHISPER_API int whisper_pcm_to_mel_with_state (
struct whisper_context * ctx ,
struct whisper_state * state ,
const float * samples ,
int n_samples ,
int n_threads ) ;
// This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
2022-10-08 18:09:56 +03:00
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
2022-10-04 20:35:01 +03:00
// n_mel must be 80
2022-10-08 18:09:56 +03:00
// Returns 0 on success
2022-10-04 20:35:01 +03:00
WHISPER_API int whisper_set_mel (
struct whisper_context * ctx ,
2022-11-26 17:28:28 +02:00
const float * data ,
int n_len ,
int n_mel ) ;
2022-10-04 20:35:01 +03:00
2023-03-05 20:42:19 +01:00
WHISPER_API int whisper_set_mel_with_state (
struct whisper_context * ctx ,
struct whisper_state * state ,
const float * data ,
int n_len ,
int n_mel ) ;
// Run the Whisper encoder on the log mel spectrogram stored inside the default state in the provided whisper context.
2022-10-08 18:09:56 +03:00
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
// offset can be used to specify the offset of the first frame in the spectrogram.
// Returns 0 on success
2022-10-04 20:35:01 +03:00
WHISPER_API int whisper_encode (
struct whisper_context * ctx ,
2022-11-26 17:28:28 +02:00
int offset ,
int n_threads ) ;
2022-10-04 20:35:01 +03:00
2023-03-05 20:42:19 +01:00
WHISPER_API int whisper_encode_with_state (
struct whisper_context * ctx ,
struct whisper_state * state ,
int offset ,
int n_threads ) ;
2022-10-08 18:09:56 +03:00
// Run the Whisper decoder to obtain the logits and probabilities for the next token.
// Make sure to call whisper_encode() first.
// tokens + n_tokens is the provided context for the decoder.
// n_past is the number of tokens to use from previous decoder calls.
// Returns 0 on success
2023-01-15 11:29:57 +02:00
// TODO: add support for multiple decoders
2022-10-04 20:35:01 +03:00
WHISPER_API int whisper_decode (
struct whisper_context * ctx ,
2022-11-26 17:28:28 +02:00
const whisper_token * tokens ,
int n_tokens ,
int n_past ,
int n_threads ) ;
2022-10-04 20:35:01 +03:00
2023-03-05 20:42:19 +01:00
WHISPER_API int whisper_decode_with_state (
struct whisper_context * ctx ,
struct whisper_state * state ,
const whisper_token * tokens ,
int n_tokens ,
int n_past ,
int n_threads ) ;
2022-12-13 19:21:07 +02:00
// Convert the provided text into tokens.
// The tokens pointer must be large enough to hold the resulting tokens.
// Returns the number of tokens on success, no more than n_max_tokens
2024-03-25 14:48:19 +02:00
// Returns a negative number on failure - the number of tokens that would have been returned
2022-12-13 19:21:07 +02:00
// TODO: not sure if correct
WHISPER_API int whisper_tokenize (
struct whisper_context * ctx ,
const char * text ,
whisper_token * tokens ,
2023-01-05 21:07:50 +02:00
int n_max_tokens ) ;
2022-12-13 19:21:07 +02:00
2024-03-25 14:48:19 +02:00
// Return the number of tokens in the provided text
// Equivalent to: -whisper_tokenize(ctx, text, NULL, 0)
int whisper_token_count ( struct whisper_context * ctx , const char * text ) ;
2022-12-17 17:58:08 +02:00
// Largest language id (i.e. number of available languages - 1)
WHISPER_API int whisper_lang_max_id ( ) ;
2022-10-08 18:09:56 +03:00
// Return the id of the specified language, returns -1 if not found
2022-12-17 17:58:08 +02:00
// Examples:
// "de" -> 2
// "german" -> 2
2022-10-04 20:35:01 +03:00
WHISPER_API int whisper_lang_id ( const char * lang ) ;
2022-12-17 17:58:08 +02:00
// Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
WHISPER_API const char * whisper_lang_str ( int id ) ;
2023-11-24 18:33:13 +11:00
// Return the short string of the specified language name (e.g. 2 -> "german"), returns nullptr if not found
2023-11-24 13:13:12 +02:00
WHISPER_API const char * whisper_lang_str_full ( int id ) ;
2023-11-24 18:33:13 +11:00
2022-12-17 17:58:08 +02:00
// Use mel data at offset_ms to try and auto-detect the spoken language
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
// Returns the top language id or negative on failure
// If not null, fills the lang_probs array with the probabilities of all languages
2023-04-14 20:03:16 +03:00
// The array must be whisper_lang_max_id() + 1 in size
2022-12-17 17:58:08 +02:00
// ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69
WHISPER_API int whisper_lang_auto_detect (
struct whisper_context * ctx ,
int offset_ms ,
int n_threads ,
float * lang_probs ) ;
2023-03-05 20:42:19 +01:00
WHISPER_API int whisper_lang_auto_detect_with_state (
struct whisper_context * ctx ,
struct whisper_state * state ,
int offset_ms ,
int n_threads ,
float * lang_probs ) ;
// Integer properties of a context / state.
WHISPER_API int whisper_n_len           (struct whisper_context * ctx); // mel length
WHISPER_API int whisper_n_len_from_state(struct whisper_state * state); // mel length
WHISPER_API int whisper_n_vocab         (struct whisper_context * ctx);
WHISPER_API int whisper_n_text_ctx      (struct whisper_context * ctx);
WHISPER_API int whisper_n_audio_ctx     (struct whisper_context * ctx);
WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);

// Integer properties of the model loaded in the context.
WHISPER_API int whisper_model_n_vocab      (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_audio_ctx  (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
WHISPER_API int whisper_model_n_text_ctx   (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_text_head  (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_mels       (struct whisper_context * ctx);
WHISPER_API int whisper_model_ftype        (struct whisper_context * ctx);
WHISPER_API int whisper_model_type         (struct whisper_context * ctx);
2023-01-15 11:29:57 +02:00
// Token logits obtained from the last call to whisper_decode()
// The logits for the last token are stored in the last row
// Rows: n_tokens
// Cols: n_vocab
WHISPER_API float * whisper_get_logits           (struct whisper_context * ctx);
WHISPER_API float * whisper_get_logits_from_state(struct whisper_state * state);
2022-10-04 20:35:01 +03:00
2022-10-08 18:09:56 +03:00
// Token Id -> String. Uses the vocabulary in the provided context
2022-10-04 20:35:01 +03:00
WHISPER_API const char * whisper_token_to_str ( struct whisper_context * ctx , whisper_token token ) ;
2023-03-22 20:37:36 +01:00
WHISPER_API const char * whisper_model_type_readable ( struct whisper_context * ctx ) ;
2022-10-04 20:35:01 +03:00
2022-10-08 18:09:56 +03:00
// Special tokens
2022-10-04 20:35:01 +03:00
WHISPER_API whisper_token whisper_token_eot ( struct whisper_context * ctx ) ;
WHISPER_API whisper_token whisper_token_sot ( struct whisper_context * ctx ) ;
WHISPER_API whisper_token whisper_token_solm ( struct whisper_context * ctx ) ;
2023-07-03 23:45:00 -07:00
WHISPER_API whisper_token whisper_token_prev ( struct whisper_context * ctx ) ;
WHISPER_API whisper_token whisper_token_nosp ( struct whisper_context * ctx ) ;
2022-10-04 20:35:01 +03:00
WHISPER_API whisper_token whisper_token_not ( struct whisper_context * ctx ) ;
WHISPER_API whisper_token whisper_token_beg ( struct whisper_context * ctx ) ;
2022-12-17 17:58:08 +02:00
WHISPER_API whisper_token whisper_token_lang ( struct whisper_context * ctx , int lang_id ) ;
2022-10-04 20:35:01 +03:00
2022-10-08 18:09:56 +03:00
// Task tokens
2023-07-03 23:45:00 -07:00
WHISPER_API whisper_token whisper_token_translate ( struct whisper_context * ctx ) ;
WHISPER_API whisper_token whisper_token_transcribe ( struct whisper_context * ctx ) ;
2022-10-04 20:35:01 +03:00
2023-03-05 20:42:19 +01:00
// Performance information from the default state.
2022-10-04 20:35:01 +03:00
WHISPER_API void whisper_print_timings ( struct whisper_context * ctx ) ;
2022-11-25 23:07:42 +02:00
WHISPER_API void whisper_reset_timings ( struct whisper_context * ctx ) ;
2022-10-04 20:35:01 +03:00
2022-11-26 17:28:28 +02:00
// Print system information
WHISPER_API const char * whisper_print_system_info ( void ) ;
2022-10-04 20:35:01 +03:00
////////////////////////////////////////////////////////////////////////////
2022-10-18 18:17:24 +03:00
// Available sampling strategies
enum whisper_sampling_strategy {
    WHISPER_SAMPLING_GREEDY,      // similar to OpenAI's GreedyDecoder
    WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder
};
2022-10-22 21:06:50 +03:00
// Text segment callback
// Called on every newly generated text segment
// Use the whisper_full_...() functions to obtain the text segments
2023-03-05 20:42:19 +01:00
typedef void ( * whisper_new_segment_callback ) ( struct whisper_context * ctx , struct whisper_state * state , int n_new , void * user_data ) ;
2022-10-22 21:06:50 +03:00
2023-03-30 19:29:29 +02:00
// Progress callback
typedef void ( * whisper_progress_callback ) ( struct whisper_context * ctx , struct whisper_state * state , int progress , void * user_data ) ;
2022-11-27 20:28:36 +02:00
// Encoder begin callback
// If not NULL, called before the encoder starts
// If it returns false, the computation is aborted
2023-03-05 20:42:19 +01:00
typedef bool ( * whisper_encoder_begin_callback ) ( struct whisper_context * ctx , struct whisper_state * state , void * user_data ) ;
2022-11-27 20:28:36 +02:00
2023-02-19 18:35:01 +02:00
// Logits filter callback
// Can be used to modify the logits before sampling
// If not NULL, called after applying temperature to logits
typedef void ( * whisper_logits_filter_callback ) (
struct whisper_context * ctx ,
2023-03-05 20:42:19 +01:00
struct whisper_state * state ,
2023-02-19 18:35:01 +02:00
const whisper_token_data * tokens ,
int n_tokens ,
float * logits ,
void * user_data ) ;
2022-11-27 20:28:36 +02:00
// Parameters for the whisper_full() function
2023-08-28 00:02:00 +08:00
// If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
2022-11-27 20:28:36 +02:00
// whisper_full_default_params()
2022-10-04 20:35:01 +03:00
struct whisper_full_params {
2022-10-18 18:17:24 +03:00
enum whisper_sampling_strategy strategy ;
2022-10-04 20:35:01 +03:00
int n_threads ;
2023-01-15 11:29:57 +02:00
int n_max_text_ctx ; // max tokens to use from past text as prompt for the decoder
2022-11-26 17:28:28 +02:00
int offset_ms ; // start offset in ms
int duration_ms ; // audio duration to process in ms
2022-10-04 20:35:01 +03:00
2022-10-04 22:43:37 +03:00
bool translate ;
2023-01-16 19:37:06 +02:00
bool no_context ; // do not use past transcription (if any) as initial prompt for the decoder
2023-11-13 03:51:34 -05:00
bool no_timestamps ; // do not generate timestamps
2022-11-26 17:28:28 +02:00
bool single_segment ; // force single segment output (useful for streaming)
2023-01-15 11:29:57 +02:00
bool print_special ; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
bool print_progress ; // print progress information
bool print_realtime ; // print results from within whisper.cpp (avoid it, use callback instead)
bool print_timestamps ; // print timestamps for each text segment when printing realtime
2022-10-04 20:35:01 +03:00
2022-11-02 21:18:20 +02:00
// [EXPERIMENTAL] token-level timestamps
bool token_timestamps ; // enable token-level timestamps
float thold_pt ; // timestamp token probability threshold (~0.01)
float thold_ptsum ; // timestamp token sum probability threshold (~0.01)
int max_len ; // max segment length in characters
2023-02-05 13:44:23 +01:00
bool split_on_word ; // split on word rather than on token (when used with max_len)
2022-11-20 20:52:24 +02:00
int max_tokens ; // max tokens per segment (0 = no limit)
2022-11-02 21:18:20 +02:00
2022-11-12 18:03:49 +02:00
// [EXPERIMENTAL] speed-up techniques
2023-01-15 11:29:57 +02:00
// note: these can significantly reduce the quality of the output
whisper : significantly improve the inference quality (#1148)
* Fix MSVC compile error C3688
Instead of simply using 'add_compile_options(/utf-8)' to address the MSVC compile error C3688, a better approach would be to handle it in a way that prevents passing '/utf-8' to NVCC.
* Significantly improve inference quality
In the function `log_mel_spectrogram_worker_thread`, there's an array out-of-bounds issue occurring during the calculation of complex number moduli. This issue is causing disruptions in the FFT spectrum, which, in turn, is reducing the quality of inference.
* Significantly improve inference quality
At last, I've pinpointed the actual source of the problem. Given that the frequency spectrum generated from real input data is symmetrical around the Nyquist frequency, there's a for-loop within the `log_mel_spectrogram_worker_thread` function that attempts to fold the frequency spectrum. Regrettably, a bug within this for-loop is causing a frame shift in the frequency spectrum. The previous attempt to remedy this, which involved using `fft_size + 1` when calculating the modulus, was merely a band-aid solution and did not address the underlying issue.
* Addressed a few minor issues
Fixed the issue of `fft_out` continuously expanding. Resolved the fallback caused by using 'break' instead of `fft_in[j] = 0`.
* Significantly improve inference quality
Thanks for your patience everyone. It's finally sorted out. Now, the right side of the FFT spectrum is being flipped over to the left, and the amplitudes at corresponding positions on the left and right are added together (the spectrum on the left needs to be shifted by one position), then the average is calculated. FFT_OUT[0] is no longer discarded, making full use of the limited space to pack in more information.
* Add annotation and performance improvement
* Calculate FFT only when fft_in are not all zero
* Some minor performance improvement
* Fixed a bug impacting inference quality
* The first version after all the analysis is completed.
* Fix some bugs and add debug mode
* Fixed several bugs
* Temporarily disable speed-up mode and add debug mode.
* Add debug mode
* Disable speed-up mode and add debug mode
* Fix CI error (#1)
* Fix error
* Fix error
* Fixed several bugs including [BLANK_AUDIO] problem
* Remove Hard-coded hann window
* Some Final Fix (#2)
* Fix error
* Fix error
* Probably the last commit
* Probably the last commit
* whisper : minor coding style changes
* whisper : remove debug from public API
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-08-28 00:51:33 +08:00
bool debug_mode ; // enable debug_mode provides extra info (eg. Dump log_mel)
2022-11-26 17:28:28 +02:00
int audio_ctx ; // overwrite the audio context size (0 = use default)
2022-11-12 18:03:49 +02:00
2023-07-03 23:45:00 -07:00
// [EXPERIMENTAL] [TDRZ] tinydiarize
bool tdrz_enable ; // enable tinydiarize speaker turn detection
2024-04-09 08:27:28 -07:00
// A regular expression that matches tokens to suppress
const char * suppress_regex ;
2023-01-15 11:29:57 +02:00
// tokens to provide to the whisper decoder as initial prompt
2022-11-22 18:20:05 +02:00
// these are prepended to any existing text context from a previous call
2024-03-25 14:48:19 +02:00
// use whisper_tokenize() to convert text to tokens
// maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
2023-03-30 04:23:23 +08:00
const char * initial_prompt ;
2022-11-22 17:10:35 +01:00
const whisper_token * prompt_tokens ;
int prompt_n_tokens ;
2022-12-17 17:58:08 +02:00
// for auto-detection, set to nullptr, "" or "auto"
2022-10-04 20:35:01 +03:00
const char * language ;
2023-05-02 11:51:52 -05:00
bool detect_language ;
2022-10-04 20:35:01 +03:00
2023-01-15 11:29:57 +02:00
// common decoding parameters:
bool suppress_blank ; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
2023-02-08 16:05:34 +09:00
bool suppress_non_speech_tokens ; // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
2023-01-15 11:29:57 +02:00
float temperature ; // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
float max_initial_ts ; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
float length_penalty ; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L267
// fallback parameters
// ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L274-L278
float temperature_inc ;
float entropy_thold ; // similar to OpenAI's "compression_ratio_threshold"
float logprob_thold ;
float no_speech_thold ; // TODO: not implemented
2022-10-18 18:17:24 +03:00
struct {
2023-01-15 11:29:57 +02:00
int best_of ; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
2022-10-18 18:17:24 +03:00
} greedy ;
struct {
2023-01-15 11:29:57 +02:00
int beam_size ; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265
float patience ; // TODO: not implemented, ref: https://arxiv.org/pdf/2204.05424.pdf
2022-10-18 18:17:24 +03:00
} beam_search ;
2022-10-22 21:06:50 +03:00
2023-01-15 11:29:57 +02:00
// called for every newly generated text segment
2022-10-22 21:06:50 +03:00
whisper_new_segment_callback new_segment_callback ;
void * new_segment_callback_user_data ;
2022-11-27 20:28:36 +02:00
2023-03-30 19:29:29 +02:00
// called on each progress update
whisper_progress_callback progress_callback ;
void * progress_callback_user_data ;
2023-01-15 11:29:57 +02:00
// called each time before the encoder starts
2022-11-27 20:28:36 +02:00
whisper_encoder_begin_callback encoder_begin_callback ;
void * encoder_begin_callback_user_data ;
2023-02-19 18:35:01 +02:00
2023-10-04 10:57:55 +02:00
// called each time before ggml computation starts
2024-02-09 10:42:27 +01:00
ggml_abort_callback abort_callback ;
2023-10-04 10:57:55 +02:00
void * abort_callback_user_data ;
2023-02-19 18:35:01 +02:00
// called by each decoder to filter obtained logits
whisper_logits_filter_callback logits_filter_callback ;
void * logits_filter_callback_user_data ;
2023-11-13 03:51:34 -05:00
const whisper_grammar_element * * grammar_rules ;
size_t n_grammar_rules ;
size_t i_start_rule ;
float grammar_penalty ;
2022-10-04 20:35:01 +03:00
} ;
2023-11-06 17:04:24 +08:00
// Default parameter sets.
// NOTE: the *_by_ref variants allocate memory, and it is the responsibility of the caller to free the
// returned pointer - see whisper_free_context_params() & whisper_free_params()
// Declared with (void) where there are no parameters: in C an empty parameter list is not a prototype.
WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref(void);
WHISPER_API struct whisper_context_params   whisper_context_default_params       (void);
WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy);
WHISPER_API struct whisper_full_params   whisper_full_default_params       (enum whisper_sampling_strategy strategy);
2022-10-04 22:43:37 +03:00
2022-10-08 18:09:56 +03:00
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
2023-03-05 20:42:19 +01:00
// Not thread safe for same context
2022-10-08 18:09:56 +03:00
// Uses the specified decoding strategy to obtain the text.
2022-10-04 20:35:01 +03:00
WHISPER_API int whisper_full (
2022-11-26 17:28:28 +02:00
struct whisper_context * ctx ,
struct whisper_full_params params ,
const float * samples ,
int n_samples ) ;
2022-10-04 20:35:01 +03:00
2023-03-05 20:42:19 +01:00
WHISPER_API int whisper_full_with_state (
struct whisper_context * ctx ,
struct whisper_state * state ,
struct whisper_full_params params ,
const float * samples ,
int n_samples ) ;
// Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
// Result is stored in the default state of the context
// Not thread safe if executed in parallel on the same context.
2022-10-29 14:08:23 +03:00
// It seems this approach can offer some speedup in some cases.
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
2022-10-29 12:24:02 +03:00
WHISPER_API int whisper_full_parallel (
2022-11-26 17:28:28 +02:00
struct whisper_context * ctx ,
struct whisper_full_params params ,
const float * samples ,
int n_samples ,
int n_processors ) ;
2022-10-29 12:24:02 +03:00
2023-03-05 20:42:19 +01:00
// Number of generated text segments
2022-10-08 18:09:56 +03:00
// A segment can be a few words, a sentence, or even a paragraph.
2023-03-05 20:42:19 +01:00
WHISPER_API int whisper_full_n_segments ( struct whisper_context * ctx ) ;
WHISPER_API int whisper_full_n_segments_from_state ( struct whisper_state * state ) ;
2022-10-04 22:43:37 +03:00
2023-03-05 20:42:19 +01:00
// Language id associated with the context's default state
// Returned as an int.
// NOTE(review): the mapping from id to language is not defined in this part of the header.
2023-02-05 04:46:26 -08:00
WHISPER_API int whisper_full_lang_id ( struct whisper_context * ctx ) ;
2023-03-05 20:42:19 +01:00
// Language id associated with the provided state (same int id as above)
WHISPER_API int whisper_full_lang_id_from_state ( struct whisper_state * state ) ;
// Get the start (t0) and end (t1) time of the specified segment
//   i_segment - index of the segment
//               NOTE(review): presumably 0 <= i_segment < whisper_full_n_segments() - confirm
// NOTE(review): the time unit of the int64_t result is not stated here - confirm before
// converting to seconds.
WHISPER_API int64_t whisper_full_get_segment_t0 ( struct whisper_context * ctx , int i_segment ) ;
WHISPER_API int64_t whisper_full_get_segment_t0_from_state ( struct whisper_state * state , int i_segment ) ;
WHISPER_API int64_t whisper_full_get_segment_t1 ( struct whisper_context * ctx , int i_segment ) ;
WHISPER_API int64_t whisper_full_get_segment_t1_from_state ( struct whisper_state * state , int i_segment ) ;
2023-07-03 23:45:00 -07:00
// Get whether the next segment is predicted as a speaker turn
//   i_segment - index of the segment being queried
// Returns the prediction as a bool.
WHISPER_API bool whisper_full_get_segment_speaker_turn_next ( struct whisper_context * ctx , int i_segment ) ;
2023-10-03 22:55:48 +03:00
WHISPER_API bool whisper_full_get_segment_speaker_turn_next_from_state ( struct whisper_state * state , int i_segment ) ;
2023-07-03 23:45:00 -07:00
2023-03-05 20:42:19 +01:00
// Get the text of the specified segment
//   i_segment - index of the segment
// Returns a const char *.
// NOTE(review): lifetime/ownership of the returned string is not visible here; presumably it
// is owned by the library - confirm before keeping the pointer around.
WHISPER_API const char * whisper_full_get_segment_text ( struct whisper_context * ctx , int i_segment ) ;
WHISPER_API const char * whisper_full_get_segment_text_from_state ( struct whisper_state * state , int i_segment ) ;
2022-10-04 22:43:37 +03:00
2023-03-05 20:42:19 +01:00
// Get the number of tokens in the specified segment
//   i_segment - index of the segment
// NOTE(review): presumably the result bounds the i_token argument of the per-token getters - confirm.
WHISPER_API int whisper_full_n_tokens ( struct whisper_context * ctx , int i_segment ) ;
WHISPER_API int whisper_full_n_tokens_from_state ( struct whisper_state * state , int i_segment ) ;
2022-10-04 22:43:37 +03:00
2023-03-05 20:42:19 +01:00
// Get the token text of the specified token in the specified segment
//   i_segment - index of the segment
//   i_token   - index of the token within that segment
// Note: the _from_state form takes both the context and the state, whereas the other
// _from_state accessors declared around it take only the state.
WHISPER_API const char * whisper_full_get_token_text ( struct whisper_context * ctx , int i_segment , int i_token ) ;
WHISPER_API const char * whisper_full_get_token_text_from_state ( struct whisper_context * ctx , struct whisper_state * state , int i_segment , int i_token ) ;
2022-10-21 17:33:59 +03:00
2023-03-05 20:42:19 +01:00
// Get the token id (a whisper_token) of the specified token in the specified segment
//   i_segment - index of the segment
//   i_token   - index of the token within that segment
WHISPER_API whisper_token whisper_full_get_token_id ( struct whisper_context * ctx , int i_segment , int i_token ) ;
WHISPER_API whisper_token whisper_full_get_token_id_from_state ( struct whisper_state * state , int i_segment , int i_token ) ;
2022-10-21 17:33:59 +03:00
2023-03-05 20:42:19 +01:00
// Get token data for the specified token in the specified segment
2022-10-30 10:05:58 +02:00
// This contains probabilities, timestamps, etc.
// The whisper_token_data struct is returned by value.
//   i_segment - index of the segment
//   i_token   - index of the token within that segment
2023-03-05 20:42:19 +01:00
WHISPER_API whisper_token_data whisper_full_get_token_data ( struct whisper_context * ctx , int i_segment , int i_token ) ;
WHISPER_API whisper_token_data whisper_full_get_token_data_from_state ( struct whisper_state * state , int i_segment , int i_token ) ;
2022-10-30 10:05:58 +02:00
2023-03-05 20:42:19 +01:00
// Get the probability of the specified token in the specified segment
// Returned as a float.
//   i_segment - index of the segment
//   i_token   - index of the token within that segment
WHISPER_API float whisper_full_get_token_p ( struct whisper_context * ctx , int i_segment , int i_token ) ;
WHISPER_API float whisper_full_get_token_p_from_state ( struct whisper_state * state , int i_segment , int i_token ) ;
2022-10-21 17:33:59 +03:00
2023-01-18 21:00:41 +02:00
////////////////////////////////////////////////////////////////////////////
// Temporary helpers needed for exposing ggml interface
//
// Two benchmarks are declared (memcpy and ggml_mul_mat); each has a form returning int
// and a _str form returning const char *. All take the number of threads to use.
// NOTE(review): presumably the _str forms return the benchmark report as text and the int
// forms return a status code - confirm against the implementation.
2023-07-03 23:45:00 -07:00
WHISPER_API int whisper_bench_memcpy ( int n_threads ) ;
WHISPER_API const char * whisper_bench_memcpy_str ( int n_threads ) ;
WHISPER_API int whisper_bench_ggml_mul_mat ( int n_threads ) ;
2023-03-08 04:36:30 +09:00
WHISPER_API const char * whisper_bench_ggml_mul_mat_str ( int n_threads ) ;
2023-01-18 21:00:41 +02:00
2023-07-25 08:58:25 -07:00
// Control logging output; default behavior is to print to stderr
//   log_callback - callback of type ggml_log_callback (a ggml type; this header includes ggml.h)
//   user_data    - opaque pointer
//                  NOTE(review): presumably forwarded to log_callback on each call - confirm
2023-11-12 15:31:08 +02:00
WHISPER_API void whisper_log_set ( ggml_log_callback log_callback , void * user_data ) ;
2023-07-25 08:58:25 -07:00
2022-10-04 20:35:01 +03:00
// Close the extern "C" block opened near the top of this header (C++ builds only)
# ifdef __cplusplus
}
# endif
// End of the WHISPER_H include guard
# endif