2022-10-04 20:35:01 +03:00
|
|
|
#ifndef WHISPER_H
|
|
|
|
#define WHISPER_H
|
|
|
|
|
2023-01-08 19:03:33 +08:00
|
|
|
#include <stddef.h>
|
2022-10-04 22:43:37 +03:00
|
|
|
#include <stdint.h>
|
2022-10-10 08:11:18 +03:00
|
|
|
#include <stdbool.h>
|
2022-10-04 22:43:37 +03:00
|
|
|
|
2022-10-04 20:35:01 +03:00
|
|
|
#ifdef WHISPER_SHARED
|
|
|
|
# ifdef _WIN32
|
|
|
|
# ifdef WHISPER_BUILD
|
|
|
|
# define WHISPER_API __declspec(dllexport)
|
|
|
|
# else
|
|
|
|
# define WHISPER_API __declspec(dllimport)
|
|
|
|
# endif
|
|
|
|
# else
|
|
|
|
# define WHISPER_API __attribute__ ((visibility ("default")))
|
|
|
|
# endif
|
|
|
|
#else
|
|
|
|
# define WHISPER_API
|
|
|
|
#endif
|
|
|
|
|
2022-10-04 22:43:37 +03:00
|
|
|
#define WHISPER_SAMPLE_RATE 16000
|
|
|
|
#define WHISPER_N_FFT 400
|
|
|
|
#define WHISPER_N_MEL 80
|
|
|
|
#define WHISPER_HOP_LENGTH 160
|
|
|
|
#define WHISPER_CHUNK_SIZE 30
|
|
|
|
|
2022-10-04 20:35:01 +03:00
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
|
|
|
//
|
|
|
|
// C interface
|
|
|
|
//
|
2022-10-18 18:27:57 +03:00
|
|
|
// The following interface is thread-safe as long as the same whisper_context is not used by multiple threads
|
|
|
|
// concurrently.
|
2022-10-08 18:09:56 +03:00
|
|
|
//
|
|
|
|
// Basic usage:
|
|
|
|
//
|
|
|
|
// #include "whisper.h"
|
|
|
|
//
|
|
|
|
// ...
|
|
|
|
//
|
2023-01-08 19:03:33 +08:00
|
|
|
// struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
|
2022-10-08 18:09:56 +03:00
|
|
|
//
|
|
|
|
// if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
|
|
|
// fprintf(stderr, "failed to process audio\n");
|
|
|
|
// return 7;
|
|
|
|
// }
|
|
|
|
//
|
|
|
|
// const int n_segments = whisper_full_n_segments(ctx);
|
|
|
|
// for (int i = 0; i < n_segments; ++i) {
|
|
|
|
// const char * text = whisper_full_get_segment_text(ctx, i);
|
|
|
|
// printf("%s", text);
|
|
|
|
// }
|
|
|
|
//
|
|
|
|
// whisper_free(ctx);
|
|
|
|
//
|
|
|
|
// ...
|
|
|
|
//
|
|
|
|
// This is a demonstration of the most straightforward usage of the library.
|
|
|
|
// "pcmf32" contains the RAW audio data in 32-bit floating point format.
|
|
|
|
//
|
|
|
|
// The interface also allows for more fine-grained control over the computation, but it requires a deeper
|
|
|
|
// understanding of how the model works.
|
|
|
|
//
|
2022-10-04 20:35:01 +03:00
|
|
|
|
|
|
|
struct whisper_context;
|
2023-03-05 20:42:19 +01:00
|
|
|
struct whisper_state;
|
2023-07-26 00:15:57 +08:00
|
|
|
struct whisper_full_params;
|
2022-10-04 20:35:01 +03:00
|
|
|
|
|
|
|
typedef int whisper_token;
|
|
|
|
|
2022-11-02 21:18:20 +02:00
|
|
|
typedef struct whisper_token_data {
|
2022-10-29 09:42:14 +03:00
|
|
|
whisper_token id; // token id
|
|
|
|
whisper_token tid; // forced timestamp token id
|
|
|
|
|
2022-11-26 17:28:28 +02:00
|
|
|
float p; // probability of the token
|
2023-01-15 11:29:57 +02:00
|
|
|
float plog; // log probability of the token
|
2022-11-26 17:28:28 +02:00
|
|
|
float pt; // probability of the timestamp token
|
|
|
|
float ptsum; // sum of probabilities of all timestamp tokens
|
2022-11-02 21:18:20 +02:00
|
|
|
|
|
|
|
// token-level timestamp data
|
|
|
|
// do not use if you haven't computed token-level timestamps
|
2022-11-26 17:28:28 +02:00
|
|
|
int64_t t0; // start time of the token
|
|
|
|
int64_t t1; // end time of the token
|
2022-11-02 21:18:20 +02:00
|
|
|
|
2022-11-26 17:28:28 +02:00
|
|
|
float vlen; // voice length of the token
|
2022-11-02 21:18:20 +02:00
|
|
|
} whisper_token_data;
|
2022-10-29 09:42:14 +03:00
|
|
|
|
2023-01-08 19:03:33 +08:00
|
|
|
// Set of caller-supplied callbacks used to load a model from an arbitrary source
// (see whisper_init() / whisper_init_no_state()).
// NOTE(review): `context` is presumably the value handed back as `ctx` to each
// callback, and `read` presumably returns the number of bytes written to
// `output` - confirm against the implementation.
typedef struct whisper_model_loader {
    void * context;                                              // opaque user pointer

    size_t (*read)(void * ctx, void * output, size_t read_size); // read up to read_size bytes into output
    bool   (*eof)(void * ctx);                                   // end-of-data query
    void   (*close)(void * ctx);                                 // release the underlying source
} whisper_model_loader;
|
|
|
|
|
2023-01-08 12:35:56 +02:00
|
|
|
// Various functions for loading a ggml whisper model.
|
|
|
|
// Allocate (almost) all memory needed for the model.
|
2023-01-08 19:03:33 +08:00
|
|
|
// Return NULL on failure
|
|
|
|
WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model);
|
|
|
|
WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
|
|
|
|
WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);
|
2022-10-08 18:09:56 +03:00
|
|
|
|
2023-03-05 20:42:19 +01:00
|
|
|
// These are the same as the above, but the internal state of the context is not allocated automatically
|
|
|
|
// It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
|
|
|
|
WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model);
|
|
|
|
WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size);
|
|
|
|
WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader);
|
|
|
|
|
|
|
|
WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
|
|
|
|
|
2023-07-04 08:56:11 -04:00
|
|
|
// Given a context, enable use of OpenVINO for encode inference.
|
|
|
|
// model_path: Optional path to OpenVINO encoder IR model. If set to NULL,
|
|
|
|
// the path will be generated from the ggml model path that was passed
|
|
|
|
// in to whisper_init_from_file. For example, if 'path_model' was
|
|
|
|
// "/path/to/ggml-base.en.bin", then OpenVINO IR model path will be
|
|
|
|
// assumed to be "/path/to/ggml-base.en-encoder-openvino.xml".
|
|
|
|
// device: OpenVINO device to run inference on ("CPU", "GPU", etc.)
|
|
|
|
// cache_dir: Optional cache directory that can speed up init time, especially for
|
|
|
|
// GPU, by caching compiled 'blobs' there.
|
|
|
|
// Set to NULL if not used.
|
2023-07-04 20:28:27 +03:00
|
|
|
// Returns 0 on success. If OpenVINO is not enabled in build, this simply returns 1.
|
2023-07-04 08:56:11 -04:00
|
|
|
WHISPER_API int whisper_ctx_init_openvino_encoder(
        struct whisper_context * ctx,
        const char * model_path, // optional path to the OpenVINO encoder IR model
        const char * device,     // OpenVINO device to run inference on ("CPU", "GPU", etc.)
        const char * cache_dir); // optional cache directory
|
|
|
|
|
2023-03-05 20:42:19 +01:00
|
|
|
// Frees all allocated memory
|
|
|
|
WHISPER_API void whisper_free (struct whisper_context * ctx);
|
|
|
|
WHISPER_API void whisper_free_state(struct whisper_state * state);
|
2023-05-29 09:38:58 +10:00
|
|
|
WHISPER_API void whisper_free_params(struct whisper_full_params * params);
|
2022-10-04 20:35:01 +03:00
|
|
|
|
2022-10-08 18:09:56 +03:00
|
|
|
// Convert RAW PCM audio to log mel spectrogram.
|
2023-03-05 20:42:19 +01:00
|
|
|
// The resulting spectrogram is stored inside the default state of the provided whisper context.
|
2022-10-08 18:09:56 +03:00
|
|
|
// Returns 0 on success
|
2022-10-04 20:35:01 +03:00
|
|
|
WHISPER_API int whisper_pcm_to_mel(
        struct whisper_context * ctx,
        const float * samples,   // RAW PCM audio
        int n_samples,
        int n_threads);
|
2022-10-04 20:35:01 +03:00
|
|
|
|
2023-03-05 20:42:19 +01:00
|
|
|
// Variant of whisper_pcm_to_mel() that takes an explicit whisper_state
// (presumably used in place of the context's default state - confirm in whisper.cpp).
WHISPER_API int whisper_pcm_to_mel_with_state(
        struct whisper_context * ctx,
        struct whisper_state * state,
        const float * samples,
        int n_samples,
        int n_threads);
|
|
|
|
|
|
|
|
// Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
|
|
|
|
// The resulting spectrogram is stored inside the default state of the provided whisper context.
|
2023-02-08 08:01:47 +01:00
|
|
|
// Returns 0 on success
|
|
|
|
WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
        struct whisper_context * ctx,
        const float * samples,   // RAW PCM audio
        int n_samples,
        int n_threads);
|
|
|
|
|
|
|
|
// Variant of whisper_pcm_to_mel_phase_vocoder() that takes an explicit whisper_state
// (presumably used in place of the context's default state - confirm in whisper.cpp).
WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
        struct whisper_context * ctx,
        struct whisper_state * state,
        const float * samples,
        int n_samples,
        int n_threads);
|
|
|
|
|
|
|
|
// This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
|
2022-10-08 18:09:56 +03:00
|
|
|
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
|
2022-10-04 20:35:01 +03:00
|
|
|
// n_mel must be 80
|
2022-10-08 18:09:56 +03:00
|
|
|
// Returns 0 on success
|
2022-10-04 20:35:01 +03:00
|
|
|
WHISPER_API int whisper_set_mel(
        struct whisper_context * ctx,
        const float * data,      // caller-provided log mel spectrogram
        int n_len,
        int n_mel);              // must be 80
|
2022-10-04 20:35:01 +03:00
|
|
|
|
2023-03-05 20:42:19 +01:00
|
|
|
// Variant of whisper_set_mel() that takes an explicit whisper_state
// (presumably used in place of the context's default state - confirm in whisper.cpp).
WHISPER_API int whisper_set_mel_with_state(
        struct whisper_context * ctx,
        struct whisper_state * state,
        const float * data,
        int n_len,
        int n_mel);
|
|
|
|
|
|
|
|
// Run the Whisper encoder on the log mel spectrogram stored inside the default state in the provided whisper context.
|
2022-10-08 18:09:56 +03:00
|
|
|
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
|
|
|
|
// offset can be used to specify the offset of the first frame in the spectrogram.
|
|
|
|
// Returns 0 on success
|
2022-10-04 20:35:01 +03:00
|
|
|
WHISPER_API int whisper_encode(
        struct whisper_context * ctx,
        int offset,              // offset of the first frame in the spectrogram
        int n_threads);
|
2022-10-04 20:35:01 +03:00
|
|
|
|
2023-03-05 20:42:19 +01:00
|
|
|
// Variant of whisper_encode() that takes an explicit whisper_state
// (presumably used in place of the context's default state - confirm in whisper.cpp).
WHISPER_API int whisper_encode_with_state(
        struct whisper_context * ctx,
        struct whisper_state * state,
        int offset,
        int n_threads);
|
|
|
|
|
2022-10-08 18:09:56 +03:00
|
|
|
// Run the Whisper decoder to obtain the logits and probabilities for the next token.
|
|
|
|
// Make sure to call whisper_encode() first.
|
|
|
|
// tokens + n_tokens is the provided context for the decoder.
|
|
|
|
// n_past is the number of tokens to use from previous decoder calls.
|
|
|
|
// Returns 0 on success
|
2023-01-15 11:29:57 +02:00
|
|
|
// TODO: add support for multiple decoders
|
2022-10-04 20:35:01 +03:00
|
|
|
WHISPER_API int whisper_decode(
|
|
|
|
struct whisper_context * ctx,
|
2022-11-26 17:28:28 +02:00
|
|
|
const whisper_token * tokens,
|
|
|
|
int n_tokens,
|
|
|
|
int n_past,
|
|
|
|
int n_threads);
|
2022-10-04 20:35:01 +03:00
|
|
|
|
2023-03-05 20:42:19 +01:00
|
|
|
WHISPER_API int whisper_decode_with_state(
|
|
|
|
struct whisper_context * ctx,
|
|
|
|
struct whisper_state * state,
|
|
|
|
const whisper_token * tokens,
|
|
|
|
int n_tokens,
|
|
|
|
int n_past,
|
|
|
|
int n_threads);
|
|
|
|
|
2022-12-13 19:21:07 +02:00
|
|
|
// Convert the provided text into tokens.
|
|
|
|
// The tokens pointer must be large enough to hold the resulting tokens.
|
|
|
|
// Returns the number of tokens on success, no more than n_max_tokens
|
|
|
|
// Returns -1 on failure
|
|
|
|
// TODO: not sure if correct
|
|
|
|
WHISPER_API int whisper_tokenize(
|
|
|
|
struct whisper_context * ctx,
|
|
|
|
const char * text,
|
|
|
|
whisper_token * tokens,
|
2023-01-05 21:07:50 +02:00
|
|
|
int n_max_tokens);
|
2022-12-13 19:21:07 +02:00
|
|
|
|
2022-12-17 17:58:08 +02:00
|
|
|
// Largest language id (i.e. number of available languages - 1)
|
|
|
|
WHISPER_API int whisper_lang_max_id(void); // (void): a real prototype in C, not an unspecified parameter list
|
|
|
|
|
2022-10-08 18:09:56 +03:00
|
|
|
// Return the id of the specified language, returns -1 if not found
|
2022-12-17 17:58:08 +02:00
|
|
|
// Examples:
|
|
|
|
// "de" -> 2
|
|
|
|
// "german" -> 2
|
2022-10-04 20:35:01 +03:00
|
|
|
WHISPER_API int whisper_lang_id(const char * lang);
|
|
|
|
|
2022-12-17 17:58:08 +02:00
|
|
|
// Return the short string of the specified language id (e.g. 2 -> "de"), returns NULL if not found
|
|
|
|
WHISPER_API const char * whisper_lang_str(int id);
|
|
|
|
|
|
|
|
// Use mel data at offset_ms to try and auto-detect the spoken language
|
|
|
|
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
|
|
|
|
// Returns the top language id or negative on failure
|
|
|
|
// If not null, fills the lang_probs array with the probabilities of all languages
|
2023-04-14 20:03:16 +03:00
|
|
|
// The array must be whisper_lang_max_id() + 1 in size
|
2022-12-17 17:58:08 +02:00
|
|
|
// ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69
|
|
|
|
WHISPER_API int whisper_lang_auto_detect(
        struct whisper_context * ctx,
        int offset_ms,
        int n_threads,
        float * lang_probs);     // optional; if not null, must hold whisper_lang_max_id() + 1 entries
|
|
|
|
|
2023-03-05 20:42:19 +01:00
|
|
|
// Variant of whisper_lang_auto_detect() that takes an explicit whisper_state
// (presumably used in place of the context's default state - confirm in whisper.cpp).
WHISPER_API int whisper_lang_auto_detect_with_state(
        struct whisper_context * ctx,
        struct whisper_state * state,
        int offset_ms,
        int n_threads,
        float * lang_probs);
|
|
|
|
|
|
|
|
WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length
|
|
|
|
WHISPER_API int whisper_n_len_from_state(struct whisper_state * state); // mel length
|
|
|
|
WHISPER_API int whisper_n_vocab (struct whisper_context * ctx);
|
|
|
|
WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx);
|
|
|
|
WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx);
|
|
|
|
WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
|
2022-10-08 10:56:59 +03:00
|
|
|
|
2023-03-22 20:37:36 +01:00
|
|
|
WHISPER_API int whisper_model_n_vocab (struct whisper_context * ctx);
|
|
|
|
WHISPER_API int whisper_model_n_audio_ctx (struct whisper_context * ctx);
|
|
|
|
WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
|
|
|
|
WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
|
|
|
|
WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
|
|
|
|
WHISPER_API int whisper_model_n_text_ctx (struct whisper_context * ctx);
|
|
|
|
WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
|
|
|
|
WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx);
|
|
|
|
WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
|
|
|
|
WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx);
|
2023-04-30 18:51:57 +03:00
|
|
|
WHISPER_API int whisper_model_ftype (struct whisper_context * ctx);
|
2023-03-22 20:37:36 +01:00
|
|
|
WHISPER_API int whisper_model_type (struct whisper_context * ctx);
|
|
|
|
|
2023-01-15 11:29:57 +02:00
|
|
|
// Token logits obtained from the last call to whisper_decode()
|
|
|
|
// The logits for the last token are stored in the last row
|
|
|
|
// Rows: n_tokens
|
|
|
|
// Cols: n_vocab
|
2023-03-05 20:42:19 +01:00
|
|
|
WHISPER_API float * whisper_get_logits (struct whisper_context * ctx);
|
|
|
|
WHISPER_API float * whisper_get_logits_from_state(struct whisper_state * state);
|
2022-10-04 20:35:01 +03:00
|
|
|
|
2022-10-08 18:09:56 +03:00
|
|
|
// Token Id -> String. Uses the vocabulary in the provided context
|
2022-10-04 20:35:01 +03:00
|
|
|
WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
|
2023-03-22 20:37:36 +01:00
|
|
|
WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
|
|
|
|
|
2022-10-04 20:35:01 +03:00
|
|
|
|
2022-10-08 18:09:56 +03:00
|
|
|
// Special tokens
|
2022-10-04 20:35:01 +03:00
|
|
|
WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
|
|
|
|
WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
|
|
|
|
WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
|
2023-07-03 23:45:00 -07:00
|
|
|
WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
|
|
|
|
WHISPER_API whisper_token whisper_token_nosp(struct whisper_context * ctx);
|
2022-10-04 20:35:01 +03:00
|
|
|
WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
|
|
|
|
WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
|
2022-12-17 17:58:08 +02:00
|
|
|
WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);
|
2022-10-04 20:35:01 +03:00
|
|
|
|
2022-10-08 18:09:56 +03:00
|
|
|
// Task tokens
|
2023-07-03 23:45:00 -07:00
|
|
|
WHISPER_API whisper_token whisper_token_translate (struct whisper_context * ctx);
|
|
|
|
WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx);
|
2022-10-04 20:35:01 +03:00
|
|
|
|
2023-03-05 20:42:19 +01:00
|
|
|
// Performance information from the default state.
|
2022-10-04 20:35:01 +03:00
|
|
|
WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
|
2022-11-25 23:07:42 +02:00
|
|
|
WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
|
2022-10-04 20:35:01 +03:00
|
|
|
|
2022-11-26 17:28:28 +02:00
|
|
|
// Print system information
|
|
|
|
WHISPER_API const char * whisper_print_system_info(void);
|
|
|
|
|
2022-10-04 20:35:01 +03:00
|
|
|
////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
2022-10-18 18:17:24 +03:00
|
|
|
// Available sampling strategies
|
|
|
|
enum whisper_sampling_strategy {
    WHISPER_SAMPLING_GREEDY,      // similar to OpenAI's GreedyDecoder
    WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder
};
|
|
|
|
|
2022-10-22 21:06:50 +03:00
|
|
|
// Text segment callback
|
|
|
|
// Called on every newly generated text segment
|
|
|
|
// Use the whisper_full_...() functions to obtain the text segments
|
2023-03-05 20:42:19 +01:00
|
|
|
typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data);
|
2022-10-22 21:06:50 +03:00
|
|
|
|
2023-03-30 19:29:29 +02:00
|
|
|
// Progress callback
|
|
|
|
typedef void (*whisper_progress_callback)(struct whisper_context * ctx, struct whisper_state * state, int progress, void * user_data);
|
|
|
|
|
2022-11-27 20:28:36 +02:00
|
|
|
// Encoder begin callback
|
|
|
|
// If not NULL, called before the encoder starts
|
|
|
|
// If it returns false, the computation is aborted
|
2023-03-05 20:42:19 +01:00
|
|
|
typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);
|
2022-11-27 20:28:36 +02:00
|
|
|
|
2023-02-19 18:35:01 +02:00
|
|
|
// Logits filter callback
|
|
|
|
// Can be used to modify the logits before sampling
|
|
|
|
// If not NULL, called after applying temperature to logits
|
|
|
|
typedef void (*whisper_logits_filter_callback)(
|
|
|
|
struct whisper_context * ctx,
|
2023-03-05 20:42:19 +01:00
|
|
|
struct whisper_state * state,
|
2023-02-19 18:35:01 +02:00
|
|
|
const whisper_token_data * tokens,
|
|
|
|
int n_tokens,
|
|
|
|
float * logits,
|
|
|
|
void * user_data);
|
|
|
|
|
2022-11-27 20:28:36 +02:00
|
|
|
// Parameters for the whisper_full() function
|
2023-08-28 00:02:00 +08:00
|
|
|
// If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
|
2022-11-27 20:28:36 +02:00
|
|
|
// whisper_full_default_params()
|
2022-10-04 20:35:01 +03:00
|
|
|
struct whisper_full_params {
|
2022-10-18 18:17:24 +03:00
|
|
|
enum whisper_sampling_strategy strategy;
|
2022-10-04 20:35:01 +03:00
|
|
|
|
|
|
|
int n_threads;
|
2023-01-15 11:29:57 +02:00
|
|
|
int n_max_text_ctx; // max tokens to use from past text as prompt for the decoder
|
2022-11-26 17:28:28 +02:00
|
|
|
int offset_ms; // start offset in ms
|
|
|
|
int duration_ms; // audio duration to process in ms
|
2022-10-04 20:35:01 +03:00
|
|
|
|
2022-10-04 22:43:37 +03:00
|
|
|
bool translate;
|
2023-01-16 19:37:06 +02:00
|
|
|
bool no_context; // do not use past transcription (if any) as initial prompt for the decoder
|
2022-11-26 17:28:28 +02:00
|
|
|
bool single_segment; // force single segment output (useful for streaming)
|
2023-01-15 11:29:57 +02:00
|
|
|
bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
|
|
|
|
bool print_progress; // print progress information
|
|
|
|
bool print_realtime; // print results from within whisper.cpp (avoid it, use callback instead)
|
|
|
|
bool print_timestamps; // print timestamps for each text segment when printing realtime
|
2022-10-04 20:35:01 +03:00
|
|
|
|
2022-11-02 21:18:20 +02:00
|
|
|
// [EXPERIMENTAL] token-level timestamps
|
|
|
|
bool token_timestamps; // enable token-level timestamps
|
|
|
|
float thold_pt; // timestamp token probability threshold (~0.01)
|
|
|
|
float thold_ptsum; // timestamp token sum probability threshold (~0.01)
|
|
|
|
int max_len; // max segment length in characters
|
2023-02-05 13:44:23 +01:00
|
|
|
bool split_on_word; // split on word rather than on token (when used with max_len)
|
2022-11-20 20:52:24 +02:00
|
|
|
int max_tokens; // max tokens per segment (0 = no limit)
|
2022-11-02 21:18:20 +02:00
|
|
|
|
2022-11-12 18:03:49 +02:00
|
|
|
// [EXPERIMENTAL] speed-up techniques
|
2023-01-15 11:29:57 +02:00
|
|
|
// note: these can significantly reduce the quality of the output
|
2022-11-26 17:28:28 +02:00
|
|
|
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
|
whisper : significantly improve the inference quality (#1148)
* Fix MSVC compile error C3688
Instead of simply using 'add_compile_options(/utf-8)' to address the MSVC compile error C3688, a better approach would be to handle it in a way that prevents passing '/utf-8' to NVCC.
* Significantly improve inference quality
In the function `log_mel_spectrogram_worker_thread`, there's an array out-of-bounds issue occurring during the calculation of complex number moduli. This issue is causing disruptions in the FFT spectrum, which, in turn, is reducing the quality of inference.
* Significantly improve inference quality
At last, I've pinpointed the actual source of the problem. Given that the frequency spectrum generated from real input data is symmetrical around the Nyquist frequency, there's a for-loop within the `log_mel_spectrogram_worker_thread` function that attempts to fold the frequency spectrum. Regrettably, a bug within this for-loop is causing a frame shift in the frequency spectrum. The previous attempt to remedy this, which involved using `fft_size + 1` when calculating the modulus, was merely a band-aid solution and did not address the underlying issue.
* Addressed a few minor issues
Fixed the issue of `fft_out` continuously expanding. Resolved the fallback caused by using 'break' instead of `fft_in[j] = 0`.
* Significantly improve inference quality
Thanks for your patience everyone. It's finally sorted out. Now, the right side of the FFT spectrum is being flipped over to the left, and the amplitudes at corresponding positions on the left and right are added together (the spectrum on the left needs to be shifted by one position), then the average is calculated. FFT_OUT[0] is no longer discarded, making full use of the limited space to pack in more information.
* Add annotation and performance improvement
* Calculate FFT only when fft_in are not all zero
* Some minor performance improvement
* Fixed a bug impacting inference quality
* The first version after all the analysis is completed.
* Fix some bugs and add debug mode
* Fixed several bugs
* Temporarily disable speed-up mode and add debug mode.
* Add debug mode
* Disable speed-up mode and add debug mode
* Fix CI error (#1)
* Fix error
* Fix error
* Fixed several bugs including [BLANK_AUDIO] problem
* Remove Hard-coded hann window
* Some Final Fix (#2)
* Fix error
* Fix error
* Probably the last commit
* Probably the last commit
* whisper : minor coding style changes
* whisper : remove debug from public API
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-08-28 00:51:33 +08:00
|
|
|
bool debug_mode; // enable debug_mode provides extra info (eg. Dump log_mel)
|
2022-11-26 17:28:28 +02:00
|
|
|
int audio_ctx; // overwrite the audio context size (0 = use default)
|
2022-11-12 18:03:49 +02:00
|
|
|
|
2023-07-03 23:45:00 -07:00
|
|
|
// [EXPERIMENTAL] [TDRZ] tinydiarize
|
|
|
|
bool tdrz_enable; // enable tinydiarize speaker turn detection
|
|
|
|
|
2023-01-15 11:29:57 +02:00
|
|
|
// tokens to provide to the whisper decoder as initial prompt
|
2022-11-22 18:20:05 +02:00
|
|
|
// these are prepended to any existing text context from a previous call
|
2023-03-30 04:23:23 +08:00
|
|
|
const char * initial_prompt;
|
2022-11-22 17:10:35 +01:00
|
|
|
const whisper_token * prompt_tokens;
|
|
|
|
int prompt_n_tokens;
|
|
|
|
|
2022-12-17 17:58:08 +02:00
|
|
|
// for auto-detection, set to nullptr, "" or "auto"
|
2022-10-04 20:35:01 +03:00
|
|
|
const char * language;
|
2023-05-02 11:51:52 -05:00
|
|
|
bool detect_language;
|
2022-10-04 20:35:01 +03:00
|
|
|
|
2023-01-15 11:29:57 +02:00
|
|
|
// common decoding parameters:
|
|
|
|
bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
|
2023-02-08 16:05:34 +09:00
|
|
|
bool suppress_non_speech_tokens; // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
|
2023-01-15 11:29:57 +02:00
|
|
|
|
|
|
|
float temperature; // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
|
|
|
|
float max_initial_ts; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
|
|
|
|
float length_penalty; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L267
|
|
|
|
|
|
|
|
// fallback parameters
|
|
|
|
// ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L274-L278
|
|
|
|
float temperature_inc;
|
|
|
|
float entropy_thold; // similar to OpenAI's "compression_ratio_threshold"
|
|
|
|
float logprob_thold;
|
|
|
|
float no_speech_thold; // TODO: not implemented
|
|
|
|
|
2022-10-18 18:17:24 +03:00
|
|
|
struct {
|
2023-01-15 11:29:57 +02:00
|
|
|
int best_of; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
|
2022-10-18 18:17:24 +03:00
|
|
|
} greedy;
|
|
|
|
|
|
|
|
struct {
|
2023-01-15 11:29:57 +02:00
|
|
|
int beam_size; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265
|
|
|
|
|
|
|
|
float patience; // TODO: not implemented, ref: https://arxiv.org/pdf/2204.05424.pdf
|
2022-10-18 18:17:24 +03:00
|
|
|
} beam_search;
|
2022-10-22 21:06:50 +03:00
|
|
|
|
2023-01-15 11:29:57 +02:00
|
|
|
// called for every newly generated text segment
|
2022-10-22 21:06:50 +03:00
|
|
|
whisper_new_segment_callback new_segment_callback;
|
|
|
|
void * new_segment_callback_user_data;
|
2022-11-27 20:28:36 +02:00
|
|
|
|
2023-03-30 19:29:29 +02:00
|
|
|
// called on each progress update
|
|
|
|
whisper_progress_callback progress_callback;
|
|
|
|
void * progress_callback_user_data;
|
|
|
|
|
2023-01-15 11:29:57 +02:00
|
|
|
// called each time before the encoder starts
|
2022-11-27 20:28:36 +02:00
|
|
|
whisper_encoder_begin_callback encoder_begin_callback;
|
|
|
|
void * encoder_begin_callback_user_data;
|
2023-02-19 18:35:01 +02:00
|
|
|
|
|
|
|
// called by each decoder to filter obtained logits
|
|
|
|
whisper_logits_filter_callback logits_filter_callback;
|
|
|
|
void * logits_filter_callback_user_data;
|
2022-10-04 20:35:01 +03:00
|
|
|
};
|
|
|
|
|
2023-05-29 09:38:58 +10:00
|
|
|
// NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_params()
|
|
|
|
WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy);
|
2022-10-18 18:17:24 +03:00
|
|
|
WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
|
2022-10-04 22:43:37 +03:00
|
|
|
|
2022-10-08 18:09:56 +03:00
|
|
|
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
2023-03-05 20:42:19 +01:00
|
|
|
// Not thread safe for same context
|
2022-10-08 18:09:56 +03:00
|
|
|
// Uses the specified decoding strategy to obtain the text.
|
2022-10-04 20:35:01 +03:00
|
|
|
WHISPER_API int whisper_full(
        struct whisper_context * ctx,
        struct whisper_full_params params,
        const float * samples,   // PCM input
        int n_samples);
|
2022-10-04 20:35:01 +03:00
|
|
|
|
2023-03-05 20:42:19 +01:00
|
|
|
// Variant of whisper_full() that takes an explicit whisper_state
// (presumably used in place of the context's default state - confirm in whisper.cpp).
WHISPER_API int whisper_full_with_state(
        struct whisper_context * ctx,
        struct whisper_state * state,
        struct whisper_full_params params,
        const float * samples,
        int n_samples);
|
|
|
|
|
|
|
|
// Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
|
|
|
|
// Result is stored in the default state of the context
|
|
|
|
// Not thread safe if executed in parallel on the same context.
|
2022-10-29 14:08:23 +03:00
|
|
|
// It seems this approach can offer some speedup in some cases.
|
|
|
|
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
2022-10-29 12:24:02 +03:00
|
|
|
WHISPER_API int whisper_full_parallel(
        struct whisper_context * ctx,
        struct whisper_full_params params,
        const float * samples,
        int n_samples,
        int n_processors);       // NOTE(review): presumably the number of chunks processed in parallel - confirm
|
2022-10-29 12:24:02 +03:00
|
|
|
|
2023-03-05 20:42:19 +01:00
|
|
|
// Number of generated text segments
|
2022-10-08 18:09:56 +03:00
|
|
|
// A segment can be a few words, a sentence, or even a paragraph.
|
2023-03-05 20:42:19 +01:00
|
|
|
WHISPER_API int whisper_full_n_segments (struct whisper_context * ctx);
|
|
|
|
WHISPER_API int whisper_full_n_segments_from_state(struct whisper_state * state);
|
2022-10-04 22:43:37 +03:00
|
|
|
|
2023-03-05 20:42:19 +01:00
|
|
|
// Language id associated with the context's default state
|
2023-02-05 04:46:26 -08:00
|
|
|
WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
|
|
|
|
|
2023-03-05 20:42:19 +01:00
|
|
|
// Language id associated with the provided state
|
|
|
|
WHISPER_API int whisper_full_lang_id_from_state(struct whisper_state * state);
|
|
|
|
|
|
|
|
// Get the start and end time of the specified segment
|
|
|
|
WHISPER_API int64_t whisper_full_get_segment_t0 (struct whisper_context * ctx, int i_segment);
|
|
|
|
WHISPER_API int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment);
|
|
|
|
|
|
|
|
WHISPER_API int64_t whisper_full_get_segment_t1 (struct whisper_context * ctx, int i_segment);
|
|
|
|
WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
|
|
|
|
|
2023-07-03 23:45:00 -07:00
|
|
|
// Get whether the next segment is predicted as a speaker turn
|
|
|
|
WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment);
|
|
|
|
|
2023-03-05 20:42:19 +01:00
|
|
|
// Get the text of the specified segment
|
|
|
|
WHISPER_API const char * whisper_full_get_segment_text (struct whisper_context * ctx, int i_segment);
|
|
|
|
WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
|
2022-10-04 22:43:37 +03:00
|
|
|
|
2023-03-05 20:42:19 +01:00
|
|
|
// Get number of tokens in the specified segment
|
|
|
|
WHISPER_API int whisper_full_n_tokens (struct whisper_context * ctx, int i_segment);
|
|
|
|
WHISPER_API int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment);
|
2022-10-04 22:43:37 +03:00
|
|
|
|
2023-03-05 20:42:19 +01:00
|
|
|
// Get the token text of the specified token in the specified segment
|
|
|
|
WHISPER_API const char * whisper_full_get_token_text (struct whisper_context * ctx, int i_segment, int i_token);
|
|
|
|
WHISPER_API const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token);
|
2022-10-21 17:33:59 +03:00
|
|
|
|
2023-03-05 20:42:19 +01:00
|
|
|
WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token);
|
|
|
|
WHISPER_API whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token);
|
2022-10-21 17:33:59 +03:00
|
|
|
|
2023-03-05 20:42:19 +01:00
|
|
|
// Get token data for the specified token in the specified segment
|
2022-10-30 10:05:58 +02:00
|
|
|
// This contains probabilities, timestamps, etc.
|
2023-03-05 20:42:19 +01:00
|
|
|
WHISPER_API whisper_token_data whisper_full_get_token_data (struct whisper_context * ctx, int i_segment, int i_token);
|
|
|
|
WHISPER_API whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token);
|
2022-10-30 10:05:58 +02:00
|
|
|
|
2023-03-05 20:42:19 +01:00
|
|
|
// Get the probability of the specified token in the specified segment
|
|
|
|
WHISPER_API float whisper_full_get_token_p (struct whisper_context * ctx, int i_segment, int i_token);
|
|
|
|
WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
|
2022-10-21 17:33:59 +03:00
|
|
|
|
2023-01-18 21:00:41 +02:00
|
|
|
////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
// Temporary helpers needed for exposing ggml interface
|
|
|
|
|
2023-07-03 23:45:00 -07:00
|
|
|
WHISPER_API int whisper_bench_memcpy (int n_threads);
|
|
|
|
WHISPER_API const char * whisper_bench_memcpy_str (int n_threads);
|
|
|
|
WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads);
|
2023-03-08 04:36:30 +09:00
|
|
|
WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
|
2023-01-18 21:00:41 +02:00
|
|
|
|
2023-07-25 08:58:25 -07:00
|
|
|
// Control logging output; default behavior is to print to stderr
|
|
|
|
|
|
|
|
typedef void (*whisper_log_callback)(const char * line);
|
|
|
|
WHISPER_API void whisper_set_log_callback(whisper_log_callback callback);
|
|
|
|
|
2022-10-04 20:35:01 +03:00
|
|
|
#ifdef __cplusplus
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif
|