2022-10-04 17:35:01 +00:00
# include "whisper.h"
2023-06-25 12:30:39 +00:00
# ifdef WHISPER_USE_COREML
2023-04-15 10:21:27 +00:00
# include "coreml/whisper-encoder.h"
# endif
2022-10-04 17:35:01 +00:00
# include "ggml.h"
# include <algorithm>
# include <cassert>
2022-10-09 14:26:37 +00:00
# define _USE_MATH_DEFINES
2022-10-04 17:35:01 +00:00
# include <cmath>
# include <cstdio>
# include <cstring>
# include <fstream>
# include <map>
# include <string>
# include <thread>
# include <vector>
2022-12-13 17:21:07 +00:00
# include <regex>
2023-01-15 09:29:57 +00:00
# include <random>
2023-06-25 11:22:21 +00:00
# if defined(_MSC_VER)
# pragma warning(disable: 4244 4267) // possible loss of data
# endif
2023-01-23 18:48:10 +00:00
# if defined(GGML_BIG_ENDIAN)
# include <bit>
template < typename T >
static T byteswap ( T value ) {
return std : : byteswap ( value ) ;
}
template < >
float byteswap ( float value ) {
return std : : bit_cast < float > ( byteswap ( std : : bit_cast < std : : uint32_t > ( value ) ) ) ;
}
// Byte-swap every element of `tensor` in place, treating its data buffer as an array of T.
// T is chosen by the caller (see byteswap_tensor) to match the tensor's element type.
template<typename T>
static void byteswap_tensor_data(ggml_tensor * tensor) {
    T * datum = reinterpret_cast<T *>(tensor->data);

    // query the element count once instead of on every iteration, and iterate with a
    // 64-bit index so the counter cannot overflow on very large tensors
    const int64_t n_elements = ggml_nelements(tensor);
    for (int64_t i = 0; i < n_elements; i++) {
        datum[i] = byteswap(datum[i]);
    }
}
// Byte-swap the data of `tensor` in place, dispatching on its element type.
// Only I16, F16, I32 and F32 tensors are swapped; every other type is left untouched.
static void byteswap_tensor(ggml_tensor * tensor) {
    switch (tensor->type) {
        case GGML_TYPE_I16: byteswap_tensor_data<int16_t>(tensor);     break;
        case GGML_TYPE_F16: byteswap_tensor_data<ggml_fp16_t>(tensor); break;
        case GGML_TYPE_I32: byteswap_tensor_data<int32_t>(tensor);     break;
        case GGML_TYPE_F32: byteswap_tensor_data<float>(tensor);       break;
        default: // e.g. GGML_TYPE_I8 - nothing to do
            break;
    }
}
// Byte-swap helpers used when GGML_BIG_ENDIAN is defined.
// Each macro expands to exactly one statement (matching the no-op variants in the
// #else branch) and operates on its own, parenthesized argument.

// reverse the byte order of the lvalue `d` in place
#define BYTESWAP_VALUE(d) \
    do { \
        (d) = byteswap(d); \
    } while (0)

// reverse the byte order of every element of `f.data`
#define BYTESWAP_FILTERS(f) \
    do { \
        for (auto & datum : (f).data) { \
            datum = byteswap(datum); \
        } \
    } while (0)

// reverse the byte order of the data of tensor `t` in place
// (the argument is now actually used; before, the expansion referred to a variable
// literally named `tensor` at the call site and ignored `t`)
#define BYTESWAP_TENSOR(t) \
    do { \
        byteswap_tensor(t); \
    } while (0)
# else
// GGML_BIG_ENDIAN is not defined: the byte-swap macros expand to empty statements,
// so data read from the model file is used as-is.
# define BYTESWAP_VALUE(d) do {} while (0)
# define BYTESWAP_FILTERS(f) do {} while (0)
# define BYTESWAP_TENSOR(t) do {} while (0)
# endif
2023-01-15 09:29:57 +00:00
// Assertion macro with no NDEBUG guard: if the condition `x` is false, it prints the
// source file, the line number and the stringified condition to stderr and then
// terminates the process with abort(). Expands to a single statement.
# define WHISPER_ASSERT(x) \
do { \
if ( ! ( x ) ) { \
fprintf ( stderr , " WHISPER_ASSERT: %s:%d: %s \n " , __FILE__ , __LINE__ , # x ) ; \
abort ( ) ; \
} \
} while ( 0 )
// define this to enable verbose trace logging - useful for debugging purposes
//#define WHISPER_DEBUG
// WHISPER_PRINT_DEBUG(...) takes printf-style arguments.
// With WHISPER_DEBUG defined it forwards them to fprintf(stderr, ...);
// otherwise it expands to nothing, so its arguments are not evaluated.
# if defined(WHISPER_DEBUG)
# define WHISPER_PRINT_DEBUG(...) \
do { \
fprintf ( stderr , __VA_ARGS__ ) ; \
} while ( 0 )
# else
# define WHISPER_PRINT_DEBUG(...)
# endif
2022-10-04 17:35:01 +00:00
2023-04-30 09:14:33 +00:00
//#define WHISPER_USE_FLASH_ATTN
2023-01-15 09:29:57 +00:00
//#define WHISPER_USE_FLASH_FF
// number of entries in the whisper_state::decoders array
# define WHISPER_MAX_DECODERS 16
2022-10-04 17:35:01 +00:00
2023-02-04 07:45:52 +00:00
// enables the scratch-buffer code paths in whisper_state::use_buf() and get_buf_max_mem()
# define WHISPER_USE_SCRATCH
// number of entries in whisper_state::buf_scratch and whisper_state::buf_max_size
# define WHISPER_MAX_SCRATCH_BUFFERS 16
2022-10-04 17:35:01 +00:00
// available whisper models
// Model size classes. whisper_model_load picks the value from the number of audio
// encoder layers (4, 6, 12, 24, 32 -> TINY .. LARGE) and uses it as the key into the
// MEM_REQ_* tables; MODEL_UNKNOWN is the default of whisper_model::type.
enum e_model {
    MODEL_UNKNOWN = 0,
    MODEL_TINY    = 1,
    MODEL_BASE    = 2,
    MODEL_SMALL   = 3,
    MODEL_MEDIUM  = 4,
    MODEL_LARGE   = 5,
};
// Language table: short language code -> { numeric language id, language name }.
// The ids are consecutive, from 0 (english) to 98 (sundanese).
static const std : : map < std : : string , std : : pair < int , std : : string > > g_lang = {
{ " en " , { 0 , " english " , } } ,
{ " zh " , { 1 , " chinese " , } } ,
{ " de " , { 2 , " german " , } } ,
{ " es " , { 3 , " spanish " , } } ,
{ " ru " , { 4 , " russian " , } } ,
{ " ko " , { 5 , " korean " , } } ,
{ " fr " , { 6 , " french " , } } ,
{ " ja " , { 7 , " japanese " , } } ,
{ " pt " , { 8 , " portuguese " , } } ,
{ " tr " , { 9 , " turkish " , } } ,
{ " pl " , { 10 , " polish " , } } ,
{ " ca " , { 11 , " catalan " , } } ,
{ " nl " , { 12 , " dutch " , } } ,
{ " ar " , { 13 , " arabic " , } } ,
{ " sv " , { 14 , " swedish " , } } ,
{ " it " , { 15 , " italian " , } } ,
{ " id " , { 16 , " indonesian " , } } ,
{ " hi " , { 17 , " hindi " , } } ,
{ " fi " , { 18 , " finnish " , } } ,
{ " vi " , { 19 , " vietnamese " , } } ,
2023-05-20 15:17:54 +00:00
{ " he " , { 20 , " hebrew " , } } ,
2022-10-04 17:35:01 +00:00
{ " uk " , { 21 , " ukrainian " , } } ,
{ " el " , { 22 , " greek " , } } ,
{ " ms " , { 23 , " malay " , } } ,
{ " cs " , { 24 , " czech " , } } ,
{ " ro " , { 25 , " romanian " , } } ,
{ " da " , { 26 , " danish " , } } ,
{ " hu " , { 27 , " hungarian " , } } ,
{ " ta " , { 28 , " tamil " , } } ,
{ " no " , { 29 , " norwegian " , } } ,
{ " th " , { 30 , " thai " , } } ,
{ " ur " , { 31 , " urdu " , } } ,
{ " hr " , { 32 , " croatian " , } } ,
{ " bg " , { 33 , " bulgarian " , } } ,
{ " lt " , { 34 , " lithuanian " , } } ,
{ " la " , { 35 , " latin " , } } ,
{ " mi " , { 36 , " maori " , } } ,
{ " ml " , { 37 , " malayalam " , } } ,
{ " cy " , { 38 , " welsh " , } } ,
{ " sk " , { 39 , " slovak " , } } ,
{ " te " , { 40 , " telugu " , } } ,
{ " fa " , { 41 , " persian " , } } ,
{ " lv " , { 42 , " latvian " , } } ,
{ " bn " , { 43 , " bengali " , } } ,
{ " sr " , { 44 , " serbian " , } } ,
{ " az " , { 45 , " azerbaijani " , } } ,
{ " sl " , { 46 , " slovenian " , } } ,
{ " kn " , { 47 , " kannada " , } } ,
{ " et " , { 48 , " estonian " , } } ,
{ " mk " , { 49 , " macedonian " , } } ,
{ " br " , { 50 , " breton " , } } ,
{ " eu " , { 51 , " basque " , } } ,
{ " is " , { 52 , " icelandic " , } } ,
{ " hy " , { 53 , " armenian " , } } ,
{ " ne " , { 54 , " nepali " , } } ,
{ " mn " , { 55 , " mongolian " , } } ,
{ " bs " , { 56 , " bosnian " , } } ,
{ " kk " , { 57 , " kazakh " , } } ,
{ " sq " , { 58 , " albanian " , } } ,
{ " sw " , { 59 , " swahili " , } } ,
{ " gl " , { 60 , " galician " , } } ,
{ " mr " , { 61 , " marathi " , } } ,
{ " pa " , { 62 , " punjabi " , } } ,
{ " si " , { 63 , " sinhala " , } } ,
{ " km " , { 64 , " khmer " , } } ,
{ " sn " , { 65 , " shona " , } } ,
{ " yo " , { 66 , " yoruba " , } } ,
{ " so " , { 67 , " somali " , } } ,
{ " af " , { 68 , " afrikaans " , } } ,
{ " oc " , { 69 , " occitan " , } } ,
{ " ka " , { 70 , " georgian " , } } ,
{ " be " , { 71 , " belarusian " , } } ,
{ " tg " , { 72 , " tajik " , } } ,
{ " sd " , { 73 , " sindhi " , } } ,
{ " gu " , { 74 , " gujarati " , } } ,
{ " am " , { 75 , " amharic " , } } ,
{ " yi " , { 76 , " yiddish " , } } ,
{ " lo " , { 77 , " lao " , } } ,
{ " uz " , { 78 , " uzbek " , } } ,
{ " fo " , { 79 , " faroese " , } } ,
{ " ht " , { 80 , " haitian creole " , } } ,
{ " ps " , { 81 , " pashto " , } } ,
{ " tk " , { 82 , " turkmen " , } } ,
{ " nn " , { 83 , " nynorsk " , } } ,
{ " mt " , { 84 , " maltese " , } } ,
{ " sa " , { 85 , " sanskrit " , } } ,
{ " lb " , { 86 , " luxembourgish " , } } ,
{ " my " , { 87 , " myanmar " , } } ,
{ " bo " , { 88 , " tibetan " , } } ,
{ " tl " , { 89 , " tagalog " , } } ,
{ " mg " , { 90 , " malagasy " , } } ,
{ " as " , { 91 , " assamese " , } } ,
{ " tt " , { 92 , " tatar " , } } ,
{ " haw " , { 93 , " hawaiian " , } } ,
{ " ln " , { 94 , " lingala " , } } ,
{ " ha " , { 95 , " hausa " , } } ,
{ " ba " , { 96 , " bashkir " , } } ,
{ " jw " , { 97 , " javanese " , } } ,
{ " su " , { 98 , " sundanese " , } } ,
} ;
2023-03-28 07:36:16 +00:00
// number of bytes in one MB as used by the MEM_REQ_* tables (1024 * 1024)
static const size_t MB = 1ull * 1024 * 1024 ;
2022-10-04 17:35:01 +00:00
2023-02-04 07:45:52 +00:00
// Per-model-size memory figures (in bytes) for four scratch areas.
// whisper_model_load adds all four to the "mem required" total that it prints.
// NOTE(review): presumably these also size whisper_state::buf_scratch[0..3] - the
// allocation is not visible in this part of the file, confirm.

// scratch area 0
static const std : : map < e_model , size_t > MEM_REQ_SCRATCH0 = {
2023-04-30 09:14:33 +00:00
{ MODEL_TINY , 62ull * MB } ,
{ MODEL_BASE , 80ull * MB } ,
{ MODEL_SMALL , 120ull * MB } ,
{ MODEL_MEDIUM , 158ull * MB } ,
{ MODEL_LARGE , 198ull * MB } ,
2023-02-04 07:45:52 +00:00
} ;
// scratch area 1
static const std : : map < e_model , size_t > MEM_REQ_SCRATCH1 = {
{ MODEL_TINY , 18ull * MB } ,
{ MODEL_BASE , 24ull * MB } ,
{ MODEL_SMALL , 36ull * MB } ,
{ MODEL_MEDIUM , 48ull * MB } ,
{ MODEL_LARGE , 60ull * MB } ,
} ;
// scratch area 2
static const std : : map < e_model , size_t > MEM_REQ_SCRATCH2 = {
{ MODEL_TINY , 4ull * MB } ,
{ MODEL_BASE , 4ull * MB } ,
{ MODEL_SMALL , 6ull * MB } ,
{ MODEL_MEDIUM , 7ull * MB } ,
{ MODEL_LARGE , 9ull * MB } ,
} ;
// scratch area 3 (same values as scratch area 2)
static const std : : map < e_model , size_t > MEM_REQ_SCRATCH3 = {
{ MODEL_TINY , 4ull * MB } ,
{ MODEL_BASE , 4ull * MB } ,
{ MODEL_SMALL , 6ull * MB } ,
{ MODEL_MEDIUM , 7ull * MB } ,
{ MODEL_LARGE , 9ull * MB } ,
} ;
2023-04-30 15:51:57 +00:00
// Memory figures (in bytes) for the model weights, keyed first by weight tensor type
// and then by model size. whisper_model_load looks the value up with
// (wctx.wtype, model.type), multiplies it by `scale` and uses the result both for the
// printed "mem required" total and as the size of the model buffer.
// The F32 and F16 rows hold the same numbers, as do the Q4_1 and Q5_1 rows.
static const std : : map < ggml_type , std : : map < e_model , size_t > > MEM_REQ_MODEL = {
{ GGML_TYPE_F32 ,
{
{ MODEL_TINY , 74ull * MB } ,
{ MODEL_BASE , 142ull * MB } ,
{ MODEL_SMALL , 466ull * MB } ,
{ MODEL_MEDIUM , 1464ull * MB } ,
{ MODEL_LARGE , 2952ull * MB } ,
} ,
} ,
{ GGML_TYPE_F16 ,
{
{ MODEL_TINY , 74ull * MB } ,
{ MODEL_BASE , 142ull * MB } ,
{ MODEL_SMALL , 466ull * MB } ,
{ MODEL_MEDIUM , 1464ull * MB } ,
{ MODEL_LARGE , 2952ull * MB } ,
} ,
} ,
{ GGML_TYPE_Q4_0 ,
{
{ MODEL_TINY , 26ull * MB } ,
{ MODEL_BASE , 50ull * MB } ,
{ MODEL_SMALL , 154ull * MB } ,
{ MODEL_MEDIUM , 470ull * MB } ,
{ MODEL_LARGE , 940ull * MB } ,
} ,
} ,
{ GGML_TYPE_Q4_1 ,
{
2023-05-01 07:03:56 +00:00
{ MODEL_TINY , 32ull * MB } ,
{ MODEL_BASE , 58ull * MB } ,
{ MODEL_SMALL , 182ull * MB } ,
{ MODEL_MEDIUM , 562ull * MB } ,
{ MODEL_LARGE , 1124ull * MB } ,
2023-04-30 15:51:57 +00:00
} ,
} ,
2023-05-01 07:03:56 +00:00
{ GGML_TYPE_Q5_0 ,
2023-04-30 15:51:57 +00:00
{
2023-05-01 07:03:56 +00:00
{ MODEL_TINY , 30ull * MB } ,
{ MODEL_BASE , 54ull * MB } ,
{ MODEL_SMALL , 170ull * MB } ,
{ MODEL_MEDIUM , 516ull * MB } ,
{ MODEL_LARGE , 1034ull * MB } ,
2023-04-30 15:51:57 +00:00
} ,
} ,
{ GGML_TYPE_Q5_1 ,
{
2023-05-01 07:03:56 +00:00
{ MODEL_TINY , 32ull * MB } ,
{ MODEL_BASE , 58ull * MB } ,
{ MODEL_SMALL , 182ull * MB } ,
{ MODEL_MEDIUM , 562ull * MB } ,
{ MODEL_LARGE , 1124ull * MB } ,
} ,
} ,
{ GGML_TYPE_Q8_0 ,
{
{ MODEL_TINY , 45ull * MB } ,
{ MODEL_BASE , 84ull * MB } ,
{ MODEL_SMALL , 268ull * MB } ,
{ MODEL_MEDIUM , 834ull * MB } ,
{ MODEL_LARGE , 1674ull * MB } ,
2023-04-30 15:51:57 +00:00
} ,
} ,
2022-11-02 16:31:18 +00:00
} ;
2023-01-15 09:29:57 +00:00
// Per-model-size memory figures (in bytes) used by whisper_model_load when it prints
// the memory requirements; each value is multiplied by `scale` there.

// self-attention KV cache; printed as the memory required by one decoder
static const std : : map < e_model , size_t > MEM_REQ_KV_SELF = {
{ MODEL_TINY , 3ull * MB } ,
{ MODEL_BASE , 6ull * MB } ,
{ MODEL_SMALL , 16ull * MB } ,
{ MODEL_MEDIUM , 43ull * MB } ,
{ MODEL_LARGE , 71ull * MB } ,
} ;
// cross-attention KV cache; part of the printed total
static const std : : map < e_model , size_t > MEM_REQ_KV_CROSS = {
{ MODEL_TINY , 9ull * MB } ,
{ MODEL_BASE , 18ull * MB } ,
{ MODEL_SMALL , 53ull * MB } ,
{ MODEL_MEDIUM , 141ull * MB } ,
{ MODEL_LARGE , 235ull * MB } ,
2022-10-04 17:35:01 +00:00
} ;
// encode step; only the larger of the ENCODE and DECODE values enters the printed total
static const std : : map < e_model , size_t > MEM_REQ_ENCODE = {
2023-04-30 09:14:33 +00:00
{ MODEL_TINY , 30ull * MB } ,
{ MODEL_BASE , 38ull * MB } ,
{ MODEL_SMALL , 56ull * MB } ,
{ MODEL_MEDIUM , 74ull * MB } ,
{ MODEL_LARGE , 94ull * MB } ,
2022-10-04 17:35:01 +00:00
} ;
// decode step; see MEM_REQ_ENCODE
static const std : : map < e_model , size_t > MEM_REQ_DECODE = {
2023-02-04 07:45:52 +00:00
{ MODEL_TINY , 3ull * MB } ,
{ MODEL_BASE , 5ull * MB } ,
{ MODEL_SMALL , 10ull * MB } ,
{ MODEL_MEDIUM , 18ull * MB } ,
{ MODEL_LARGE , 27ull * MB } ,
2022-10-04 17:35:01 +00:00
} ;
// Mel spectrogram storage; whisper_state holds one instance in its `mel` member.
// NOTE(review): the field meanings below are inferred from the names only - the code
// that fills this struct is not visible here, confirm before relying on them.
struct whisper_mel {
int n_len ; // presumably the number of frames stored in `data` - TODO confirm
2023-04-15 14:18:43 +00:00
int n_len_org ; // presumably the original (unpadded) number of frames - TODO confirm
2022-10-04 17:35:01 +00:00
int n_mel ; // presumably the number of mel bins per frame - TODO confirm
std : : vector < float > data ;
} ;
// Pre-computed mel filters stored in the model file.
// whisper_model_load reads n_mel and n_fft from the file, resizes `data` to
// n_mel * n_fft elements and then reads that many floats into it.
struct whisper_filters {
int32_t n_mel ;
int32_t n_fft ;
std : : vector < float > data ;
} ;
// Vocabulary: the two directions of the token <-> id mapping plus the ids of the
// special tokens.
//
// The special token ids below are the defaults for n_vocab == 51864.
// whisper_model_load increments every one of them by one when is_multilingual()
// reports true.
struct whisper_vocab {
    using id    = int32_t;
    using token = std::string;

    int n_vocab = 51864;

    std::map<token, id> token_to_id;
    std::map<id, token> id_to_token;

    // reference: https://github.com/openai/whisper/blob/248b6cb124225dd263bb9bd32d060b6517e067f8/whisper/tokenizer.py#L334-L349
    id token_eot        = 50256;
    id token_sot        = 50257;
    // task tokens (used only for multilingual models)
    id token_translate  = 50357;
    id token_transcribe = 50358;
    // other special tokens
    id token_solm       = 50359; // [TDRZ] used by tinydiarize models to indicate speaker turn
    id token_prev       = 50360;
    id token_nosp       = 50361;
    id token_not        = 50362; // no timestamps
    id token_beg        = 50363; // begin timestamps

    // true when the vocabulary size is the one used by multilingual models
    bool is_multilingual() const {
        const int n_vocab_multilingual = 51865;
        return n_vocab == n_vocab_multilingual;
    }
};
2022-10-04 19:43:37 +00:00
// A piece of text together with its tokens; whisper_state::result_all stores a
// vector of these.
struct whisper_segment {
int64_t t0 ; // NOTE(review): presumably the segment start time - units not visible here, confirm
int64_t t1 ; // NOTE(review): presumably the segment end time - confirm
std : : string text ;
2022-10-21 14:33:59 +00:00
std : : vector < whisper_token_data > tokens ;
2023-07-04 06:45:00 +00:00
bool speaker_turn_next ; // NOTE(review): presumably related to the tinydiarize speaker-turn token - confirm
2022-10-04 17:35:01 +00:00
} ;
// medium
// hparams: {
// 'n_mels': 80,
// 'n_vocab': 51864,
// 'n_audio_ctx': 1500,
// 'n_audio_state': 1024,
// 'n_audio_head': 16,
// 'n_audio_layer': 24,
// 'n_text_ctx': 448,
// 'n_text_state': 1024,
// 'n_text_head': 16,
// 'n_text_layer': 24
// }
//
// default hparams (Whisper tiny)
// Model hyper-parameters. whisper_model_load overwrites every field with the value
// read from the model file, in the order in which the fields are declared here.
struct whisper_hparams {
int32_t n_vocab = 51864 ;
int32_t n_audio_ctx = 1500 ;
int32_t n_audio_state = 384 ;
int32_t n_audio_head = 6 ;
int32_t n_audio_layer = 4 ; // whisper_model_load derives the e_model type from this value
int32_t n_text_ctx = 448 ;
int32_t n_text_state = 384 ; // whisper_model_load asserts that this equals n_audio_state
int32_t n_text_head = 6 ;
int32_t n_text_layer = 4 ;
int32_t n_mels = 80 ;
2023-04-30 15:51:57 +00:00
int32_t ftype = 1 ; // after loading, reduced modulo GGML_QNT_VERSION_FACTOR and mapped to the weight type
2022-10-04 17:35:01 +00:00
} ;
// audio encoding layer
// Weight tensors of one encoder layer (element type of whisper_model::layers_encoder).
// `_w` / `_b` suffixes pair a weight tensor with its bias; note that no bias tensor is
// declared for the attention key projection.
struct whisper_layer_encoder {
// encoder.blocks.*.attn_ln
struct ggml_tensor * attn_ln_0_w ;
struct ggml_tensor * attn_ln_0_b ;
// encoder.blocks.*.attn.out
struct ggml_tensor * attn_ln_1_w ;
struct ggml_tensor * attn_ln_1_b ;
// encoder.blocks.*.attn.query
struct ggml_tensor * attn_q_w ;
struct ggml_tensor * attn_q_b ;
// encoder.blocks.*.attn.key
struct ggml_tensor * attn_k_w ;
// encoder.blocks.*.attn.value
struct ggml_tensor * attn_v_w ;
struct ggml_tensor * attn_v_b ;
// encoder.blocks.*.mlp_ln
struct ggml_tensor * mlp_ln_w ;
struct ggml_tensor * mlp_ln_b ;
// encoder.blocks.*.mlp.0
struct ggml_tensor * mlp_0_w ;
struct ggml_tensor * mlp_0_b ;
// encoder.blocks.*.mlp.2
struct ggml_tensor * mlp_1_w ;
struct ggml_tensor * mlp_1_b ;
} ;
// token decoding layer
// Weight tensors of one decoder layer (element type of whisper_model::layers_decoder).
// It declares the same self-attention and MLP tensors as whisper_layer_encoder plus a
// second, `cross_attn_*` set. No bias tensor is declared for either key projection.
struct whisper_layer_decoder {
// decoder.blocks.*.attn_ln
struct ggml_tensor * attn_ln_0_w ;
struct ggml_tensor * attn_ln_0_b ;
// decoder.blocks.*.attn.out
struct ggml_tensor * attn_ln_1_w ;
struct ggml_tensor * attn_ln_1_b ;
// decoder.blocks.*.attn.query
struct ggml_tensor * attn_q_w ;
struct ggml_tensor * attn_q_b ;
// decoder.blocks.*.attn.key
struct ggml_tensor * attn_k_w ;
// decoder.blocks.*.attn.value
struct ggml_tensor * attn_v_w ;
struct ggml_tensor * attn_v_b ;
// decoder.blocks.*.cross_attn_ln
struct ggml_tensor * cross_attn_ln_0_w ;
struct ggml_tensor * cross_attn_ln_0_b ;
// decoder.blocks.*.cross_attn.out
struct ggml_tensor * cross_attn_ln_1_w ;
struct ggml_tensor * cross_attn_ln_1_b ;
// decoder.blocks.*.cross_attn.query
struct ggml_tensor * cross_attn_q_w ;
struct ggml_tensor * cross_attn_q_b ;
// decoder.blocks.*.cross_attn.key
struct ggml_tensor * cross_attn_k_w ;
// decoder.blocks.*.cross_attn.value
struct ggml_tensor * cross_attn_v_w ;
struct ggml_tensor * cross_attn_v_b ;
// decoder.blocks.*.mlp_ln
struct ggml_tensor * mlp_ln_w ;
struct ggml_tensor * mlp_ln_b ;
// decoder.blocks.*.mlp.0
struct ggml_tensor * mlp_0_w ;
struct ggml_tensor * mlp_0_b ;
// decoder.blocks.*.mlp.2
struct ggml_tensor * mlp_1_w ;
struct ggml_tensor * mlp_1_b ;
} ;
2023-01-15 09:29:57 +00:00
// Key/value cache: two ggml tensors (k, v) that live in a ggml context created over
// the memory of `buf` (see kv_cache_init).
//
// All members have default initializers so that a cache that was never passed to
// kv_cache_init is in a well-defined empty state: kv_cache_free() tests `ctx`, and
// kv_cache_init() does not set `n`.
struct whisper_kv_cache {
    struct ggml_tensor * k = nullptr;
    struct ggml_tensor * v = nullptr;

    struct ggml_context * ctx = nullptr;

    // backing memory handed to ggml_init for `ctx`
    std::vector<uint8_t> buf;

    int n = 0; // number of tokens currently in the cache
};
2022-10-04 17:35:01 +00:00
// The loaded model: size class, hyper-parameters, mel filters, the top-level encoder
// and decoder tensors, the per-layer tensors, and the ggml context / buffer that hold
// the tensor data.
struct whisper_model {
e_model type = MODEL_UNKNOWN ;
whisper_hparams hparams ;
whisper_filters filters ;
// encoder.positional_embedding
struct ggml_tensor * e_pe ;
// encoder.conv1
struct ggml_tensor * e_conv_1_w ;
struct ggml_tensor * e_conv_1_b ;
// encoder.conv2
struct ggml_tensor * e_conv_2_w ;
struct ggml_tensor * e_conv_2_b ;
// encoder.ln_post
struct ggml_tensor * e_ln_w ;
struct ggml_tensor * e_ln_b ;
// decoder.positional_embedding
2023-01-15 09:29:57 +00:00
struct ggml_tensor * d_pe ;
2022-10-04 17:35:01 +00:00
// decoder.token_embedding
2023-01-15 09:29:57 +00:00
struct ggml_tensor * d_te ;
2022-10-04 17:35:01 +00:00
// decoder.ln
2023-01-15 09:29:57 +00:00
struct ggml_tensor * d_ln_w ;
struct ggml_tensor * d_ln_b ;
2022-10-04 17:35:01 +00:00
std : : vector < whisper_layer_encoder > layers_encoder ;
std : : vector < whisper_layer_decoder > layers_decoder ;
2022-10-21 12:57:20 +00:00
// context
2022-10-04 17:35:01 +00:00
struct ggml_context * ctx ;
2023-01-15 09:29:57 +00:00
// the model memory buffer is read-only and can be shared between processors
// (raw pointer: whisper_model_load allocates the vector with `new`)
std : : vector < uint8_t > * buf ;
2022-10-21 12:57:20 +00:00
// tensors
int n_loaded ; // NOTE(review): presumably the number of tensors loaded so far - confirm
2022-10-04 17:35:01 +00:00
std : : map < std : : string , struct ggml_tensor * > tensors ;
} ;
2023-01-15 09:29:57 +00:00
// A token sequence together with the statistics kept for it
// (whisper_decoder holds one in its `sequence` member).
struct whisper_sequence {
std : : vector < whisper_token_data > tokens ;
// the accumulated transcription in the current iteration (used to truncate the tokens array)
int result_len ;
double sum_logprobs_all ; // the sum of the log probabilities of the tokens
double sum_logprobs ; // the sum of the log probabilities of the tokens (first result_len tokens)
double avg_logprobs ; // the average log probability of the tokens
double entropy ; // the entropy of the tokens
double score ; // likelihood rank score
} ;
// TAGS: WHISPER_DECODER_INIT
// State of one decoder; whisper_state holds WHISPER_MAX_DECODERS of these in its
// `decoders` array.
struct whisper_decoder {
// each decoder keeps its own KV-cache
whisper_kv_cache kv_self ;
// the currently generated sequence of tokens
whisper_sequence sequence ;
int seek_delta ; // the window shift found so far based on the decoded timestamp tokens
bool failed ; // has the current segment failed to decode?
bool completed ; // has the decoder completed the current segment?
bool has_ts ; // have we already sampled a non-beg timestamp token for the current segment?
// new token probs, logits and logprobs after the last whisper_decode (1-dimensional array: [n_vocab])
std : : vector < float > probs ;
std : : vector < float > logits ;
std : : vector < float > logprobs ;
std : : vector < whisper_token > tokens_tmp ; // used for whisper_decode calls
} ;
2023-03-05 19:42:19 +00:00
struct whisper_state {
2022-10-04 17:35:01 +00:00
int64_t t_sample_us = 0 ;
int64_t t_encode_us = 0 ;
int64_t t_decode_us = 0 ;
2023-03-05 19:42:19 +00:00
int64_t t_mel_us = 0 ;
2022-10-04 17:35:01 +00:00
2023-01-19 16:50:33 +00:00
int32_t n_sample = 0 ; // number of tokens sampled
int32_t n_encode = 0 ; // number of encoder calls
int32_t n_decode = 0 ; // number of decoder calls
int32_t n_fail_p = 0 ; // number of logprob threshold failures
int32_t n_fail_h = 0 ; // number of entropy threshold failures
2023-01-15 09:29:57 +00:00
// cross-attention KV cache for the decoders
// shared between all decoders
whisper_kv_cache kv_cross ;
2023-03-05 19:42:19 +00:00
whisper_mel mel ;
2022-10-04 17:35:01 +00:00
2023-01-15 09:29:57 +00:00
whisper_decoder decoders [ WHISPER_MAX_DECODERS ] = { } ;
// memory buffers used by encode / decode contexts
std : : vector < uint8_t > buf_compute ;
2023-02-04 07:45:52 +00:00
std : : vector < uint8_t > buf_scratch [ WHISPER_MAX_SCRATCH_BUFFERS ] ;
int buf_last = 0 ;
size_t buf_max_size [ WHISPER_MAX_SCRATCH_BUFFERS ] = { 0 } ;
2023-01-15 09:29:57 +00:00
// decode output (2-dimensional array: [n_tokens][n_vocab])
2022-10-04 17:35:01 +00:00
std : : vector < float > logits ;
2022-10-04 19:43:37 +00:00
std : : vector < whisper_segment > result_all ;
2023-01-15 09:29:57 +00:00
std : : vector < whisper_token > prompt_past ;
// work container used to avoid memory allocations
std : : vector < std : : pair < double , whisper_vocab : : id > > logits_id ;
2022-10-07 19:30:44 +00:00
2023-01-15 09:29:57 +00:00
mutable std : : mt19937 rng ; // used for sampling at t > 0.0
2022-11-02 19:18:20 +00:00
2023-02-21 17:00:42 +00:00
int lang_id = 0 ; // english by default
2023-02-05 12:46:26 +00:00
2023-04-15 10:21:27 +00:00
std : : string path_model ; // populated by whisper_init_from_file()
# ifdef WHISPER_USE_COREML
2023-04-29 07:49:02 +00:00
whisper_coreml_context * ctx_coreml = nullptr ;
2023-04-15 10:21:27 +00:00
# endif
2022-11-02 19:18:20 +00:00
// [EXPERIMENTAL] token-level timestamps data
2023-02-21 17:00:42 +00:00
int64_t t_beg = 0 ;
int64_t t_last = 0 ;
2022-11-02 19:18:20 +00:00
whisper_token tid_last ;
std : : vector < float > energy ; // PCM signal energy
2022-11-20 19:12:01 +00:00
// [EXPERIMENTAL] speed-up techniques
2023-02-21 11:42:10 +00:00
int32_t exp_n_audio_ctx = 0 ; // 0 - use default
2023-02-04 07:45:52 +00:00
void use_buf ( struct ggml_context * ctx , int i ) {
# if defined(WHISPER_USE_SCRATCH)
size_t last_size = 0 ;
if ( i = = - 1 ) {
last_size = ggml_set_scratch ( ctx , { 0 , 0 , nullptr , } ) ;
} else {
auto & buf = buf_scratch [ i ] ;
last_size = ggml_set_scratch ( ctx , { 0 , buf . size ( ) , buf . data ( ) , } ) ;
}
if ( buf_last > = 0 ) {
buf_max_size [ buf_last ] = std : : max ( buf_max_size [ buf_last ] , last_size ) ;
}
buf_last = i ;
# else
( void ) i ;
( void ) ctx ;
# endif
}
size_t get_buf_max_mem ( int i ) const {
# if defined(WHISPER_USE_SCRATCH)
return buf_max_size [ i ] ;
# else
( void ) i ;
return 0 ;
# endif
}
2022-10-04 17:35:01 +00:00
} ;
2023-03-05 19:42:19 +00:00
// Top-level context: the loaded model and vocabulary, the tensor types chosen for
// it, load timing, and a pointer to the inference state.
struct whisper_context {
2023-04-30 15:51:57 +00:00
int64_t t_load_us = 0 ;
2023-03-05 19:42:19 +00:00
int64_t t_start_us = 0 ; // set by whisper_model_load from ggml_time_us() when loading starts
2023-04-30 15:51:57 +00:00
ggml_type wtype = ggml_type : : GGML_TYPE_F16 ; // weight type (FP32 / FP16 / QX)
ggml_type itype = ggml_type : : GGML_TYPE_F16 ; // intermediate type (FP32 or FP16)
2023-03-05 19:42:19 +00:00
whisper_model model ;
whisper_vocab vocab ;
whisper_state * state = nullptr ;
2023-03-27 18:00:32 +00:00
std : : string path_model ; // populated by whisper_init_from_file()
2023-03-05 19:42:19 +00:00
} ;
2022-12-10 12:09:31 +00:00
template < typename T >
2023-01-08 11:03:33 +00:00
static void read_safe ( whisper_model_loader * loader , T & dest ) {
loader - > read ( loader - > context , & dest , sizeof ( T ) ) ;
2023-01-23 18:48:10 +00:00
BYTESWAP_VALUE ( dest ) ;
2022-12-10 12:09:31 +00:00
}
2023-01-15 09:29:57 +00:00
static bool kv_cache_init (
const struct whisper_hparams & hparams ,
const size_t mem_bytes ,
struct whisper_kv_cache & cache ,
ggml_type wtype ,
int n_ctx ) {
cache . buf . resize ( mem_bytes ) ;
2023-04-10 19:28:54 +00:00
struct ggml_init_params params = {
/*.mem_size =*/ cache . buf . size ( ) ,
/*.mem_buffer =*/ cache . buf . data ( ) ,
/*.no_alloc =*/ false ,
} ;
2023-01-15 09:29:57 +00:00
cache . ctx = ggml_init ( params ) ;
if ( ! cache . ctx ) {
fprintf ( stderr , " %s: failed to allocate memory for kv cache \n " , __func__ ) ;
return false ;
}
const int n_text_state = hparams . n_text_state ;
const int n_text_layer = hparams . n_text_layer ;
const int n_mem = n_text_layer * n_ctx ;
const int n_elements = n_text_state * n_mem ;
cache . k = ggml_new_tensor_1d ( cache . ctx , wtype , n_elements ) ;
cache . v = ggml_new_tensor_1d ( cache . ctx , wtype , n_elements ) ;
return true ;
}
static bool kv_cache_reinit ( struct whisper_kv_cache & cache ) {
WHISPER_ASSERT ( cache . ctx ) ;
const int n_elements = ggml_nelements ( cache . k ) ;
WHISPER_ASSERT ( n_elements = = ggml_nelements ( cache . v ) ) ;
const ggml_type wtype = cache . k - > type ;
WHISPER_ASSERT ( wtype = = cache . v - > type ) ;
2023-04-30 15:51:57 +00:00
WHISPER_ASSERT ( cache . buf . size ( ) > = 2 * n_elements * ggml_type_sizef ( wtype ) ) ;
2023-01-15 09:29:57 +00:00
2023-04-10 19:28:54 +00:00
struct ggml_init_params params = {
/*.mem_size =*/ cache . buf . size ( ) ,
/*.mem_buffer =*/ cache . buf . data ( ) ,
/*.no_alloc =*/ false ,
} ;
2023-01-15 09:29:57 +00:00
cache . ctx = ggml_init ( params ) ;
if ( ! cache . ctx ) {
fprintf ( stderr , " %s: failed to allocate memory for kv cache \n " , __func__ ) ;
return false ;
}
cache . k = ggml_new_tensor_1d ( cache . ctx , wtype , n_elements ) ;
cache . v = ggml_new_tensor_1d ( cache . ctx , wtype , n_elements ) ;
return true ;
}
// Release the ggml context of a KV cache, if it has one, and reset the pointer so
// that a second call is a no-op. cache.buf, cache.k and cache.v are not touched.
static void kv_cache_free(struct whisper_kv_cache & cache) {
    if (cache.ctx == nullptr) {
        return;
    }

    ggml_free(cache.ctx);
    cache.ctx = nullptr;
}
2022-10-04 17:35:01 +00:00
// load the model from a ggml file
//
// file format:
//
// - hparams
// - pre-computed mel filters
// - vocab
// - weights
//
// see the convert-pt-to-ggml.py script for details
//
2023-01-08 11:03:33 +00:00
static bool whisper_model_load ( struct whisper_model_loader * loader , whisper_context & wctx ) {
fprintf ( stderr , " %s: loading model \n " , __func__ ) ;
2022-10-04 17:35:01 +00:00
2023-01-15 09:29:57 +00:00
const int64_t t_start_us = ggml_time_us ( ) ;
wctx . t_start_us = t_start_us ;
2022-10-04 17:35:01 +00:00
auto & model = wctx . model ;
auto & vocab = wctx . vocab ;
// verify magic
{
uint32_t magic ;
2023-01-08 11:03:33 +00:00
read_safe ( loader , magic ) ;
2023-07-02 18:45:27 +00:00
if ( magic ! = GGML_FILE_MAGIC ) {
2023-01-08 11:03:33 +00:00
fprintf ( stderr , " %s: invalid model data (bad magic) \n " , __func__ ) ;
2022-10-04 17:35:01 +00:00
return false ;
}
}
//load hparams
{
auto & hparams = model . hparams ;
2023-01-08 11:03:33 +00:00
read_safe ( loader , hparams . n_vocab ) ;
read_safe ( loader , hparams . n_audio_ctx ) ;
read_safe ( loader , hparams . n_audio_state ) ;
read_safe ( loader , hparams . n_audio_head ) ;
read_safe ( loader , hparams . n_audio_layer ) ;
read_safe ( loader , hparams . n_text_ctx ) ;
read_safe ( loader , hparams . n_text_state ) ;
read_safe ( loader , hparams . n_text_head ) ;
read_safe ( loader , hparams . n_text_layer ) ;
read_safe ( loader , hparams . n_mels ) ;
2023-04-30 15:51:57 +00:00
read_safe ( loader , hparams . ftype ) ;
2022-10-04 17:35:01 +00:00
assert ( hparams . n_text_state = = hparams . n_audio_state ) ;
if ( hparams . n_audio_layer = = 4 ) {
model . type = e_model : : MODEL_TINY ;
}
if ( hparams . n_audio_layer = = 6 ) {
model . type = e_model : : MODEL_BASE ;
}
if ( hparams . n_audio_layer = = 12 ) {
model . type = e_model : : MODEL_SMALL ;
}
if ( hparams . n_audio_layer = = 24 ) {
model . type = e_model : : MODEL_MEDIUM ;
}
if ( hparams . n_audio_layer = = 32 ) {
model . type = e_model : : MODEL_LARGE ;
}
2023-05-14 15:04:23 +00:00
const int32_t qntvr = hparams . ftype / GGML_QNT_VERSION_FACTOR ;
hparams . ftype % = GGML_QNT_VERSION_FACTOR ;
2023-04-30 15:51:57 +00:00
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
2023-01-15 09:29:57 +00:00
// in order to save memory and also to speed up the computation
2023-04-30 15:51:57 +00:00
wctx . wtype = ggml_ftype_to_ggml_type ( ( ggml_ftype ) ( model . hparams . ftype ) ) ;
if ( wctx . wtype = = GGML_TYPE_COUNT ) {
fprintf ( stderr , " %s: invalid model (bad ftype value %d) \n " , __func__ , model . hparams . ftype ) ;
return false ;
}
2023-01-15 09:29:57 +00:00
2023-04-30 15:51:57 +00:00
const size_t scale = model . hparams . ftype ? 1 : 2 ;
2023-01-15 09:29:57 +00:00
2022-10-08 14:28:06 +00:00
fprintf ( stderr , " %s: n_vocab = %d \n " , __func__ , hparams . n_vocab ) ;
fprintf ( stderr , " %s: n_audio_ctx = %d \n " , __func__ , hparams . n_audio_ctx ) ;
fprintf ( stderr , " %s: n_audio_state = %d \n " , __func__ , hparams . n_audio_state ) ;
fprintf ( stderr , " %s: n_audio_head = %d \n " , __func__ , hparams . n_audio_head ) ;
fprintf ( stderr , " %s: n_audio_layer = %d \n " , __func__ , hparams . n_audio_layer ) ;
fprintf ( stderr , " %s: n_text_ctx = %d \n " , __func__ , hparams . n_text_ctx ) ;
fprintf ( stderr , " %s: n_text_state = %d \n " , __func__ , hparams . n_text_state ) ;
fprintf ( stderr , " %s: n_text_head = %d \n " , __func__ , hparams . n_text_head ) ;
fprintf ( stderr , " %s: n_text_layer = %d \n " , __func__ , hparams . n_text_layer ) ;
fprintf ( stderr , " %s: n_mels = %d \n " , __func__ , hparams . n_mels ) ;
2023-04-30 15:51:57 +00:00
fprintf ( stderr , " %s: ftype = %d \n " , __func__ , model . hparams . ftype ) ;
2023-05-14 15:04:23 +00:00
fprintf ( stderr , " %s: qntvr = %d \n " , __func__ , qntvr ) ;
2022-10-08 14:28:06 +00:00
fprintf ( stderr , " %s: type = %d \n " , __func__ , model . type ) ;
2022-10-04 17:35:01 +00:00
2023-01-15 09:29:57 +00:00
// print memory requirements
{
// this is the total memory required to run the inference
const size_t mem_required =
2023-04-30 15:51:57 +00:00
MEM_REQ_SCRATCH0 . at ( model . type ) +
MEM_REQ_SCRATCH1 . at ( model . type ) +
MEM_REQ_SCRATCH2 . at ( model . type ) +
MEM_REQ_SCRATCH3 . at ( model . type ) +
scale * MEM_REQ_MODEL . at ( wctx . wtype ) . at ( model . type ) +
2023-02-04 07:45:52 +00:00
scale * MEM_REQ_KV_CROSS . at ( model . type ) +
2023-02-19 16:35:01 +00:00
scale * std : : max ( MEM_REQ_ENCODE . at ( model . type ) , MEM_REQ_DECODE . at ( model . type ) ) ;
2023-01-15 09:29:57 +00:00
// this is the memory required by one decoder
const size_t mem_required_decoder =
scale * MEM_REQ_KV_SELF . at ( model . type ) ;
fprintf ( stderr , " %s: mem required = %7.2f MB (+ %7.2f MB per decoder) \n " , __func__ ,
mem_required / 1024.0 / 1024.0 , mem_required_decoder / 1024.0 / 1024.0 ) ;
}
// initialize all memory buffers
// always have at least one decoder
wctx . model . buf = new std : : vector < uint8_t > ( ) ;
2023-04-30 15:51:57 +00:00
wctx . model . buf - > resize ( scale * MEM_REQ_MODEL . at ( wctx . wtype ) . at ( model . type ) ) ;
2023-01-15 09:29:57 +00:00
2023-03-05 19:42:19 +00:00
// we skip initialization of the state until it is needed
// because it might be that state will always be provided externally.
2022-10-04 17:35:01 +00:00
}
// load mel filters
{
auto & filters = wctx . model . filters ;
2023-01-08 11:03:33 +00:00
read_safe ( loader , filters . n_mel ) ;
read_safe ( loader , filters . n_fft ) ;
2022-10-04 17:35:01 +00:00
filters . data . resize ( filters . n_mel * filters . n_fft ) ;
2023-01-08 11:03:33 +00:00
loader - > read ( loader - > context , filters . data . data ( ) , filters . data . size ( ) * sizeof ( float ) ) ;
2023-01-23 18:48:10 +00:00
BYTESWAP_FILTERS ( filters ) ;
2022-10-04 17:35:01 +00:00
}
// load vocab
{
int32_t n_vocab = 0 ;
2023-01-08 11:03:33 +00:00
read_safe ( loader , n_vocab ) ;
2022-10-04 17:35:01 +00:00
//if (n_vocab != model.hparams.n_vocab) {
// fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
// __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
// return false;
//}
std : : string word ;
2022-12-13 21:13:55 +00:00
std : : vector < char > tmp ;
2022-12-30 11:42:35 +00:00
tmp . reserve ( 128 ) ;
2022-10-04 17:35:01 +00:00
for ( int i = 0 ; i < n_vocab ; i + + ) {
uint32_t len ;
2023-01-08 11:03:33 +00:00
read_safe ( loader , len ) ;
2022-10-04 17:35:01 +00:00
2022-12-13 21:13:55 +00:00
if ( len > 0 ) {
tmp . resize ( len ) ;
2023-01-08 11:03:33 +00:00
loader - > read ( loader - > context , & tmp [ 0 ] , tmp . size ( ) ) ; // read to buffer
2022-12-13 21:13:55 +00:00
word . assign ( & tmp [ 0 ] , tmp . size ( ) ) ;
} else {
// seems like we have an empty-string token in multi-language models (i = 50256)
//fprintf(stderr, "%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
word = " " ;
}
2022-10-04 17:35:01 +00:00
vocab . token_to_id [ word ] = i ;
vocab . id_to_token [ i ] = word ;
//printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
}
vocab . n_vocab = model . hparams . n_vocab ;
if ( vocab . is_multilingual ( ) ) {
vocab . token_eot + + ;
vocab . token_sot + + ;
2023-07-04 06:45:00 +00:00
vocab . token_translate + + ;
vocab . token_transcribe + + ;
2022-10-04 17:35:01 +00:00
vocab . token_solm + + ;
2023-07-04 06:45:00 +00:00
vocab . token_prev + + ;
vocab . token_nosp + + ;
2022-10-04 17:35:01 +00:00
vocab . token_not + + ;
vocab . token_beg + + ;
}
if ( n_vocab < model . hparams . n_vocab ) {
2022-10-08 14:28:06 +00:00
fprintf ( stderr , " %s: adding %d extra tokens \n " , __func__ , model . hparams . n_vocab - n_vocab ) ;
2022-10-04 17:35:01 +00:00
for ( int i = n_vocab ; i < model . hparams . n_vocab ; i + + ) {
if ( i > vocab . token_beg ) {
word = " [_TT_ " + std : : to_string ( i - vocab . token_beg ) + " ] " ;
} else if ( i = = vocab . token_eot ) {
word = " [_EOT_] " ;
} else if ( i = = vocab . token_sot ) {
word = " [_SOT_] " ;
2023-07-04 06:45:00 +00:00
} else if ( i = = vocab . token_solm ) {
word = " [_SOLM_] " ;
2022-10-04 17:35:01 +00:00
} else if ( i = = vocab . token_prev ) {
word = " [_PREV_] " ;
2023-07-04 06:45:00 +00:00
} else if ( i = = vocab . token_nosp ) {
word = " [_NOSP_] " ;
2022-10-04 17:35:01 +00:00
} else if ( i = = vocab . token_not ) {
word = " [_NOT_] " ;
} else if ( i = = vocab . token_beg ) {
word = " [_BEG_] " ;
} else {
word = " [_extra_token_ " + std : : to_string ( i ) + " ] " ;
}
vocab . token_to_id [ word ] = i ;
vocab . id_to_token [ i ] = word ;
}
}
2022-11-24 15:54:16 +00:00
}
2023-01-15 09:29:57 +00:00
size_t ctx_size = 0 ;
2023-01-05 20:56:25 +00:00
const ggml_type wtype = wctx . wtype ;
2023-04-30 15:51:57 +00:00
const ggml_type vtype = wctx . wtype = = GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16 ; // conv type
2022-10-04 17:35:01 +00:00
{
const auto & hparams = model . hparams ;
const int n_vocab = hparams . n_vocab ;
const int n_audio_ctx = hparams . n_audio_ctx ;
const int n_audio_state = hparams . n_audio_state ;
const int n_audio_layer = hparams . n_audio_layer ;
2022-11-11 20:33:10 +00:00
const int n_text_ctx = hparams . n_text_ctx ;
2022-10-04 17:35:01 +00:00
const int n_text_state = hparams . n_text_state ;
const int n_text_layer = hparams . n_text_layer ;
const int n_mels = hparams . n_mels ;
// encoder
{
2023-04-30 15:51:57 +00:00
ctx_size + = n_audio_ctx * n_audio_state * ggml_type_sizef ( GGML_TYPE_F32 ) ; // e_pe;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = 3 * n_mels * n_audio_state * ggml_type_sizef ( vtype ) ; // e_conv_1_w
ctx_size + = n_audio_state * ggml_type_sizef ( GGML_TYPE_F32 ) ; // e_conv_1_b
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = 3 * n_audio_state * n_audio_state * ggml_type_sizef ( vtype ) ; // e_conv_2_w
ctx_size + = n_audio_state * ggml_type_sizef ( GGML_TYPE_F32 ) ; // e_conv_2_b
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_audio_state * ggml_type_sizef ( GGML_TYPE_F32 ) ; // e_ln_w;
ctx_size + = n_audio_state * ggml_type_sizef ( GGML_TYPE_F32 ) ; // e_ln_b;
2022-10-04 17:35:01 +00:00
}
// decoder
{
2023-04-30 15:51:57 +00:00
ctx_size + = n_text_ctx * n_text_state * ggml_type_sizef ( GGML_TYPE_F32 ) ; // d_pe;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_vocab * n_text_state * ggml_type_sizef ( wtype ) ; // d_te;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_text_state * ggml_type_sizef ( GGML_TYPE_F32 ) ; // d_ln_w;
ctx_size + = n_text_state * ggml_type_sizef ( GGML_TYPE_F32 ) ; // d_ln_b;
2022-10-04 17:35:01 +00:00
}
// encoder layers
{
2023-04-30 15:51:57 +00:00
ctx_size + = n_audio_layer * ( n_audio_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // mlp_ln_w
ctx_size + = n_audio_layer * ( n_audio_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // mlp_ln_b
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_audio_layer * ( 4 * n_audio_state * n_audio_state * ggml_type_sizef ( wtype ) ) ; // mlp_0_w
ctx_size + = n_audio_layer * ( 4 * n_audio_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // mlp_0_b
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_audio_layer * ( 4 * n_audio_state * n_audio_state * ggml_type_sizef ( wtype ) ) ; // mlp_1_w
ctx_size + = n_audio_layer * ( n_audio_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // mlp_1_b
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_audio_layer * ( n_audio_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // attn_ln_0_w
ctx_size + = n_audio_layer * ( n_audio_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // attn_ln_0_b
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_audio_layer * ( n_audio_state * n_audio_state * ggml_type_sizef ( wtype ) ) ; // attn_q_w
ctx_size + = n_audio_layer * ( n_audio_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // attn_q_b
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_audio_layer * ( n_audio_state * n_audio_state * ggml_type_sizef ( wtype ) ) ; // attn_k_w
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_audio_layer * ( n_audio_state * n_audio_state * ggml_type_sizef ( wtype ) ) ; // attn_v_w
ctx_size + = n_audio_layer * ( n_audio_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // attn_v_b
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_audio_layer * ( n_audio_state * n_audio_state * ggml_type_sizef ( wtype ) ) ; // attn_ln_1_w
ctx_size + = n_audio_layer * ( n_audio_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // attn_ln_1_b
2022-10-04 17:35:01 +00:00
}
// decoder layers
{
2023-04-30 15:51:57 +00:00
ctx_size + = n_text_layer * ( n_text_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // mlp_ln_w
ctx_size + = n_text_layer * ( n_text_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // mlp_ln_b
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_text_layer * ( 4 * n_text_state * n_text_state * ggml_type_sizef ( wtype ) ) ; // mlp_0_w
ctx_size + = n_text_layer * ( 4 * n_text_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // mlp_0_b
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_text_layer * ( 4 * n_text_state * n_text_state * ggml_type_sizef ( wtype ) ) ; // mlp_1_w
ctx_size + = n_text_layer * ( n_text_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // mlp_1_b
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_text_layer * ( n_text_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // attn_ln_0_w
ctx_size + = n_text_layer * ( n_text_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // attn_ln_0_b
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_text_layer * ( n_text_state * n_text_state * ggml_type_sizef ( wtype ) ) ; // attn_q_w
ctx_size + = n_text_layer * ( n_text_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // attn_q_b
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_text_layer * ( n_text_state * n_text_state * ggml_type_sizef ( wtype ) ) ; // attn_k_w
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_text_layer * ( n_text_state * n_text_state * ggml_type_sizef ( wtype ) ) ; // attn_v_w
ctx_size + = n_text_layer * ( n_text_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // attn_v_b
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_text_layer * ( n_text_state * n_text_state * ggml_type_sizef ( wtype ) ) ; // attn_ln_1_w
ctx_size + = n_text_layer * ( n_text_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // attn_ln_1_b
2022-10-04 17:35:01 +00:00
//
2023-04-30 15:51:57 +00:00
ctx_size + = n_text_layer * ( n_text_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // cross_attn_ln_0_w
ctx_size + = n_text_layer * ( n_text_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // cross_attn_ln_0_b
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_text_layer * ( n_text_state * n_text_state * ggml_type_sizef ( wtype ) ) ; // cross_attn_q_w
ctx_size + = n_text_layer * ( n_text_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // cross_attn_q_b
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_text_layer * ( n_text_state * n_text_state * ggml_type_sizef ( wtype ) ) ; // cross_attn_k_w
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_text_layer * ( n_text_state * n_text_state * ggml_type_sizef ( wtype ) ) ; // cross_attn_v_w
ctx_size + = n_text_layer * ( n_text_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // cross_attn_v_b
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
ctx_size + = n_text_layer * ( n_text_state * n_text_state * ggml_type_sizef ( wtype ) ) ; // cross_attn_ln_1_w
ctx_size + = n_text_layer * ( n_text_state * ggml_type_sizef ( GGML_TYPE_F32 ) ) ; // cross_attn_ln_1_b
2022-10-04 17:35:01 +00:00
}
2023-05-14 15:04:23 +00:00
ctx_size + = ( 15 + 15 * n_audio_layer + 24 * n_text_layer ) * 512 ; // object overhead
2022-10-04 17:35:01 +00:00
2023-01-15 09:29:57 +00:00
fprintf ( stderr , " %s: model ctx = %7.2f MB \n " , __func__ , ctx_size / ( 1024.0 * 1024.0 ) ) ;
2022-10-04 17:35:01 +00:00
}
// create the ggml context
{
2023-04-10 19:28:54 +00:00
struct ggml_init_params params = {
/*.mem_size =*/ wctx . model . buf - > size ( ) ,
/*.mem_buffer =*/ wctx . model . buf - > data ( ) ,
/*.no_alloc =*/ false ,
} ;
2022-10-04 17:35:01 +00:00
model . ctx = ggml_init ( params ) ;
if ( ! model . ctx ) {
fprintf ( stderr , " %s: ggml_init() failed \n " , __func__ ) ;
return false ;
}
}
// prepare memory for the weights
{
auto & ctx = model . ctx ;
const auto & hparams = model . hparams ;
const int n_vocab = hparams . n_vocab ;
const int n_audio_ctx = hparams . n_audio_ctx ;
const int n_audio_state = hparams . n_audio_state ;
const int n_audio_layer = hparams . n_audio_layer ;
2022-11-11 20:33:10 +00:00
const int n_text_ctx = hparams . n_text_ctx ;
2022-10-04 17:35:01 +00:00
const int n_text_state = hparams . n_text_state ;
const int n_text_layer = hparams . n_text_layer ;
const int n_mels = hparams . n_mels ;
model . layers_encoder . resize ( n_audio_layer ) ;
model . layers_decoder . resize ( n_text_layer ) ;
// encoder
{
2023-04-30 15:51:57 +00:00
model . e_pe = ggml_new_tensor_2d ( ctx , GGML_TYPE_F32 , n_audio_state , n_audio_ctx ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
model . e_conv_1_w = ggml_new_tensor_3d ( ctx , vtype , 3 , n_mels , n_audio_state ) ;
2022-10-04 17:35:01 +00:00
model . e_conv_1_b = ggml_new_tensor_2d ( ctx , GGML_TYPE_F32 , 1 , n_audio_state ) ;
2023-04-30 15:51:57 +00:00
model . e_conv_2_w = ggml_new_tensor_3d ( ctx , vtype , 3 , n_audio_state , n_audio_state ) ;
2022-10-04 17:35:01 +00:00
model . e_conv_2_b = ggml_new_tensor_2d ( ctx , GGML_TYPE_F32 , 1 , n_audio_state ) ;
2023-04-30 15:51:57 +00:00
model . e_ln_w = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_audio_state ) ;
model . e_ln_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_audio_state ) ;
2022-10-04 17:35:01 +00:00
// map by name
model . tensors [ " encoder.positional_embedding " ] = model . e_pe ;
2023-04-30 15:51:57 +00:00
model . tensors [ " encoder.conv1.weight " ] = model . e_conv_1_w ;
model . tensors [ " encoder.conv1.bias " ] = model . e_conv_1_b ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
model . tensors [ " encoder.conv2.weight " ] = model . e_conv_2_w ;
model . tensors [ " encoder.conv2.bias " ] = model . e_conv_2_b ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
model . tensors [ " encoder.ln_post.weight " ] = model . e_ln_w ;
model . tensors [ " encoder.ln_post.bias " ] = model . e_ln_b ;
2022-10-04 17:35:01 +00:00
for ( int i = 0 ; i < n_audio_layer ; + + i ) {
auto & layer = model . layers_encoder [ i ] ;
2023-04-30 15:51:57 +00:00
layer . mlp_ln_w = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_audio_state ) ;
layer . mlp_ln_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_audio_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . mlp_0_w = ggml_new_tensor_2d ( ctx , wtype , n_audio_state , 4 * n_audio_state ) ;
layer . mlp_0_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , 4 * n_audio_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . mlp_1_w = ggml_new_tensor_2d ( ctx , wtype , 4 * n_audio_state , n_audio_state ) ;
layer . mlp_1_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_audio_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . attn_ln_0_w = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_audio_state ) ;
layer . attn_ln_0_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_audio_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . attn_q_w = ggml_new_tensor_2d ( ctx , wtype , n_audio_state , n_audio_state ) ;
layer . attn_q_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_audio_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . attn_k_w = ggml_new_tensor_2d ( ctx , wtype , n_audio_state , n_audio_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . attn_v_w = ggml_new_tensor_2d ( ctx , wtype , n_audio_state , n_audio_state ) ;
layer . attn_v_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_audio_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . attn_ln_1_w = ggml_new_tensor_2d ( ctx , wtype , n_audio_state , n_audio_state ) ;
layer . attn_ln_1_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_audio_state ) ;
2022-10-04 17:35:01 +00:00
// map by name
2023-04-30 15:51:57 +00:00
model . tensors [ " encoder.blocks. " + std : : to_string ( i ) + " .mlp_ln.weight " ] = layer . mlp_ln_w ;
model . tensors [ " encoder.blocks. " + std : : to_string ( i ) + " .mlp_ln.bias " ] = layer . mlp_ln_b ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
model . tensors [ " encoder.blocks. " + std : : to_string ( i ) + " .mlp.0.weight " ] = layer . mlp_0_w ;
model . tensors [ " encoder.blocks. " + std : : to_string ( i ) + " .mlp.0.bias " ] = layer . mlp_0_b ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
model . tensors [ " encoder.blocks. " + std : : to_string ( i ) + " .mlp.2.weight " ] = layer . mlp_1_w ;
model . tensors [ " encoder.blocks. " + std : : to_string ( i ) + " .mlp.2.bias " ] = layer . mlp_1_b ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
model . tensors [ " encoder.blocks. " + std : : to_string ( i ) + " .attn_ln.weight " ] = layer . attn_ln_0_w ;
model . tensors [ " encoder.blocks. " + std : : to_string ( i ) + " .attn_ln.bias " ] = layer . attn_ln_0_b ;
2022-10-04 17:35:01 +00:00
model . tensors [ " encoder.blocks. " + std : : to_string ( i ) + " .attn.query.weight " ] = layer . attn_q_w ;
model . tensors [ " encoder.blocks. " + std : : to_string ( i ) + " .attn.query.bias " ] = layer . attn_q_b ;
2023-04-30 15:51:57 +00:00
model . tensors [ " encoder.blocks. " + std : : to_string ( i ) + " .attn.key.weight " ] = layer . attn_k_w ;
2022-10-04 17:35:01 +00:00
model . tensors [ " encoder.blocks. " + std : : to_string ( i ) + " .attn.value.weight " ] = layer . attn_v_w ;
model . tensors [ " encoder.blocks. " + std : : to_string ( i ) + " .attn.value.bias " ] = layer . attn_v_b ;
2023-04-30 15:51:57 +00:00
model . tensors [ " encoder.blocks. " + std : : to_string ( i ) + " .attn.out.weight " ] = layer . attn_ln_1_w ;
model . tensors [ " encoder.blocks. " + std : : to_string ( i ) + " .attn.out.bias " ] = layer . attn_ln_1_b ;
2022-10-04 17:35:01 +00:00
}
}
// decoder
{
2023-04-30 15:51:57 +00:00
model . d_pe = ggml_new_tensor_2d ( ctx , GGML_TYPE_F32 , n_text_state , n_text_ctx ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
model . d_te = ggml_new_tensor_2d ( ctx , wtype , n_text_state , n_vocab ) ;
2022-10-04 17:35:01 +00:00
model . d_ln_w = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_text_state ) ;
model . d_ln_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_text_state ) ;
// map by name
2023-04-30 15:51:57 +00:00
model . tensors [ " decoder.positional_embedding " ] = model . d_pe ;
2022-10-04 17:35:01 +00:00
model . tensors [ " decoder.token_embedding.weight " ] = model . d_te ;
2023-04-30 15:51:57 +00:00
model . tensors [ " decoder.ln.weight " ] = model . d_ln_w ;
model . tensors [ " decoder.ln.bias " ] = model . d_ln_b ;
2022-10-04 17:35:01 +00:00
for ( int i = 0 ; i < n_text_layer ; + + i ) {
auto & layer = model . layers_decoder [ i ] ;
2023-04-30 15:51:57 +00:00
layer . mlp_ln_w = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_text_state ) ;
layer . mlp_ln_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_text_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . mlp_0_w = ggml_new_tensor_2d ( ctx , wtype , n_text_state , 4 * n_text_state ) ;
layer . mlp_0_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , 4 * n_text_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . mlp_1_w = ggml_new_tensor_2d ( ctx , wtype , 4 * n_text_state , n_text_state ) ;
layer . mlp_1_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_text_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . attn_ln_0_w = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_text_state ) ;
layer . attn_ln_0_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_text_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . attn_q_w = ggml_new_tensor_2d ( ctx , wtype , n_text_state , n_text_state ) ;
layer . attn_q_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_text_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . attn_k_w = ggml_new_tensor_2d ( ctx , wtype , n_text_state , n_text_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . attn_v_w = ggml_new_tensor_2d ( ctx , wtype , n_text_state , n_text_state ) ;
layer . attn_v_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_text_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . attn_ln_1_w = ggml_new_tensor_2d ( ctx , wtype , n_text_state , n_text_state ) ;
layer . attn_ln_1_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_text_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . cross_attn_ln_0_w = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_text_state ) ;
layer . cross_attn_ln_0_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_text_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . cross_attn_q_w = ggml_new_tensor_2d ( ctx , wtype , n_text_state , n_text_state ) ;
layer . cross_attn_q_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_text_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . cross_attn_k_w = ggml_new_tensor_2d ( ctx , wtype , n_text_state , n_text_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . cross_attn_v_w = ggml_new_tensor_2d ( ctx , wtype , n_text_state , n_text_state ) ;
layer . cross_attn_v_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_text_state ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
layer . cross_attn_ln_1_w = ggml_new_tensor_2d ( ctx , wtype , n_text_state , n_text_state ) ;
layer . cross_attn_ln_1_b = ggml_new_tensor_1d ( ctx , GGML_TYPE_F32 , n_text_state ) ;
2022-10-04 17:35:01 +00:00
// map by name
2023-04-30 15:51:57 +00:00
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .mlp_ln.weight " ] = layer . mlp_ln_w ;
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .mlp_ln.bias " ] = layer . mlp_ln_b ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .mlp.0.weight " ] = layer . mlp_0_w ;
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .mlp.0.bias " ] = layer . mlp_0_b ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .mlp.2.weight " ] = layer . mlp_1_w ;
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .mlp.2.bias " ] = layer . mlp_1_b ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .attn_ln.weight " ] = layer . attn_ln_0_w ;
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .attn_ln.bias " ] = layer . attn_ln_0_b ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .attn.query.weight " ] = layer . attn_q_w ;
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .attn.query.bias " ] = layer . attn_q_b ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .attn.key.weight " ] = layer . attn_k_w ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .attn.value.weight " ] = layer . attn_v_w ;
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .attn.value.bias " ] = layer . attn_v_b ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .attn.out.weight " ] = layer . attn_ln_1_w ;
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .attn.out.bias " ] = layer . attn_ln_1_b ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .cross_attn_ln.weight " ] = layer . cross_attn_ln_0_w ;
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .cross_attn_ln.bias " ] = layer . cross_attn_ln_0_b ;
2022-10-04 17:35:01 +00:00
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .cross_attn.query.weight " ] = layer . cross_attn_q_w ;
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .cross_attn.query.bias " ] = layer . cross_attn_q_b ;
2023-04-30 15:51:57 +00:00
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .cross_attn.key.weight " ] = layer . cross_attn_k_w ;
2022-10-04 17:35:01 +00:00
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .cross_attn.value.weight " ] = layer . cross_attn_v_w ;
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .cross_attn.value.bias " ] = layer . cross_attn_v_b ;
2023-04-30 15:51:57 +00:00
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .cross_attn.out.weight " ] = layer . cross_attn_ln_1_w ;
model . tensors [ " decoder.blocks. " + std : : to_string ( i ) + " .cross_attn.out.bias " ] = layer . cross_attn_ln_1_b ;
2022-10-04 17:35:01 +00:00
}
}
}
// load weights
{
size_t total_size = 0 ;
2022-10-21 12:57:20 +00:00
model . n_loaded = 0 ;
2022-10-04 17:35:01 +00:00
while ( true ) {
int32_t n_dims ;
int32_t length ;
2023-04-30 15:51:57 +00:00
int32_t ttype ;
2022-10-04 17:35:01 +00:00
2023-01-08 11:03:33 +00:00
read_safe ( loader , n_dims ) ;
read_safe ( loader , length ) ;
2023-04-30 15:51:57 +00:00
read_safe ( loader , ttype ) ;
2022-10-04 17:35:01 +00:00
2023-01-08 11:03:33 +00:00
if ( loader - > eof ( loader - > context ) ) {
2022-10-04 17:35:01 +00:00
break ;
}
2023-04-14 16:16:34 +00:00
int32_t nelements = 1 ;
2023-04-30 19:50:04 +00:00
int32_t ne [ 4 ] = { 1 , 1 , 1 , 1 } ;
2022-10-04 17:35:01 +00:00
for ( int i = 0 ; i < n_dims ; + + i ) {
2023-04-14 16:16:34 +00:00
read_safe ( loader , ne [ i ] ) ;
2022-10-04 17:35:01 +00:00
nelements * = ne [ i ] ;
}
2022-12-10 12:09:31 +00:00
std : : string name ;
std : : vector < char > tmp ( length ) ; // create a buffer
2023-01-08 11:03:33 +00:00
loader - > read ( loader - > context , & tmp [ 0 ] , tmp . size ( ) ) ; // read to buffer
2022-12-10 12:09:31 +00:00
name . assign ( & tmp [ 0 ] , tmp . size ( ) ) ;
2022-10-04 17:35:01 +00:00
2022-12-22 15:06:19 +00:00
if ( model . tensors . find ( name ) = = model . tensors . end ( ) ) {
2022-10-04 17:35:01 +00:00
fprintf ( stderr , " %s: unknown tensor '%s' in model file \n " , __func__ , name . data ( ) ) ;
return false ;
}
auto tensor = model . tensors [ name . data ( ) ] ;
if ( ggml_nelements ( tensor ) ! = nelements ) {
fprintf ( stderr , " %s: tensor '%s' has wrong size in model file \n " , __func__ , name . data ( ) ) ;
2023-04-30 19:50:04 +00:00
fprintf ( stderr , " %s: shape: [%d, %d, %d], expected: [%d, %d, %d] \n " ,
__func__ , ne [ 0 ] , ne [ 1 ] , ne [ 2 ] , ( int ) tensor - > ne [ 0 ] , ( int ) tensor - > ne [ 1 ] , ( int ) tensor - > ne [ 2 ] ) ;
2022-10-04 17:35:01 +00:00
return false ;
}
if ( tensor - > ne [ 0 ] ! = ne [ 0 ] | | tensor - > ne [ 1 ] ! = ne [ 1 ] | | tensor - > ne [ 2 ] ! = ne [ 2 ] ) {
2023-04-14 16:16:34 +00:00
fprintf ( stderr , " %s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d] \n " ,
__func__ , name . data ( ) , ( int ) tensor - > ne [ 0 ] , ( int ) tensor - > ne [ 1 ] , ( int ) tensor - > ne [ 2 ] , ne [ 0 ] , ne [ 1 ] , ne [ 2 ] ) ;
2022-10-04 17:35:01 +00:00
return false ;
}
2023-04-30 15:51:57 +00:00
const size_t bpe = ggml_type_size ( ggml_type ( ttype ) ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
if ( ( nelements * bpe ) / ggml_blck_size ( tensor - > type ) ! = ggml_nbytes ( tensor ) ) {
2023-04-14 16:16:34 +00:00
fprintf ( stderr , " %s: tensor '%s' has wrong size in model file: got %zu, expected %zu \n " ,
2022-10-04 17:35:01 +00:00
__func__ , name . data ( ) , ggml_nbytes ( tensor ) , nelements * bpe ) ;
return false ;
}
2023-01-08 11:03:33 +00:00
loader - > read ( loader - > context , tensor - > data , ggml_nbytes ( tensor ) ) ;
2023-01-23 18:48:10 +00:00
BYTESWAP_TENSOR ( tensor ) ;
2022-10-04 17:35:01 +00:00
2023-04-30 15:51:57 +00:00
//printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1024.0/1024.0);
2022-10-04 17:35:01 +00:00
total_size + = ggml_nbytes ( tensor ) ;
2022-10-21 12:57:20 +00:00
model . n_loaded + + ;
2022-10-04 17:35:01 +00:00
}
2022-11-24 15:54:16 +00:00
fprintf ( stderr , " %s: model size = %7.2f MB \n " , __func__ , total_size / 1024.0 / 1024.0 ) ;
2022-10-08 07:56:59 +00:00
2022-10-21 12:57:20 +00:00
if ( model . n_loaded = = 0 ) {
2022-10-08 14:28:06 +00:00
fprintf ( stderr , " %s: WARN no tensors loaded from model file - assuming empty model for testing \n " , __func__ ) ;
2022-10-21 12:57:20 +00:00
} else if ( model . n_loaded ! = ( int ) model . tensors . size ( ) ) {
fprintf ( stderr , " %s: ERROR not all tensors loaded from model file - expected %zu, got %d \n " , __func__ , model . tensors . size ( ) , model . n_loaded ) ;
2022-10-08 07:56:59 +00:00
return false ;
}
2022-10-04 17:35:01 +00:00
}
2023-01-15 09:29:57 +00:00
wctx . t_load_us = ggml_time_us ( ) - t_start_us ;
2022-10-04 17:35:01 +00:00
return true ;
}
2023-03-05 19:42:19 +00:00
// evaluate the encoder with the given state
2022-10-04 17:35:01 +00:00
//
// given an audio recording (more specifically, its log mel spectrogram), runs the forward pass of the encoder
// part of the transformer model and returns the encoded features
//
2023-03-05 19:42:19 +00:00
// - wctx: the whisper context (the model is read from wctx.model)
// - wstate: the state used by the encoder (provides the mel input and the compute buffer)
2022-10-04 17:35:01 +00:00
// - n_threads: number of threads to use
// - mel_offset: offset in the mel spectrogram (i.e. audio offset)
//
2023-03-05 19:42:19 +00:00
static bool whisper_encode_internal (
2023-01-15 09:29:57 +00:00
whisper_context & wctx ,
2023-03-05 19:42:19 +00:00
whisper_state & wstate ,
2023-01-15 09:29:57 +00:00
const int mel_offset ,
2023-03-05 19:42:19 +00:00
const int n_threads ) {
2023-01-15 09:29:57 +00:00
const int64_t t_start_us = ggml_time_us ( ) ;
2022-10-04 17:35:01 +00:00
const auto & model = wctx . model ;
2023-03-05 19:42:19 +00:00
const auto & mel_inp = wstate . mel ;
2022-10-04 17:35:01 +00:00
const auto & hparams = model . hparams ;
2023-03-05 19:42:19 +00:00
const int n_ctx = wstate . exp_n_audio_ctx > 0 ? wstate . exp_n_audio_ctx : hparams . n_audio_ctx ;
2022-10-04 17:35:01 +00:00
const int n_state = hparams . n_audio_state ;
const int n_head = hparams . n_audio_head ;
const int n_layer = hparams . n_audio_layer ;
const int n_mels = hparams . n_mels ;
assert ( mel_inp . n_mel = = n_mels ) ;
2023-04-10 19:28:54 +00:00
struct ggml_init_params params = {
/*.mem_size =*/ wstate . buf_compute . size ( ) ,
/*.mem_buffer =*/ wstate . buf_compute . data ( ) ,
/*.no_alloc =*/ false ,
} ;
2022-10-04 17:35:01 +00:00
struct ggml_context * ctx0 = ggml_init ( params ) ;
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
2022-10-04 17:35:01 +00:00
struct ggml_tensor * mel = ggml_new_tensor_2d ( ctx0 , GGML_TYPE_F32 , 2 * n_ctx , n_mels ) ;
assert ( mel - > type = = GGML_TYPE_F32 ) ;
{
float * dst = ( float * ) mel - > data ;
memset ( dst , 0 , ggml_nbytes ( mel ) ) ;
const int i0 = std : : min ( mel_offset , mel_inp . n_len ) ;
const int i1 = std : : min ( mel_offset + 2 * n_ctx , mel_inp . n_len ) ;
for ( int j = 0 ; j < mel_inp . n_mel ; + + j ) {
for ( int i = i0 ; i < i1 ; + + i ) {
dst [ j * 2 * n_ctx + ( i - i0 ) ] = mel_inp . data [ j * mel_inp . n_len + i ] ;
}
}
}
struct ggml_tensor * cur ;
2023-04-29 07:49:02 +00:00
# ifndef WHISPER_USE_COREML
const bool use_coreml = false ;
# else
const bool use_coreml = wstate . ctx_coreml ! = nullptr ;
# endif
2023-04-29 07:55:24 +00:00
if ( ! use_coreml ) {
2023-04-29 07:49:02 +00:00
// convolution + gelu
{
wstate . use_buf ( ctx0 , 1 ) ;
2023-02-04 07:45:52 +00:00
2023-07-02 18:45:27 +00:00
cur = ggml_conv_1d_ph ( ctx0 , model . e_conv_1_w , mel , 1 , 1 ) ;
2023-04-29 07:49:02 +00:00
cur = ggml_add ( ctx0 ,
2023-04-29 07:55:24 +00:00
ggml_repeat ( ctx0 ,
model . e_conv_1_b ,
cur ) ,
cur ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
cur = ggml_gelu ( ctx0 , cur ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
2023-07-02 18:45:27 +00:00
cur = ggml_conv_1d_ph ( ctx0 , model . e_conv_2_w , cur , 2 , 1 ) ;
2023-04-29 07:49:02 +00:00
cur = ggml_add ( ctx0 ,
2023-04-29 07:55:24 +00:00
ggml_repeat ( ctx0 ,
model . e_conv_2_b ,
cur ) ,
cur ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
cur = ggml_gelu ( ctx0 , cur ) ;
}
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
wstate . use_buf ( ctx0 , 3 ) ;
2023-02-04 07:45:52 +00:00
2023-04-29 07:49:02 +00:00
// ===================================================================
// NOTE: experimenting with partial evaluation of the encoder (ignore)
//static int iter = -1;
//const int n_iter = 1500/n_ctx;
2022-11-11 20:33:10 +00:00
2023-04-29 07:49:02 +00:00
//iter = (iter + 1) % n_iter;
2022-11-11 20:33:10 +00:00
2023-04-29 07:49:02 +00:00
//if (iter == 0) {
// memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k));
// memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v));
//}
2022-11-11 20:33:10 +00:00
2023-04-29 07:49:02 +00:00
static int iter = 0 ;
2023-03-22 19:37:36 +00:00
2023-04-29 07:49:02 +00:00
const size_t e_pe_stride = model . e_pe - > ne [ 0 ] * ggml_element_size ( model . e_pe ) ;
const size_t e_pe_offset = model . e_pe - > ne [ 0 ] * ggml_element_size ( model . e_pe ) * n_ctx * iter ;
2022-11-11 20:33:10 +00:00
2023-04-29 07:49:02 +00:00
struct ggml_tensor * e_pe = ggml_view_2d ( ctx0 , model . e_pe , model . e_pe - > ne [ 0 ] , n_ctx , e_pe_stride , e_pe_offset ) ;
2022-11-11 20:33:10 +00:00
2023-04-29 07:49:02 +00:00
cur = ggml_add ( ctx0 , e_pe , ggml_transpose ( ctx0 , cur ) ) ;
2023-02-04 07:45:52 +00:00
2023-04-29 07:49:02 +00:00
// ===================================================================
2022-11-20 19:12:01 +00:00
2023-04-29 07:49:02 +00:00
// original:
//cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
struct ggml_tensor * inpL = cur ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
for ( int il = 0 ; il < n_layer ; + + il ) {
const auto & layer = model . layers_encoder [ il ] ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
// norm
{
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
2023-04-29 07:49:02 +00:00
cur = ggml_norm ( ctx0 , inpL ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
// cur = ln_0_w*cur + ln_0_b
cur = ggml_add ( ctx0 ,
2023-04-29 07:55:24 +00:00
ggml_mul ( ctx0 ,
ggml_repeat ( ctx0 , layer . attn_ln_0_w , cur ) ,
cur ) ,
ggml_repeat ( ctx0 , layer . attn_ln_0_b , cur ) ) ;
2023-04-29 07:49:02 +00:00
}
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
// self-attention
{
wstate . use_buf ( ctx0 , 1 ) ;
2023-02-04 07:45:52 +00:00
2023-04-29 07:49:02 +00:00
struct ggml_tensor * Qcur = ggml_mul_mat ( ctx0 ,
2023-04-29 07:55:24 +00:00
layer . attn_q_w ,
cur ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
Qcur = ggml_add ( ctx0 ,
2023-04-29 07:55:24 +00:00
ggml_repeat ( ctx0 ,
layer . attn_q_b ,
Qcur ) ,
Qcur ) ;
2022-10-04 17:35:01 +00:00
2023-05-14 15:04:23 +00:00
//Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
// note: no bias for Key
struct ggml_tensor * Kcur = ggml_mul_mat ( ctx0 ,
2023-04-29 07:55:24 +00:00
layer . attn_k_w ,
cur ) ;
2022-10-04 17:35:01 +00:00
2023-05-14 15:04:23 +00:00
//Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
struct ggml_tensor * Vcur = ggml_mul_mat ( ctx0 ,
2023-04-29 07:55:24 +00:00
layer . attn_v_w ,
cur ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
Vcur = ggml_add ( ctx0 ,
2023-04-29 07:55:24 +00:00
ggml_repeat ( ctx0 ,
layer . attn_v_b ,
Vcur ) ,
Vcur ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
// ------
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
2023-04-29 07:55:24 +00:00
# ifdef WHISPER_USE_FLASH_ATTN
2023-04-29 07:49:02 +00:00
struct ggml_tensor * Q =
ggml_permute ( ctx0 ,
ggml_cpy ( ctx0 ,
Qcur ,
2023-04-30 15:51:57 +00:00
ggml_new_tensor_3d ( ctx0 , wctx . itype , n_state / n_head , n_head , n_ctx ) ) ,
2023-04-29 07:49:02 +00:00
0 , 2 , 1 , 3 ) ;
struct ggml_tensor * K =
ggml_permute ( ctx0 ,
ggml_cpy ( ctx0 ,
Kcur ,
2023-04-30 15:51:57 +00:00
ggml_new_tensor_3d ( ctx0 , wctx . itype , n_state / n_head , n_head , n_ctx ) ) ,
2023-04-29 07:49:02 +00:00
0 , 2 , 1 , 3 ) ;
struct ggml_tensor * V =
ggml_cpy ( ctx0 ,
ggml_permute ( ctx0 ,
ggml_reshape_3d ( ctx0 ,
Vcur ,
n_state / n_head , n_head , n_ctx ) ,
1 , 2 , 0 , 3 ) ,
2023-04-30 15:51:57 +00:00
ggml_new_tensor_3d ( ctx0 , wctx . itype , n_ctx , n_state / n_head , n_head ) ) ;
2023-04-29 07:49:02 +00:00
struct ggml_tensor * KQV = ggml_flash_attn ( ctx0 , Q , K , V , false ) ;
2023-04-29 07:55:24 +00:00
# else
2023-04-29 07:49:02 +00:00
struct ggml_tensor * Q =
ggml_permute ( ctx0 ,
ggml_cpy ( ctx0 ,
Qcur ,
ggml_new_tensor_3d ( ctx0 , GGML_TYPE_F32 , n_state / n_head , n_head , n_ctx ) ) ,
0 , 2 , 1 , 3 ) ;
struct ggml_tensor * K =
ggml_permute ( ctx0 ,
ggml_cpy ( ctx0 ,
Kcur ,
2023-04-30 15:51:57 +00:00
ggml_new_tensor_3d ( ctx0 , wctx . itype , n_state / n_head , n_head , n_ctx ) ) ,
2023-04-29 07:49:02 +00:00
0 , 2 , 1 , 3 ) ;
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat ( ctx0 , K , Q ) ;
struct ggml_tensor * KQ_scaled =
2023-05-14 15:04:23 +00:00
ggml_scale_inplace ( ctx0 ,
2023-04-29 07:49:02 +00:00
KQ ,
ggml_new_f32 ( ctx0 , 1.0f / sqrt ( float ( n_state ) / n_head ) )
) ;
2023-05-14 15:04:23 +00:00
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace ( ctx0 , KQ_scaled ) ;
2023-04-29 07:49:02 +00:00
struct ggml_tensor * V =
ggml_cpy ( ctx0 ,
ggml_permute ( ctx0 ,
ggml_reshape_3d ( ctx0 ,
Vcur ,
n_state / n_head , n_head , n_ctx ) ,
2023-04-30 09:14:33 +00:00
1 , 2 , 0 , 3 ) ,
2023-04-30 15:51:57 +00:00
ggml_new_tensor_3d ( ctx0 , wctx . itype , n_ctx , n_state / n_head , n_head )
2023-04-29 07:49:02 +00:00
) ;
2023-04-30 09:14:33 +00:00
struct ggml_tensor * KQV = ggml_mul_mat ( ctx0 , V , KQ_soft_max ) ;
2023-04-29 07:55:24 +00:00
# endif
2023-04-29 07:49:02 +00:00
struct ggml_tensor * KQV_merged = ggml_permute ( ctx0 , KQV , 0 , 2 , 1 , 3 ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
wstate . use_buf ( ctx0 , 1 ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
cur = ggml_cpy ( ctx0 ,
2023-04-29 07:55:24 +00:00
KQV_merged ,
ggml_new_tensor_2d ( ctx0 , GGML_TYPE_F32 , n_state , n_ctx ) ) ;
2023-04-29 07:49:02 +00:00
}
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
// projection
{
wstate . use_buf ( ctx0 , 0 ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
cur = ggml_mul_mat ( ctx0 ,
2023-04-29 07:55:24 +00:00
layer . attn_ln_1_w ,
cur ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
wstate . use_buf ( ctx0 , 1 ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
cur = ggml_add ( ctx0 ,
2023-04-29 07:55:24 +00:00
ggml_repeat ( ctx0 , layer . attn_ln_1_b , cur ) ,
cur ) ;
2023-04-29 07:49:02 +00:00
}
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
wstate . use_buf ( ctx0 , 2 ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
// add the input
cur = ggml_add ( ctx0 , cur , inpL ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
struct ggml_tensor * inpFF = cur ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
// feed-forward network
{
// norm
{
wstate . use_buf ( ctx0 , 0 ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
cur = ggml_norm ( ctx0 , inpFF ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
wstate . use_buf ( ctx0 , 1 ) ;
2023-02-04 07:45:52 +00:00
2023-04-29 07:49:02 +00:00
// cur = mlp_ln_w*cur + mlp_ln_b
cur = ggml_add ( ctx0 ,
2023-04-29 07:55:24 +00:00
ggml_mul ( ctx0 ,
ggml_repeat ( ctx0 , layer . mlp_ln_w , cur ) ,
cur ) ,
ggml_repeat ( ctx0 , layer . mlp_ln_b , cur ) ) ;
2023-04-29 07:49:02 +00:00
}
2022-10-04 17:35:01 +00:00
2023-04-29 07:55:24 +00:00
# ifdef WHISPER_USE_FLASH_FF
2023-04-29 07:49:02 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
2023-04-29 07:49:02 +00:00
cur = ggml_flash_ff ( ctx0 ,
2023-04-30 15:51:57 +00:00
ggml_cpy ( ctx0 , cur , ggml_new_tensor_2d ( ctx0 , wstate . itype , n_state , n_ctx ) ) ,
2023-04-29 07:55:24 +00:00
layer . mlp_0_w , layer . mlp_0_b , layer . mlp_1_w , layer . mlp_1_b ) ;
# else
2023-04-29 07:49:02 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
// fully connected
cur = ggml_mul_mat ( ctx0 ,
2023-04-29 07:55:24 +00:00
layer . mlp_0_w ,
cur ) ;
2023-02-04 07:45:52 +00:00
2023-04-29 07:49:02 +00:00
wstate . use_buf ( ctx0 , 1 ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
cur = ggml_add ( ctx0 ,
2023-04-29 07:55:24 +00:00
ggml_repeat ( ctx0 , layer . mlp_0_b , cur ) ,
cur ) ;
2022-10-04 17:35:01 +00:00
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
2023-04-29 07:49:02 +00:00
// GELU activation
cur = ggml_gelu ( ctx0 , cur ) ;
2023-02-04 07:45:52 +00:00
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 1 ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
// projection
cur = ggml_mul_mat ( ctx0 ,
2023-04-29 07:55:24 +00:00
layer . mlp_1_w ,
cur ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
2023-04-29 07:49:02 +00:00
cur = ggml_add ( ctx0 ,
2023-04-29 07:55:24 +00:00
ggml_repeat ( ctx0 , layer . mlp_1_b , cur ) ,
cur ) ;
# endif
2023-04-29 07:49:02 +00:00
}
2023-02-04 07:45:52 +00:00
2023-04-29 07:49:02 +00:00
wstate . use_buf ( ctx0 , 3 ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
inpL = ggml_add ( ctx0 , cur , inpFF ) ;
}
2023-02-04 07:45:52 +00:00
2023-04-29 07:49:02 +00:00
cur = inpL ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
// norm
{
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
2023-04-29 07:49:02 +00:00
cur = ggml_norm ( ctx0 , cur ) ;
2023-02-04 07:45:52 +00:00
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 1 ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
// cur = ln_f_g*cur + ln_f_b
2023-02-04 07:45:52 +00:00
cur = ggml_add ( ctx0 ,
2023-04-29 07:55:24 +00:00
ggml_mul ( ctx0 ,
ggml_repeat ( ctx0 , model . e_ln_w , cur ) ,
cur ) ,
ggml_repeat ( ctx0 , model . e_ln_b , cur ) ) ;
2023-03-27 18:00:32 +00:00
}
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
wstate . use_buf ( ctx0 , - 1 ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
// run the computation
{
struct ggml_cgraph gf = { } ;
gf . n_threads = n_threads ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
ggml_build_forward_expand ( & gf , cur ) ;
ggml_graph_compute ( ctx0 , & gf ) ;
2023-02-04 07:45:52 +00:00
2023-04-29 07:49:02 +00:00
//ggml_graph_print(&gf);
}
2022-10-04 17:35:01 +00:00
}
2023-04-29 07:49:02 +00:00
# ifdef WHISPER_USE_COREML
else
2022-10-04 17:35:01 +00:00
{
2023-04-29 07:49:02 +00:00
wstate . use_buf ( ctx0 , - 1 ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
cur = ggml_new_tensor_2d ( ctx0 , GGML_TYPE_F32 , n_state , n_ctx ) ;
2022-10-04 17:35:01 +00:00
2023-04-29 07:49:02 +00:00
whisper_coreml_encode ( wstate . ctx_coreml , ( float * ) mel - > data , ( float * ) cur - > data ) ;
2022-10-04 17:35:01 +00:00
}
2023-04-15 10:21:27 +00:00
# endif
2022-10-04 17:35:01 +00:00
// cur
//{
// printf("ne0 = %d\n", cur->ne[0]);
// printf("ne1 = %d\n", cur->ne[1]);
// for (int i = 0; i < 10; ++i) {
// printf("%8.4f ", ((float *)(cur->data))[i]);
// }
// printf("... ");
// for (int i = cur->ne[0] - 10; i < cur->ne[0]; ++i) {
// printf("%8.4f ", ((float *)(cur->data))[i]);
// }
// printf("\n");
//}
// pre-compute cross-attention memory
{
2022-10-05 18:34:41 +00:00
struct ggml_cgraph gf = { } ;
gf . n_threads = n_threads ;
2022-10-04 17:35:01 +00:00
// TODO: hack to disconnect the encoded features from the previous graph
cur - > op = GGML_OP_NONE ;
2022-12-22 14:35:18 +00:00
cur - > src0 = nullptr ;
cur - > src1 = nullptr ;
2022-10-04 17:35:01 +00:00
for ( int il = 0 ; il < model . hparams . n_text_layer ; + + il ) {
2023-03-05 19:42:19 +00:00
auto & layer = model . layers_decoder [ il ] ;
2022-10-04 17:35:01 +00:00
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
2023-03-05 19:42:19 +00:00
struct ggml_tensor * Kcross = ggml_mul_mat ( ctx0 ,
layer . cross_attn_k_w ,
cur ) ;
2022-10-04 17:35:01 +00:00
2023-05-14 15:04:23 +00:00
Kcross = ggml_scale_inplace ( ctx0 , Kcross , ggml_new_f32 ( ctx0 , pow ( float ( n_state ) / n_head , - 0.25 ) ) ) ;
2022-10-04 17:35:01 +00:00
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 1 ) ;
2023-02-04 07:45:52 +00:00
2023-03-05 19:42:19 +00:00
struct ggml_tensor * Vcross = ggml_mul_mat ( ctx0 ,
layer . cross_attn_v_w ,
cur ) ;
2022-10-04 17:35:01 +00:00
Vcross = ggml_add ( ctx0 ,
2023-03-05 19:42:19 +00:00
ggml_repeat ( ctx0 ,
layer . cross_attn_v_b ,
Vcross ) ,
Vcross ) ;
2022-10-04 17:35:01 +00:00
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , - 1 ) ;
2023-02-04 07:45:52 +00:00
2023-04-10 19:28:54 +00:00
Vcross = ggml_transpose ( ctx0 , ggml_reshape_2d ( ctx0 , Vcross , n_state , n_ctx ) ) ;
struct ggml_tensor * k = ggml_view_1d ( ctx0 , wstate . kv_cross . k , n_state * n_ctx , ( ggml_element_size ( wstate . kv_cross . k ) * n_state ) * ( il * n_ctx ) ) ;
struct ggml_tensor * v = ggml_view_2d ( ctx0 , wstate . kv_cross . v , n_ctx , n_state ,
( n_ctx ) * ggml_element_size ( wstate . kv_cross . v ) ,
( il * n_ctx ) * ggml_element_size ( wstate . kv_cross . v ) * n_state ) ;
2022-10-04 17:35:01 +00:00
ggml_build_forward_expand ( & gf , ggml_cpy ( ctx0 , Kcross , k ) ) ;
ggml_build_forward_expand ( & gf , ggml_cpy ( ctx0 , Vcross , v ) ) ;
}
ggml_graph_compute ( ctx0 , & gf ) ;
2023-01-08 10:35:56 +00:00
//ggml_graph_print(&gf);
2022-10-04 17:35:01 +00:00
}
////////////////////////////////////////////////////////////////////////////
2023-02-04 07:45:52 +00:00
//printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
// ggml_used_mem(ctx0)/1024.0/1024.0,
2023-03-28 07:36:16 +00:00
// wstate.get_buf_max_mem(0)/1024.0/1024.0,
// wstate.get_buf_max_mem(1)/1024.0/1024.0,
// wstate.get_buf_max_mem(2)/1024.0/1024.0,
// wstate.get_buf_max_mem(3)/1024.0/1024.0);
2022-10-04 17:35:01 +00:00
ggml_free ( ctx0 ) ;
2023-03-05 19:42:19 +00:00
wstate . t_encode_us + = ggml_time_us ( ) - t_start_us ;
wstate . n_encode + + ;
2023-01-15 09:29:57 +00:00
2022-10-04 17:35:01 +00:00
return true ;
}
// evaluate the decoder
//
2023-02-04 07:45:52 +00:00
// given text prompt + audio features -> computes the logits for the next token
2022-10-04 17:35:01 +00:00
//
// - wctx: context providing the model (wctx.model) and its hyper-parameters
// - wstate: state providing the compute buffer (buf_compute), the cross-attention
//   cache (kv_cross), and receiving the logits and decode timing counters
// - decoder: decoder whose self-attention cache (kv_self) is read and updated
// - n_threads: number of threads to use
2022-10-07 19:30:44 +00:00
// - tokens: text prompt
// - n_tokens: number of tokens in the prompt
// - n_past: number of past tokens to prefix the prompt with
2022-10-04 17:35:01 +00:00
//
// returns true (the only return statement in this function)
//
2023-03-05 19:42:19 +00:00
static bool whisper_decode_internal (
2023-01-15 09:29:57 +00:00
whisper_context & wctx ,
2023-03-05 19:42:19 +00:00
whisper_state & wstate ,
2023-01-15 09:29:57 +00:00
whisper_decoder & decoder ,
const whisper_token * tokens ,
const int n_tokens ,
const int n_past ,
const int n_threads ) {
const int64_t t_start_us = ggml_time_us ( ) ;
2022-10-04 17:35:01 +00:00
const auto & model = wctx . model ;
const auto & hparams = model . hparams ;
2023-01-15 09:29:57 +00:00
auto & kv_self = decoder . kv_self ;
// the self-attention cache must have been set up before decoding
WHISPER_ASSERT ( ! ! kv_self . ctx ) ;
2023-03-05 19:42:19 +00:00
auto & logits_out = wstate . logits ;
2022-10-04 17:35:01 +00:00
const int n_vocab = hparams . n_vocab ;
const int n_ctx = hparams . n_text_ctx ;
const int n_state = hparams . n_text_state ;
const int n_head = hparams . n_text_head ;
const int n_layer = hparams . n_text_layer ;
// N: number of tokens processed in this call
const int N = n_tokens ;
2023-03-05 19:42:19 +00:00
// M: audio context length; wstate.exp_n_audio_ctx overrides the model value when it is positive
const int M = wstate . exp_n_audio_ctx > 0 ? wstate . exp_n_audio_ctx : hparams . n_audio_ctx ;
2022-10-04 17:35:01 +00:00
2023-01-15 09:29:57 +00:00
//WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx);
2023-04-10 19:28:54 +00:00
// the ggml context is created on top of wstate.buf_compute
struct ggml_init_params params = {
/*.mem_size =*/ wstate . buf_compute . size ( ) ,
/*.mem_buffer =*/ wstate . buf_compute . data ( ) ,
/*.no_alloc =*/ false ,
} ;
2022-10-04 17:35:01 +00:00
struct ggml_context * ctx0 = ggml_init ( params ) ;
2023-02-04 07:45:52 +00:00
struct ggml_cgraph gf = { } ;
gf . n_threads = n_threads ;
2022-10-04 17:35:01 +00:00
// copy the N input tokens into an I32 tensor
struct ggml_tensor * embd = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_I32 , N ) ;
memcpy ( embd - > data , tokens , N * ggml_element_size ( embd ) ) ;
// positions of the new tokens: n_past, n_past + 1, ..., n_past + N - 1
struct ggml_tensor * position = ggml_new_tensor_1d ( ctx0 , GGML_TYPE_I32 , N ) ;
for ( int i = 0 ; i < N ; + + i ) {
( ( int32_t * ) position - > data ) [ i ] = n_past + i ;
}
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 3 ) ;
2023-02-04 07:45:52 +00:00
2022-10-04 17:35:01 +00:00
// token encoding + position encoding
struct ggml_tensor * cur =
ggml_add ( ctx0 ,
ggml_get_rows ( ctx0 , model . d_te , embd ) ,
ggml_get_rows ( ctx0 , model . d_pe , position ) ) ;
// inpL carries the input of the current layer (and the output of the previous one)
struct ggml_tensor * inpL = cur ;
for ( int il = 0 ; il < n_layer ; + + il ) {
const auto & layer = model . layers_decoder [ il ] ;
// norm
{
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
cur = ggml_norm ( ctx0 , inpL ) ;
2022-10-04 17:35:01 +00:00
// cur = ln_0_w*cur + ln_0_b
2023-02-04 07:45:52 +00:00
cur = ggml_add ( ctx0 ,
ggml_mul ( ctx0 ,
ggml_repeat ( ctx0 , layer . attn_ln_0_w , cur ) ,
2022-10-04 17:35:01 +00:00
cur ) ,
2023-02-04 07:45:52 +00:00
ggml_repeat ( ctx0 , layer . attn_ln_0_b , cur ) ) ;
2022-10-04 17:35:01 +00:00
}
// self-attention
{
2023-02-04 07:45:52 +00:00
struct ggml_tensor * Qcur = ggml_mul_mat ( ctx0 ,
2022-10-04 17:35:01 +00:00
layer . attn_q_w ,
cur ) ;
2023-02-04 07:45:52 +00:00
Qcur = ggml_add ( ctx0 ,
ggml_repeat ( ctx0 ,
2022-10-04 17:35:01 +00:00
layer . attn_q_b ,
Qcur ) ,
Qcur ) ;
2023-05-14 15:04:23 +00:00
// Q and K are each scaled by (n_state/n_head)^-0.25, so K*Q below is not scaled again
Qcur = ggml_scale_inplace ( ctx0 , Qcur , ggml_new_f32 ( ctx0 , pow ( float ( n_state ) / n_head , - 0.25 ) ) ) ;
2022-10-04 17:35:01 +00:00
// note: no bias for Key
2023-02-04 07:45:52 +00:00
struct ggml_tensor * Kcur = ggml_mul_mat ( ctx0 ,
2022-10-04 17:35:01 +00:00
layer . attn_k_w ,
cur ) ;
2023-05-14 15:04:23 +00:00
Kcur = ggml_scale_inplace ( ctx0 , Kcur , ggml_new_f32 ( ctx0 , pow ( float ( n_state ) / n_head , - 0.25 ) ) ) ;
2022-10-04 17:35:01 +00:00
// store key and value to memory
{
2023-04-10 19:28:54 +00:00
struct ggml_tensor * Vcur = ggml_mul_mat ( ctx0 ,
layer . attn_v_w ,
cur ) ;
Vcur = ggml_add ( ctx0 ,
ggml_repeat ( ctx0 ,
layer . attn_v_b ,
Vcur ) ,
Vcur ) ;
// V is transposed before it is copied into the cache view v below
Vcur = ggml_transpose ( ctx0 , ggml_reshape_2d ( ctx0 , Vcur , n_state , N ) ) ;
2023-02-04 07:45:52 +00:00
// k: N*n_state elements of kv_self.k starting at element offset n_state*(il*n_ctx + n_past)
struct ggml_tensor * k = ggml_view_1d ( ctx0 , kv_self . k , N * n_state , ( ggml_element_size ( kv_self . k ) * n_state ) * ( il * n_ctx + n_past ) ) ;
2023-04-10 19:28:54 +00:00
// v: N x n_state view of kv_self.v with a row stride of n_ctx elements,
// starting at element offset il*n_ctx*n_state + n_past
struct ggml_tensor * v = ggml_view_2d ( ctx0 , kv_self . v , N , n_state ,
( n_ctx ) * ggml_element_size ( kv_self . v ) ,
( il * n_ctx ) * ggml_element_size ( kv_self . v ) * n_state + n_past * ggml_element_size ( kv_self . v ) ) ;
2022-10-04 17:35:01 +00:00
2023-02-04 07:45:52 +00:00
// the cache copies are added to the graph explicitly
ggml_build_forward_expand ( & gf , ggml_cpy ( ctx0 , Kcur , k ) ) ;
ggml_build_forward_expand ( & gf , ggml_cpy ( ctx0 , Vcur , v ) ) ;
2022-10-04 17:35:01 +00:00
}
// ------
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
2022-10-04 17:35:01 +00:00
struct ggml_tensor * Q =
2023-02-04 07:45:52 +00:00
ggml_permute ( ctx0 ,
ggml_cpy ( ctx0 ,
2022-10-04 17:35:01 +00:00
Qcur ,
2023-02-04 07:45:52 +00:00
ggml_new_tensor_3d ( ctx0 , GGML_TYPE_F32 , n_state / n_head , n_head , N ) ) ,
2022-10-04 17:35:01 +00:00
0 , 2 , 1 , 3 ) ;
// K views the first (n_past + N)*n_state cached key elements of layer il
struct ggml_tensor * K =
2023-02-04 07:45:52 +00:00
ggml_permute ( ctx0 ,
ggml_reshape_3d ( ctx0 ,
ggml_view_1d ( ctx0 , kv_self . k , ( n_past + N ) * n_state , il * n_ctx * ggml_element_size ( kv_self . k ) * n_state ) ,
2022-10-04 17:35:01 +00:00
n_state / n_head , n_head , n_past + N ) ,
0 , 2 , 1 , 3 ) ;
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 1 ) ;
2023-02-04 07:45:52 +00:00
2022-10-04 17:35:01 +00:00
// K * Q
2023-02-04 07:45:52 +00:00
struct ggml_tensor * KQ = ggml_mul_mat ( ctx0 , K , Q ) ;
2022-10-04 17:35:01 +00:00
//struct ggml_tensor * KQ_scaled =
2023-05-14 15:04:23 +00:00
// ggml_scale_inplace(ctx0,
2022-10-04 17:35:01 +00:00
// KQ,
2023-02-04 07:45:52 +00:00
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
2022-10-04 17:35:01 +00:00
// );
2023-05-14 15:04:23 +00:00
// NOTE(review): presumably the causal mask for self-attention - confirm against ggml_diag_mask_inf_inplace
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace ( ctx0 , KQ , n_past ) ;
2023-02-04 07:45:52 +00:00
2023-05-14 15:04:23 +00:00
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace ( ctx0 , KQ_masked ) ;
2023-02-04 07:45:52 +00:00
2023-04-10 19:28:54 +00:00
// V views the cached values of layer il as (n_past + N) x (n_state/n_head) x n_head
struct ggml_tensor * V =
ggml_view_3d ( ctx0 , kv_self . v ,
n_past + N , n_state / n_head , n_head ,
n_ctx * ggml_element_size ( kv_self . v ) ,
n_ctx * ggml_element_size ( kv_self . v ) * n_state / n_head ,
il * n_ctx * ggml_element_size ( kv_self . v ) * n_state ) ;
2023-02-04 07:45:52 +00:00
2023-04-10 19:28:54 +00:00
struct ggml_tensor * KQV = ggml_mul_mat ( ctx0 , V , KQ_soft_max ) ;
2022-10-04 17:35:01 +00:00
2023-02-04 07:45:52 +00:00
struct ggml_tensor * KQV_merged = ggml_permute ( ctx0 , KQV , 0 , 2 , 1 , 3 ) ;
2022-10-04 17:35:01 +00:00
2023-02-04 07:45:52 +00:00
// copy the permuted result into a new n_state x N F32 tensor
cur = ggml_cpy ( ctx0 ,
2022-10-04 17:35:01 +00:00
KQV_merged ,
2023-02-04 07:45:52 +00:00
ggml_new_tensor_2d ( ctx0 , GGML_TYPE_F32 , n_state , N ) ) ;
2022-10-04 17:35:01 +00:00
}
2023-02-04 07:45:52 +00:00
// projection
2022-10-04 17:35:01 +00:00
{
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
cur = ggml_mul_mat ( ctx0 ,
2022-10-04 17:35:01 +00:00
layer . attn_ln_1_w ,
cur ) ;
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 1 ) ;
2023-02-04 07:45:52 +00:00
cur = ggml_add ( ctx0 ,
ggml_repeat ( ctx0 , layer . attn_ln_1_b , cur ) ,
2022-10-04 17:35:01 +00:00
cur ) ;
}
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 2 ) ;
2023-02-04 07:45:52 +00:00
2022-10-04 17:35:01 +00:00
// add the input
2023-02-04 07:45:52 +00:00
struct ggml_tensor * inpCA = ggml_add ( ctx0 , cur , inpL ) ;
2022-10-04 17:35:01 +00:00
// norm
{
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
cur = ggml_norm ( ctx0 , inpCA ) ; // note: we use inpCA here
2022-10-04 17:35:01 +00:00
// cur = cross_attn_ln_0_w*cur + cross_attn_ln_0_b
2023-02-04 07:45:52 +00:00
cur = ggml_add ( ctx0 ,
ggml_mul ( ctx0 ,
ggml_repeat ( ctx0 , layer . cross_attn_ln_0_w , cur ) ,
2022-10-04 17:35:01 +00:00
cur ) ,
2023-02-04 07:45:52 +00:00
ggml_repeat ( ctx0 , layer . cross_attn_ln_0_b , cur ) ) ;
2022-10-04 17:35:01 +00:00
}
// cross-attention
{
2023-02-04 07:45:52 +00:00
struct ggml_tensor * Qcur = ggml_mul_mat ( ctx0 ,
2022-10-04 17:35:01 +00:00
layer . cross_attn_q_w ,
cur ) ;
2023-02-04 07:45:52 +00:00
Qcur = ggml_add ( ctx0 ,
ggml_repeat ( ctx0 ,
2022-10-04 17:35:01 +00:00
layer . cross_attn_q_b ,
Qcur ) ,
Qcur ) ;
2023-05-14 15:04:23 +00:00
Qcur = ggml_scale_inplace ( ctx0 , Qcur , ggml_new_f32 ( ctx0 , pow ( float ( n_state ) / n_head , - 0.25 ) ) ) ;
2022-10-04 17:35:01 +00:00
// Kcross is already scaled
// (read from wstate.kv_cross.k: M*n_state elements for layer il)
struct ggml_tensor * Kcross =
2023-02-04 07:45:52 +00:00
ggml_reshape_3d ( ctx0 ,
2023-03-05 19:42:19 +00:00
ggml_view_1d ( ctx0 , wstate . kv_cross . k , M * n_state , il * M * ggml_element_size ( wstate . kv_cross . k ) * n_state ) ,
2022-10-04 17:35:01 +00:00
n_state / n_head , n_head , M ) ;
2023-04-10 19:28:54 +00:00
//struct ggml_tensor * Vcross =
// ggml_reshape_3d(ctx0,
// ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
// n_state/n_head, n_head, M);
2022-10-04 17:35:01 +00:00
2023-04-10 19:28:54 +00:00
//struct ggml_tensor * V_trans =
// ggml_cpy(ctx0,
// ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
// ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
// V views wstate.kv_cross.v for layer il as M x (n_state/n_head) x n_head
struct ggml_tensor * V =
ggml_view_3d ( ctx0 , wstate . kv_cross . v ,
M , n_state / n_head , n_head ,
M * ggml_element_size ( wstate . kv_cross . v ) ,
M * ggml_element_size ( wstate . kv_cross . v ) * n_state / n_head ,
il * M * ggml_element_size ( wstate . kv_cross . v ) * n_state ) ;
2023-02-04 07:45:52 +00:00
2022-10-04 17:35:01 +00:00
// ------
struct ggml_tensor * Q =
2023-02-04 07:45:52 +00:00
ggml_permute ( ctx0 ,
ggml_cpy ( ctx0 ,
2022-10-04 17:35:01 +00:00
Qcur ,
2023-02-04 07:45:52 +00:00
ggml_new_tensor_3d ( ctx0 , GGML_TYPE_F32 , n_state / n_head , n_head , N ) ) ,
2022-10-04 17:35:01 +00:00
0 , 2 , 1 , 3 ) ;
2023-02-04 07:45:52 +00:00
struct ggml_tensor * K = ggml_permute ( ctx0 , Kcross , 0 , 2 , 1 , 3 ) ;
2022-10-04 17:35:01 +00:00
// K * Q
2023-02-04 07:45:52 +00:00
struct ggml_tensor * KQ = ggml_mul_mat ( ctx0 , K , Q ) ;
2022-10-04 17:35:01 +00:00
//struct ggml_tensor * KQ_scaled =
2023-05-14 15:04:23 +00:00
// ggml_scale_inplace(ctx0,
2022-10-04 17:35:01 +00:00
// KQ,
2023-02-04 07:45:52 +00:00
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
2022-10-04 17:35:01 +00:00
// );
// no masking for cross-attention
2023-05-14 15:04:23 +00:00
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2023-02-04 07:45:52 +00:00
2023-05-14 15:04:23 +00:00
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace ( ctx0 , KQ ) ;
2022-10-04 17:35:01 +00:00
2023-04-10 19:28:54 +00:00
struct ggml_tensor * KQV = ggml_mul_mat ( ctx0 , V , KQ_soft_max ) ;
2022-10-04 17:35:01 +00:00
2023-02-04 07:45:52 +00:00
struct ggml_tensor * KQV_merged = ggml_permute ( ctx0 , KQV , 0 , 2 , 1 , 3 ) ;
2022-10-04 17:35:01 +00:00
// cur = KQV_merged.contiguous().view(n_state, N)
2023-02-04 07:45:52 +00:00
cur = ggml_cpy ( ctx0 ,
2022-10-04 17:35:01 +00:00
KQV_merged ,
2023-02-04 07:45:52 +00:00
ggml_new_tensor_2d ( ctx0 , GGML_TYPE_F32 , n_state , N ) ) ;
2022-10-04 17:35:01 +00:00
}
// projection
{
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
cur = ggml_mul_mat ( ctx0 ,
2022-10-04 17:35:01 +00:00
layer . cross_attn_ln_1_w ,
cur ) ;
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 1 ) ;
2023-02-04 07:45:52 +00:00
cur = ggml_add ( ctx0 ,
ggml_repeat ( ctx0 , layer . cross_attn_ln_1_b , cur ) ,
2022-10-04 17:35:01 +00:00
cur ) ;
}
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 2 ) ;
2023-02-04 07:45:52 +00:00
2022-10-04 17:35:01 +00:00
// add the input
2023-02-04 07:45:52 +00:00
cur = ggml_add ( ctx0 , cur , inpCA ) ;
2022-10-04 17:35:01 +00:00
struct ggml_tensor * inpFF = cur ;
// feed-forward network
{
// norm
{
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
cur = ggml_norm ( ctx0 , inpFF ) ;
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 1 ) ;
2022-10-04 17:35:01 +00:00
// cur = mlp_ln_w*cur + mlp_ln_b
2023-02-04 07:45:52 +00:00
cur = ggml_add ( ctx0 ,
ggml_mul ( ctx0 ,
ggml_repeat ( ctx0 , layer . mlp_ln_w , cur ) ,
2022-10-04 17:35:01 +00:00
cur ) ,
2023-02-04 07:45:52 +00:00
ggml_repeat ( ctx0 , layer . mlp_ln_b , cur ) ) ;
2022-10-04 17:35:01 +00:00
}
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
2022-10-04 17:35:01 +00:00
// fully connected
2023-02-04 07:45:52 +00:00
cur = ggml_mul_mat ( ctx0 ,
2022-10-04 17:35:01 +00:00
layer . mlp_0_w ,
cur ) ;
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 1 ) ;
2023-02-04 07:45:52 +00:00
cur = ggml_add ( ctx0 ,
ggml_repeat ( ctx0 , layer . mlp_0_b , cur ) ,
2022-10-04 17:35:01 +00:00
cur ) ;
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
2022-10-04 17:35:01 +00:00
// GELU activation
2023-02-04 07:45:52 +00:00
cur = ggml_gelu ( ctx0 , cur ) ;
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 1 ) ;
2022-10-04 17:35:01 +00:00
// projection
2023-02-04 07:45:52 +00:00
cur = ggml_mul_mat ( ctx0 ,
2022-10-04 17:35:01 +00:00
layer . mlp_1_w ,
cur ) ;
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2022-10-04 17:35:01 +00:00
2023-02-04 07:45:52 +00:00
cur = ggml_add ( ctx0 ,
ggml_repeat ( ctx0 , layer . mlp_1_b , cur ) ,
cur ) ;
2022-10-04 17:35:01 +00:00
}
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 3 ) ;
2022-10-04 17:35:01 +00:00
2023-02-04 07:45:52 +00:00
// residual connection; the sum becomes the input of the next layer
inpL = ggml_add ( ctx0 , cur , inpFF ) ;
2022-10-04 17:35:01 +00:00
}
cur = inpL ;
// norm
{
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
2022-10-04 17:35:01 +00:00
cur = ggml_norm ( ctx0 , cur ) ;
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 1 ) ;
2023-02-04 07:45:52 +00:00
2022-10-04 17:35:01 +00:00
// cur = d_ln_w*cur + d_ln_b
cur = ggml_add ( ctx0 ,
ggml_mul ( ctx0 ,
ggml_repeat ( ctx0 , model . d_ln_w , cur ) ,
cur ) ,
ggml_repeat ( ctx0 , model . d_ln_b , cur ) ) ;
}
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , 0 ) ;
2023-02-04 07:45:52 +00:00
// compute logits only for the last token
// comment this line to compute logits for all N tokens
// might be useful in the future
cur = ggml_view_2d ( ctx0 , cur , cur - > ne [ 0 ] , 1 , cur - > nb [ 1 ] , ( cur - > ne [ 1 ] - 1 ) * cur - > nb [ 1 ] ) ;
2022-10-04 17:35:01 +00:00
// model.d_te serves both as the token embedding table above and as the output projection here
struct ggml_tensor * logits = ggml_mul_mat ( ctx0 , model . d_te , cur ) ;
2023-03-05 19:42:19 +00:00
wstate . use_buf ( ctx0 , - 1 ) ;
2023-02-04 07:45:52 +00:00
2022-10-04 17:35:01 +00:00
// run the computation
{
2023-01-15 09:29:57 +00:00
ggml_build_forward_expand ( & gf , logits ) ;
2022-10-04 17:35:01 +00:00
ggml_graph_compute ( ctx0 , & gf ) ;
}
2023-02-04 07:45:52 +00:00
// extract logits for all N tokens
//logits_out.resize(N*n_vocab);
//memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*N*n_vocab);
// extract logits only for the last token
logits_out . resize ( n_vocab ) ;
memcpy ( logits_out . data ( ) , ggml_get_data ( logits ) , sizeof ( float ) * n_vocab ) ;
2022-10-04 17:35:01 +00:00
// this branch contains only commented-out debug output
if ( N > 1 ) {
2023-02-04 07:45:52 +00:00
//printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
// ggml_used_mem(ctx0)/1024.0/1024.0,
2023-03-28 07:36:16 +00:00
// wstate.get_buf_max_mem(0)/1024.0/1024.0,
// wstate.get_buf_max_mem(1)/1024.0/1024.0,
// wstate.get_buf_max_mem(2)/1024.0/1024.0,
// wstate.get_buf_max_mem(3)/1024.0/1024.0);
2022-10-04 17:35:01 +00:00
}
ggml_free ( ctx0 ) ;
2023-03-05 19:42:19 +00:00
// accumulate the decode time and the number of decode calls
wstate . t_decode_us + = ggml_time_us ( ) - t_start_us ;
wstate . n_decode + + ;
2022-10-21 14:33:59 +00:00
2023-01-15 09:29:57 +00:00
return true ;
2022-10-04 17:35:01 +00:00
}
2022-10-29 11:08:23 +00:00
// format a time given in units of 10 ms as "HH:MM:SS.mmm"
// (the separator before the milliseconds is ',' when comma is true)
//
//  500 -> 00:00:05.000
// 6000 -> 00:01:00.000
static std::string to_timestamp(int64_t t, bool comma = false) {
    int64_t msec = t * 10;

    const int64_t hr = msec / (1000 * 60 * 60);
    msec -= hr * (1000 * 60 * 60);

    const int64_t min = msec / (1000 * 60);
    msec -= min * (1000 * 60);

    const int64_t sec = msec / 1000;
    msec -= sec * 1000;

    // snprintf bounds the write to the buffer and always NUL-terminates it
    char buf[32];
    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);

    return std::string(buf);
}
2022-10-04 17:35:01 +00:00
// naive Discrete Fourier Transform (quadratic in the input length)
// in  : real-valued samples
// out : resized to 2*in.size() floats; bin k is stored as out[2*k] (real) and out[2*k + 1] (imaginary)
static void dft(const std::vector<float> & in, std::vector<float> & out) {
    const int n_in = in.size();

    out.resize(n_in*2);

    for (int bin = 0; bin < n_in; ++bin) {
        float acc_re = 0;
        float acc_im = 0;

        for (int idx = 0; idx < n_in; ++idx) {
            const float phase = 2*M_PI*bin*idx/n_in;

            acc_re += in[idx]*cos(phase);
            acc_im -= in[idx]*sin(phase);
        }

        float * dst = &out[bin*2];
        dst[0] = acc_re;
        dst[1] = acc_im;
    }
}
// Cooley-Tukey FFT
// poor man's implementation - use something better
// input is real-valued
// output is complex-valued
2022-11-02 19:18:20 +00:00
static void fft ( const std : : vector < float > & in , std : : vector < float > & out ) {
2022-10-04 17:35:01 +00:00
out . resize ( in . size ( ) * 2 ) ;
int N = in . size ( ) ;
if ( N = = 1 ) {
out [ 0 ] = in [ 0 ] ;
out [ 1 ] = 0 ;
return ;
}
if ( N % 2 = = 1 ) {
dft ( in , out ) ;
return ;
}
std : : vector < float > even ;
std : : vector < float > odd ;
2022-12-30 11:42:35 +00:00
even . reserve ( N / 2 ) ;
odd . reserve ( N / 2 ) ;
2022-10-04 17:35:01 +00:00
for ( int i = 0 ; i < N ; i + + ) {
if ( i % 2 = = 0 ) {
even . push_back ( in [ i ] ) ;
} else {
odd . push_back ( in [ i ] ) ;
}
}
std : : vector < float > even_fft ;
std : : vector < float > odd_fft ;
fft ( even , even_fft ) ;
fft ( odd , odd_fft ) ;
for ( int k = 0 ; k < N / 2 ; k + + ) {
float theta = 2 * M_PI * k / N ;
float re = cos ( theta ) ;
float im = - sin ( theta ) ;
float re_odd = odd_fft [ 2 * k + 0 ] ;
float im_odd = odd_fft [ 2 * k + 1 ] ;
out [ 2 * k + 0 ] = even_fft [ 2 * k + 0 ] + re * re_odd - im * im_odd ;
out [ 2 * k + 1 ] = even_fft [ 2 * k + 1 ] + re * im_odd + im * re_odd ;
out [ 2 * ( k + N / 2 ) + 0 ] = even_fft [ 2 * k + 0 ] - re * re_odd + im * im_odd ;
out [ 2 * ( k + N / 2 ) + 1 ] = even_fft [ 2 * k + 1 ] - re * im_odd - im * re_odd ;
}
}
2023-04-14 19:35:34 +00:00
static void log_mel_spectrogram_worker_thread ( int ith , const std : : vector < float > & hann , const float * samples ,
int n_samples , int fft_size , int fft_step , int n_threads ,
const whisper_filters & filters , bool speed_up , whisper_mel & mel ) {
std : : vector < float > fft_in ( fft_size , 0.0 ) ;
std : : vector < float > fft_out ( 2 * fft_size ) ;
int n_fft = 1 + ( speed_up ? fft_size / 4 : fft_size / 2 ) ;
2023-04-15 11:18:46 +00:00
2023-04-14 19:35:34 +00:00
for ( int i = ith ; i < mel . n_len ; i + = n_threads ) {
const int offset = i * fft_step ;
2023-04-15 11:18:46 +00:00
2023-04-14 19:35:34 +00:00
// apply Hanning window
for ( int j = 0 ; j < fft_size ; j + + ) {
if ( offset + j < n_samples ) {
fft_in [ j ] = hann [ j ] * samples [ offset + j ] ;
} else {
fft_in [ j ] = 0.0 ;
}
}
2023-04-15 11:18:46 +00:00
2023-04-14 19:35:34 +00:00
// FFT -> mag^2
fft ( fft_in , fft_out ) ;
2023-04-15 11:18:46 +00:00
2023-04-14 19:35:34 +00:00
for ( int j = 0 ; j < fft_size ; j + + ) {
fft_out [ j ] = ( fft_out [ 2 * j + 0 ] * fft_out [ 2 * j + 0 ] + fft_out [ 2 * j + 1 ] * fft_out [ 2 * j + 1 ] ) ;
}
for ( int j = 1 ; j < fft_size / 2 ; j + + ) {
fft_out [ j ] + = fft_out [ fft_size - j ] ;
}
2023-04-15 11:18:46 +00:00
2023-04-14 19:35:34 +00:00
if ( speed_up ) {
// scale down in the frequency domain results in a speed up in the time domain
for ( int j = 0 ; j < n_fft ; j + + ) {
fft_out [ j ] = 0.5 * ( fft_out [ 2 * j ] + fft_out [ 2 * j + 1 ] ) ;
}
}
2023-04-15 11:18:46 +00:00
2023-04-14 19:35:34 +00:00
// mel spectrogram
for ( int j = 0 ; j < mel . n_mel ; j + + ) {
double sum = 0.0 ;
2023-04-15 11:18:46 +00:00
// unroll loop (suggested by GH user @lunixbochs)
int k = 0 ;
for ( k = 0 ; k < n_fft - 3 ; k + = 4 ) {
sum + =
fft_out [ k + 0 ] * filters . data [ j * n_fft + k + 0 ] +
fft_out [ k + 1 ] * filters . data [ j * n_fft + k + 1 ] +
fft_out [ k + 2 ] * filters . data [ j * n_fft + k + 2 ] +
fft_out [ k + 3 ] * filters . data [ j * n_fft + k + 3 ] ;
}
// handle n_fft remainder
for ( ; k < n_fft ; k + + ) {
2023-04-14 19:35:34 +00:00
sum + = fft_out [ k ] * filters . data [ j * n_fft + k ] ;
}
2023-04-15 11:18:46 +00:00
2023-04-29 07:06:25 +00:00
sum = log10 ( std : : max ( sum , 1e-10 ) ) ;
2023-04-15 11:18:46 +00:00
2023-04-14 19:35:34 +00:00
mel . data [ j * mel . n_len + i ] = sum ;
}
}
}
2022-10-04 17:35:01 +00:00
// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L92-L124
2022-11-02 19:18:20 +00:00
static bool log_mel_spectrogram (
2023-03-05 19:42:19 +00:00
whisper_state & wstate ,
2023-01-15 09:29:57 +00:00
const float * samples ,
const int n_samples ,
const int /*sample_rate*/ ,
const int fft_size ,
const int fft_step ,
const int n_mel ,
const int n_threads ,
const whisper_filters & filters ,
const bool speed_up ,
whisper_mel & mel ) {
const int64_t t_start_us = ggml_time_us ( ) ;
2022-10-04 17:35:01 +00:00
// Hanning window
std : : vector < float > hann ;
hann . resize ( fft_size ) ;
for ( int i = 0 ; i < fft_size ; i + + ) {
hann [ i ] = 0.5 * ( 1.0 - cos ( ( 2.0 * M_PI * i ) / ( fft_size ) ) ) ;
}
2023-04-15 14:18:43 +00:00
mel . n_mel = n_mel ;
mel . n_len = n_samples / fft_step ;
mel . n_len_org = mel . n_len ;
std : : vector < float > samples_padded ;
// pad audio with at least one extra chunk of zeros
{
const int pad = ( 100 * WHISPER_CHUNK_SIZE ) / 2 ;
if ( mel . n_len % pad ! = 0 ) {
mel . n_len = ( mel . n_len / pad + 1 ) * pad ;
}
mel . n_len + = pad ;
samples_padded . resize ( mel . n_len * fft_step ) ;
memcpy ( samples_padded . data ( ) , samples , n_samples * sizeof ( float ) ) ;
memset ( samples_padded . data ( ) + n_samples , 0 , ( mel . n_len * fft_step - n_samples ) * sizeof ( float ) ) ;
samples = samples_padded . data ( ) ;
}
2022-10-04 17:35:01 +00:00
mel . data . resize ( mel . n_mel * mel . n_len ) ;
2022-10-04 19:43:37 +00:00
//printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len);
//printf("%s: recording length: %f s\n", __func__, (float) n_samples/sample_rate);
2022-10-04 17:35:01 +00:00
2023-04-15 11:18:46 +00:00
{
std : : vector < std : : thread > workers ( n_threads - 1 ) ;
for ( int iw = 0 ; iw < n_threads - 1 ; + + iw ) {
workers [ iw ] = std : : thread (
log_mel_spectrogram_worker_thread , iw + 1 , std : : cref ( hann ) , samples ,
n_samples , fft_size , fft_step , n_threads ,
std : : cref ( filters ) , speed_up , std : : ref ( mel ) ) ;
2023-04-14 19:35:34 +00:00
}
2022-10-04 17:35:01 +00:00
2023-04-15 11:18:46 +00:00
// main thread
log_mel_spectrogram_worker_thread ( 0 , hann , samples , n_samples , fft_size , fft_step , n_threads , filters , speed_up , mel ) ;
for ( int iw = 0 ; iw < n_threads - 1 ; + + iw ) {
2023-04-14 19:35:34 +00:00
workers [ iw ] . join ( ) ;
}
2022-10-04 17:35:01 +00:00
}
// clamping and normalization
double mmax = - 1e20 ;
for ( int i = 0 ; i < mel . n_mel * mel . n_len ; i + + ) {
if ( mel . data [ i ] > mmax ) {
mmax = mel . data [ i ] ;
}
}
//printf("%s: max = %f\n", __func__, mmax);
mmax - = 8.0 ;
for ( int i = 0 ; i < mel . n_mel * mel . n_len ; i + + ) {
if ( mel . data [ i ] < mmax ) {
mel . data [ i ] = mmax ;
}
mel . data [ i ] = ( mel . data [ i ] + 4.0 ) / 4.0 ;
}
2023-03-05 19:42:19 +00:00
wstate . t_mel_us + = ggml_time_us ( ) - t_start_us ;
2023-01-15 09:29:57 +00:00
2023-04-15 14:18:43 +00:00
//printf("mel.n_len() = %d, divided by 1500: %f, n_samples / fft_step: %d\n", mel.n_len, mel.n_len / 1500.0, n_samples / fft_step);
2022-10-04 17:35:01 +00:00
return true ;
}
2022-12-13 17:21:07 +00:00
// split text into tokens
//
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
//
// Regex (Python):
// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
//
// Regex (C++):
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
//
static std : : vector < whisper_vocab : : id > tokenize ( const whisper_vocab & vocab , const std : : string & text ) {
std : : vector < std : : string > words ;
// first split the text into words
{
std : : string str = text ;
std : : string pat = R " ('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^ \ s[:alpha:][:digit:]]+| \ s+(?! \ S)| \ s+) " ;
std : : regex re ( pat ) ;
std : : smatch m ;
while ( std : : regex_search ( str , m , re ) ) {
for ( auto x : m ) {
words . push_back ( x ) ;
}
str = m . suffix ( ) ;
}
}
// find the longest tokens that form the words:
std : : vector < whisper_vocab : : id > tokens ;
for ( const auto & word : words ) {
2022-12-22 15:06:19 +00:00
if ( word . empty ( ) ) continue ;
2022-12-13 17:21:07 +00:00
int i = 0 ;
int n = word . size ( ) ;
while ( i < n ) {
int j = n ;
2023-04-14 17:35:03 +00:00
bool found = false ;
2022-12-13 17:21:07 +00:00
while ( j > i ) {
2023-04-14 17:35:03 +00:00
auto sub = word . substr ( i , j - i ) ;
auto it = vocab . token_to_id . find ( sub ) ;
2022-12-13 17:21:07 +00:00
if ( it ! = vocab . token_to_id . end ( ) ) {
tokens . push_back ( it - > second ) ;
i = j ;
2023-04-14 17:35:03 +00:00
found = true ;
2022-12-13 17:21:07 +00:00
break ;
}
- - j ;
}
2023-04-14 17:35:03 +00:00
if ( ! found ) {
fprintf ( stderr , " unknown token \n " ) ;
2022-12-13 17:21:07 +00:00
+ + i ;
}
}
}
return tokens ;
}
2022-10-04 17:35:01 +00:00
//
// interface implementation
//
2023-04-15 10:21:27 +00:00
# ifdef WHISPER_USE_COREML
// replace .bin with -encoder.mlmodelc
// Derive the Core ML encoder path from a ggml model path:
//   1. drop the file extension (e.g. ".bin")
//   2. drop a trailing quantization suffix of the form "-qx_x"
//   3. append "-encoder.mlmodelc"
static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
    // only the file name is inspected, so a '.' or '-' inside a directory name
    // can never cause the directory part of the path to be truncated
    const auto sep = path_bin.find_last_of("/\\");
    const std::string::size_type name_beg = sep == std::string::npos ? 0 : sep + 1;

    auto pos = path_bin.rfind('.');
    if (pos != std::string::npos && pos >= name_beg) {
        path_bin = path_bin.substr(0, pos);
    }

    // match "-qx_x"
    pos = path_bin.rfind('-');
    if (pos != std::string::npos && pos >= name_beg) {
        auto sub = path_bin.substr(pos);
        if (sub.size() == 5 && sub[1] == 'q' && sub[3] == '_') {
            path_bin = path_bin.substr(0, pos);
        }
    }

    path_bin += "-encoder.mlmodelc";

    return path_bin;
}
# endif
2023-03-05 19:42:19 +00:00
// Allocate and initialise a new whisper_state for `ctx`.
//
// Initialises the self-attention KV cache of decoder 0 and the cross-attention KV cache, loads the
// Core ML encoder when built with WHISPER_USE_COREML, reserves the logits/probability vectors and
// sizes the compute and scratch buffers from the MEM_REQ_* tables for ctx->model.type.
//
// Returns nullptr on failure. The state, and any KV cache that was already initialised, is
// released before returning.
struct whisper_state * whisper_init_state(whisper_context * ctx) {
    whisper_state * state = new whisper_state;

    // KV cache and compute buffer requirements are doubled when hparams.ftype is 0
    const size_t scale = ctx->model.hparams.ftype ? 1 : 2;

    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) {
        fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
        delete state;
        return nullptr;
    }

    {
        const size_t memory_size = ggml_nbytes(state->decoders[0].kv_self.k) + ggml_nbytes(state->decoders[0].kv_self.v);
        fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
    }

    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
        fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
        // the self-attention cache was set up successfully above - do not leak it
        kv_cache_free(state->decoders[0].kv_self);
        delete state;
        return nullptr;
    }

    {
        const size_t memory_size = ggml_nbytes(state->kv_cross.k) + ggml_nbytes(state->kv_cross.v);
        fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
    }

#ifdef WHISPER_USE_COREML
    const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model);

    fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
    fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);

    state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
    if (!state->ctx_coreml) {
        fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
#ifndef WHISPER_COREML_ALLOW_FALLBACK
        // both KV caches are live at this point - release them together with the state
        whisper_free_state(state);
        return nullptr;
#endif
    } else {
        fprintf(stderr, "%s: Core ML model loaded\n", __func__);
    }
#endif

    state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);

    state->logits_id.reserve(ctx->model.hparams.n_vocab);

    // TAGS: WHISPER_DECODER_INIT
    state->decoders[0].sequence.tokens.reserve(ctx->model.hparams.n_text_ctx);

    state->decoders[0].probs.reserve(ctx->vocab.n_vocab);
    state->decoders[0].logits.reserve(ctx->vocab.n_vocab);
    state->decoders[0].logprobs.reserve(ctx->vocab.n_vocab);

    state->buf_compute.resize(scale * std::max(MEM_REQ_ENCODE.at(ctx->model.type), MEM_REQ_DECODE.at(ctx->model.type)));

    state->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
    state->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
    state->buf_scratch[2].resize(MEM_REQ_SCRATCH2.at(ctx->model.type));
    state->buf_scratch[3].resize(MEM_REQ_SCRATCH3.at(ctx->model.type));

    // fixed seed
    state->rng = std::mt19937(0);

    return state;
}
struct whisper_context * whisper_init_from_file_no_state ( const char * path_model ) {
2023-01-08 11:03:33 +00:00
fprintf ( stderr , " %s: loading model from '%s' \n " , __func__ , path_model ) ;
auto fin = std : : ifstream ( path_model , std : : ios : : binary ) ;
if ( ! fin ) {
fprintf ( stderr , " %s: failed to open '%s' \n " , __func__ , path_model ) ;
return nullptr ;
}
2023-04-29 07:36:50 +00:00
whisper_model_loader loader = { } ;
2023-04-29 07:42:14 +00:00
loader . context = & fin ;
2023-04-29 07:36:50 +00:00
loader . read = [ ] ( void * ctx , void * output , size_t read_size ) {
std : : ifstream * fin = ( std : : ifstream * ) ctx ;
fin - > read ( ( char * ) output , read_size ) ;
return read_size ;
} ;
loader . eof = [ ] ( void * ctx ) {
std : : ifstream * fin = ( std : : ifstream * ) ctx ;
return fin - > eof ( ) ;
} ;
loader . close = [ ] ( void * ctx ) {
std : : ifstream * fin = ( std : : ifstream * ) ctx ;
fin - > close ( ) ;
2023-01-08 11:03:33 +00:00
} ;
2023-03-27 18:00:32 +00:00
auto ctx = whisper_init_no_state ( & loader ) ;
if ( ctx ) {
ctx - > path_model = path_model ;
}
return ctx ;
2023-01-08 11:03:33 +00:00
}
2023-03-05 19:42:19 +00:00
struct whisper_context * whisper_init_from_buffer_no_state ( void * buffer , size_t buffer_size ) {
2023-01-08 11:03:33 +00:00
struct buf_context {
uint8_t * buffer ;
size_t size ;
size_t current_offset ;
} ;
buf_context ctx = { reinterpret_cast < uint8_t * > ( buffer ) , buffer_size , 0 } ;
fprintf ( stderr , " %s: loading model from buffer \n " , __func__ ) ;
2023-04-29 07:36:50 +00:00
whisper_model_loader loader = { } ;
2023-01-08 11:03:33 +00:00
2023-04-29 07:36:50 +00:00
loader . context = & ctx ;
2023-01-08 11:03:33 +00:00
2023-04-29 07:36:50 +00:00
loader . read = [ ] ( void * ctx , void * output , size_t read_size ) {
buf_context * buf = reinterpret_cast < buf_context * > ( ctx ) ;
2023-01-08 11:03:33 +00:00
2023-04-29 07:36:50 +00:00
size_t size_to_copy = buf - > current_offset + read_size < buf - > size ? read_size : buf - > size - buf - > current_offset ;
2023-01-08 11:03:33 +00:00
2023-04-29 07:36:50 +00:00
memcpy ( output , buf - > buffer + buf - > current_offset , size_to_copy ) ;
buf - > current_offset + = size_to_copy ;
2023-01-08 11:03:33 +00:00
2023-04-29 07:36:50 +00:00
return size_to_copy ;
} ;
2023-01-08 11:03:33 +00:00
2023-04-29 07:36:50 +00:00
loader . eof = [ ] ( void * ctx ) {
buf_context * buf = reinterpret_cast < buf_context * > ( ctx ) ;
return buf - > current_offset > = buf - > size ;
} ;
2023-01-08 11:03:33 +00:00
2023-04-29 07:36:50 +00:00
loader . close = [ ] ( void * /*ctx*/ ) { } ;
2023-01-08 11:03:33 +00:00
2023-03-05 19:42:19 +00:00
return whisper_init_no_state ( & loader ) ;
2023-01-08 11:03:33 +00:00
}
2023-03-05 19:42:19 +00:00
struct whisper_context * whisper_init_no_state ( struct whisper_model_loader * loader ) {
2022-10-11 17:57:52 +00:00
ggml_time_init ( ) ;
2022-10-04 17:35:01 +00:00
whisper_context * ctx = new whisper_context ;
2023-01-08 11:03:33 +00:00
if ( ! whisper_model_load ( loader , * ctx ) ) {
loader - > close ( loader - > context ) ;
fprintf ( stderr , " %s: failed to load model \n " , __func__ ) ;
2022-12-23 09:06:17 +00:00
delete ctx ;
2022-12-22 14:35:18 +00:00
return nullptr ;
2022-10-04 17:35:01 +00:00
}
2023-01-08 11:03:33 +00:00
loader - > close ( loader - > context ) ;
2022-10-04 17:35:01 +00:00
return ctx ;
}
2023-03-05 19:42:19 +00:00
struct whisper_context * whisper_init_from_file ( const char * path_model ) {
whisper_context * ctx = whisper_init_from_file_no_state ( path_model ) ;
if ( ! ctx ) {
return nullptr ;
}
ctx - > state = whisper_init_state ( ctx ) ;
if ( ! ctx - > state ) {
whisper_free ( ctx ) ;
return nullptr ;
}
return ctx ;
}
struct whisper_context * whisper_init_from_buffer ( void * buffer , size_t buffer_size ) {
whisper_context * ctx = whisper_init_from_buffer_no_state ( buffer , buffer_size ) ;
if ( ! ctx ) {
return nullptr ;
}
ctx - > state = whisper_init_state ( ctx ) ;
if ( ! ctx - > state ) {
whisper_free ( ctx ) ;
return nullptr ;
}
return ctx ;
}
struct whisper_context * whisper_init ( struct whisper_model_loader * loader ) {
whisper_context * ctx = whisper_init_no_state ( loader ) ;
if ( ! ctx ) {
return nullptr ;
}
ctx - > state = whisper_init_state ( ctx ) ;
if ( ! ctx - > state ) {
whisper_free ( ctx ) ;
return nullptr ;
}
return ctx ;
}
void whisper_free_state ( struct whisper_state * state )
{
if ( state ) {
kv_cache_free ( state - > kv_cross ) ;
for ( int i = 0 ; i < WHISPER_MAX_DECODERS ; + + i ) {
kv_cache_free ( state - > decoders [ i ] . kv_self ) ;
}
2023-04-15 10:21:27 +00:00
# ifdef WHISPER_USE_COREML
2023-04-29 07:49:02 +00:00
if ( state - > ctx_coreml ! = nullptr ) {
whisper_coreml_free ( state - > ctx_coreml ) ;
state - > ctx_coreml = nullptr ;
}
2023-04-15 10:21:27 +00:00
# endif
2023-03-05 19:42:19 +00:00
delete state ;
}
}
2022-10-04 17:35:01 +00:00
void whisper_free ( struct whisper_context * ctx ) {
if ( ctx ) {
2022-11-17 20:12:51 +00:00
if ( ctx - > model . ctx ) {
ggml_free ( ctx - > model . ctx ) ;
}
2023-01-15 09:29:57 +00:00
if ( ctx - > model . buf ) {
delete ctx - > model . buf ;
}
2023-03-05 19:42:19 +00:00
whisper_free_state ( ctx - > state ) ;
2022-10-04 17:35:01 +00:00
delete ctx ;
}
}
2023-05-28 23:38:58 +00:00
// Release a parameter block allocated with `new`; a null pointer is accepted.
void whisper_free_params(struct whisper_full_params * params) {
    // deleting a null pointer is a no-op
    delete params;
}
2023-03-05 19:42:19 +00:00
int whisper_pcm_to_mel_with_state ( struct whisper_context * ctx , struct whisper_state * state , const float * samples , int n_samples , int n_threads ) {
if ( ! log_mel_spectrogram ( * state , samples , n_samples , WHISPER_SAMPLE_RATE , WHISPER_N_FFT , WHISPER_HOP_LENGTH , WHISPER_N_MEL , n_threads , ctx - > model . filters , false , state - > mel ) ) {
fprintf ( stderr , " %s: failed to compute mel spectrogram \n " , __func__ ) ;
return - 1 ;
}
return 0 ;
}
2022-10-04 17:35:01 +00:00
int whisper_pcm_to_mel ( struct whisper_context * ctx , const float * samples , int n_samples , int n_threads ) {
2023-03-05 19:42:19 +00:00
return whisper_pcm_to_mel_with_state ( ctx , ctx - > state , samples , n_samples , n_threads ) ;
}
// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2
int whisper_pcm_to_mel_phase_vocoder_with_state ( struct whisper_context * ctx , struct whisper_state * state , const float * samples , int n_samples , int n_threads ) {
if ( ! log_mel_spectrogram ( * state , samples , n_samples , WHISPER_SAMPLE_RATE , 2 * WHISPER_N_FFT , 2 * WHISPER_HOP_LENGTH , WHISPER_N_MEL , n_threads , ctx - > model . filters , true , state - > mel ) ) {
2022-11-12 16:03:49 +00:00
fprintf ( stderr , " %s: failed to compute mel spectrogram \n " , __func__ ) ;
return - 1 ;
}
return 0 ;
}
// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2
int whisper_pcm_to_mel_phase_vocoder ( struct whisper_context * ctx , const float * samples , int n_samples , int n_threads ) {
2023-03-05 19:42:19 +00:00
return whisper_pcm_to_mel_phase_vocoder_with_state ( ctx , ctx - > state , samples , n_samples , n_threads ) ;
}
int whisper_set_mel_with_state (
struct whisper_context * /*ctx*/ ,
struct whisper_state * state ,
const float * data ,
int n_len ,
int n_mel ) {
if ( n_mel ! = WHISPER_N_MEL ) {
fprintf ( stderr , " %s: invalid number of mel bands: %d (expected %d) \n " , __func__ , n_mel , WHISPER_N_MEL ) ;
2022-10-04 17:35:01 +00:00
return - 1 ;
}
2023-04-15 14:18:43 +00:00
state - > mel . n_len = n_len ;
state - > mel . n_len_org = n_len ;
state - > mel . n_mel = n_mel ;
2023-03-05 19:42:19 +00:00
state - > mel . data . resize ( n_len * n_mel ) ;
memcpy ( state - > mel . data . data ( ) , data , n_len * n_mel * sizeof ( float ) ) ;
2022-10-04 17:35:01 +00:00
return 0 ;
}
int whisper_set_mel (
struct whisper_context * ctx ,
const float * data ,
int n_len ,
int n_mel ) {
2023-03-05 19:42:19 +00:00
return whisper_set_mel_with_state ( ctx , ctx - > state , data , n_len , n_mel ) ;
}
// Run whisper_encode_internal() on `state` starting at `offset`.
// Returns 0 on success and -1 on failure.
int whisper_encode_with_state(struct whisper_context * ctx, struct whisper_state * state, int offset, int n_threads) {
    const bool ok = whisper_encode_internal(*ctx, *state, offset, n_threads);

    if (ok) {
        return 0;
    }

    fprintf(stderr, "%s: failed to eval\n", __func__);
    return -1;
}
int whisper_encode ( struct whisper_context * ctx , int offset , int n_threads ) {
2023-03-05 19:42:19 +00:00
if ( ! whisper_encode_internal ( * ctx , * ctx - > state , offset , n_threads ) ) {
2022-10-04 17:35:01 +00:00
fprintf ( stderr , " %s: failed to eval \n " , __func__ ) ;
return - 1 ;
}
return 0 ;
}
2023-03-05 19:42:19 +00:00
int whisper_decode_with_state ( struct whisper_context * ctx , struct whisper_state * state , const whisper_token * tokens , int n_tokens , int n_past , int n_threads ) {
const int selected_decoder_id = 0 ;
if ( ! whisper_decode_internal ( * ctx , * state , state - > decoders [ selected_decoder_id ] , tokens , n_tokens , n_past , n_threads ) ) {
fprintf ( stderr , " %s: failed to eval \n " , __func__ ) ;
return 1 ;
}
return 0 ;
}
2022-10-04 17:35:01 +00:00
int whisper_decode ( struct whisper_context * ctx , const whisper_token * tokens , int n_tokens , int n_past , int n_threads ) {
2023-03-05 19:42:19 +00:00
// TODO: add selected_decoder_id to state
2023-01-15 09:29:57 +00:00
const int selected_decoder_id = 0 ;
2022-10-04 17:35:01 +00:00
2023-03-05 19:42:19 +00:00
if ( ctx - > state = = nullptr ) {
fprintf ( stderr , " %s: ERROR state was not loaded. \n " , __func__ ) ;
return false ;
}
if ( ! whisper_decode_internal ( * ctx , * ctx - > state , ctx - > state - > decoders [ selected_decoder_id ] , tokens , n_tokens , n_past , n_threads ) ) {
2022-10-04 17:35:01 +00:00
fprintf ( stderr , " %s: failed to eval \n " , __func__ ) ;
return 1 ;
}
return 0 ;
}
2022-12-13 17:21:07 +00:00
int whisper_tokenize ( struct whisper_context * ctx , const char * text , whisper_token * tokens , int n_max_tokens ) {
const auto res = tokenize ( ctx - > vocab , text ) ;
2022-12-19 18:19:01 +00:00
if ( n_max_tokens < ( int ) res . size ( ) ) {
2022-12-13 17:21:07 +00:00
fprintf ( stderr , " %s: too many resulting tokens: %d (max %d) \n " , __func__ , ( int ) res . size ( ) , n_max_tokens ) ;
return - 1 ;
}
2022-12-19 18:19:01 +00:00
for ( int i = 0 ; i < ( int ) res . size ( ) ; i + + ) {
2022-12-13 17:21:07 +00:00
tokens [ i ] = res [ i ] ;
}
return res . size ( ) ;
}
2022-12-17 15:58:08 +00:00
// Largest language id stored in g_lang (0 when no entry has a larger id).
int whisper_lang_max_id() {
    int largest = 0;

    for (const auto & entry : g_lang) {
        const int id = entry.second.first;
        if (id > largest) {
            largest = id;
        }
    }

    return largest;
}
2022-10-04 17:35:01 +00:00
// Look up the id of a language. `lang` is first tried as a key of g_lang and then compared
// against the second string stored with each entry. Returns -1 when nothing matches.
int whisper_lang_id(const char * lang) {
    const auto it = g_lang.find(lang);
    if (it != g_lang.end()) {
        return it->second.first;
    }

    for (const auto & entry : g_lang) {
        if (entry.second.second == lang) {
            return entry.second.first;
        }
    }

    fprintf(stderr, "%s: unknown language '%s'\n", __func__, lang);
    return -1;
}
2022-12-17 15:58:08 +00:00
// Reverse lookup: the g_lang key whose id equals `id`, or nullptr when there is none.
const char * whisper_lang_str(int id) {
    const char * found = nullptr;

    for (const auto & entry : g_lang) {
        if (entry.second.first == id) {
            found = entry.first.c_str();
            break;
        }
    }

    if (found == nullptr) {
        fprintf(stderr, "%s: unknown language id %d\n", __func__, id);
    }

    return found;
}
2023-03-05 19:42:19 +00:00
int whisper_lang_auto_detect_with_state (
2022-12-17 15:58:08 +00:00
struct whisper_context * ctx ,
2023-03-05 19:42:19 +00:00
struct whisper_state * state ,
int offset_ms ,
int n_threads ,
float * lang_probs ) {
2022-12-17 15:58:08 +00:00
const int seek = offset_ms / 10 ;
if ( seek < 0 ) {
fprintf ( stderr , " %s: offset %dms is before the start of the audio \n " , __func__ , offset_ms ) ;
return - 1 ;
}
2023-04-15 14:18:43 +00:00
if ( seek > = state - > mel . n_len_org ) {
fprintf ( stderr , " %s: offset %dms is past the end of the audio (%dms) \n " , __func__ , offset_ms , state - > mel . n_len_org * 10 ) ;
2022-12-17 15:58:08 +00:00
return - 2 ;
}
// run the encoder
2023-03-22 19:47:09 +00:00
if ( whisper_encode_with_state ( ctx , state , seek , n_threads ) ! = 0 ) {
2022-12-17 15:58:08 +00:00
fprintf ( stderr , " %s: failed to encode \n " , __func__ ) ;
return - 6 ;
}
const std : : vector < whisper_token > prompt = { whisper_token_sot ( ctx ) } ;
2023-03-05 19:42:19 +00:00
if ( whisper_decode_with_state ( ctx , state , prompt . data ( ) , prompt . size ( ) , 0 , n_threads ) ! = 0 ) {
2022-12-17 15:58:08 +00:00
fprintf ( stderr , " %s: failed to decode \n " , __func__ ) ;
return - 7 ;
}
2023-03-05 19:42:19 +00:00
auto & logits_id = state - > logits_id ;
2023-01-15 09:29:57 +00:00
logits_id . clear ( ) ;
2022-12-19 18:19:01 +00:00
for ( const auto & kv : g_lang ) {
2022-12-17 15:58:08 +00:00
const auto token_lang = whisper_token_lang ( ctx , kv . second . first ) ;
2023-03-05 19:42:19 +00:00
logits_id . emplace_back ( state - > logits [ token_lang ] , kv . second . first ) ;
2022-12-17 15:58:08 +00:00
}
// sort descending
{
2023-01-15 09:29:57 +00:00
using pair_type = std : : remove_reference < decltype ( logits_id ) > : : type : : value_type ;
std : : sort ( logits_id . begin ( ) , logits_id . end ( ) , [ ] ( const pair_type & a , const pair_type & b ) {
2022-12-17 15:58:08 +00:00
return a . first > b . first ;
} ) ;
}
// softmax
{
2023-01-15 09:29:57 +00:00
const auto max = logits_id [ 0 ] . first ;
double sum = 0.0f ;
for ( auto & kv : logits_id ) {
kv . first = exp ( kv . first - max ) ;
sum + = kv . first ;
2022-12-17 15:58:08 +00:00
}
2023-01-15 09:29:57 +00:00
for ( auto & kv : logits_id ) {
kv . first / = sum ;
2022-12-17 15:58:08 +00:00
}
}
{
2023-01-15 09:29:57 +00:00
for ( const auto & prob : logits_id ) {
2022-12-17 15:58:08 +00:00
if ( lang_probs ) {
2022-12-31 22:58:30 +00:00
lang_probs [ prob . second ] = prob . first ;
2022-12-17 15:58:08 +00:00
}
2022-12-31 22:58:30 +00:00
//printf("%s: lang %2d (%3s): %f\n", __func__, prob.second, whisper_lang_str(prob.second), prob.first);
2022-12-17 15:58:08 +00:00
}
}
2023-01-15 09:29:57 +00:00
return logits_id [ 0 ] . second ;
2022-12-17 15:58:08 +00:00
}
2023-03-05 19:42:19 +00:00
int whisper_lang_auto_detect (
struct whisper_context * ctx ,
int offset_ms ,
int n_threads ,
float * lang_probs ) {
return whisper_lang_auto_detect_with_state ( ctx , ctx - > state , offset_ms , n_threads , lang_probs ) ;
}
2023-03-22 19:37:36 +00:00
int whisper_model_n_vocab ( struct whisper_context * ctx ) {
return ctx - > model . hparams . n_vocab ;
}
int whisper_model_n_audio_ctx ( struct whisper_context * ctx ) {
return ctx - > model . hparams . n_audio_ctx ;
}
int whisper_model_n_audio_state ( struct whisper_context * ctx ) {
return ctx - > model . hparams . n_audio_state ;
}
int whisper_model_n_audio_head ( struct whisper_context * ctx ) {
return ctx - > model . hparams . n_audio_head ;
}
int whisper_model_n_audio_layer ( struct whisper_context * ctx ) {
return ctx - > model . hparams . n_audio_layer ;
}
int whisper_model_n_text_ctx ( struct whisper_context * ctx ) {
return ctx - > model . hparams . n_text_ctx ;
}
int whisper_model_n_text_state ( struct whisper_context * ctx ) {
return ctx - > model . hparams . n_text_state ;
}
int whisper_model_n_text_head ( struct whisper_context * ctx ) {
return ctx - > model . hparams . n_text_head ;
}
int whisper_model_n_text_layer ( struct whisper_context * ctx ) {
return ctx - > model . hparams . n_text_layer ;
}
int whisper_model_n_mels ( struct whisper_context * ctx ) {
return ctx - > model . hparams . n_mels ;
}
2023-04-30 15:51:57 +00:00
int whisper_model_ftype ( struct whisper_context * ctx ) {
return ctx - > model . hparams . ftype ;
2023-03-22 19:37:36 +00:00
}
int whisper_model_type ( struct whisper_context * ctx ) {
return ctx - > model . type ;
}
const char * whisper_model_type_readable ( struct whisper_context * ctx ) {
switch ( ctx - > model . type ) {
case e_model : : MODEL_TINY :
return " tiny " ;
case e_model : : MODEL_BASE :
return " base " ;
case e_model : : MODEL_SMALL :
return " small " ;
case e_model : : MODEL_MEDIUM :
return " medium " ;
case e_model : : MODEL_LARGE :
return " large " ;
default :
return " unknown " ;
}
}
2023-03-05 19:42:19 +00:00
int whisper_n_len_from_state ( struct whisper_state * state ) {
2023-04-15 14:18:43 +00:00
return state - > mel . n_len_org ;
2023-03-05 19:42:19 +00:00
}
2022-10-04 17:35:01 +00:00
int whisper_n_len ( struct whisper_context * ctx ) {
2023-04-15 14:18:43 +00:00
return ctx - > state - > mel . n_len_org ;
2022-10-04 17:35:01 +00:00
}
int whisper_n_vocab ( struct whisper_context * ctx ) {
return ctx - > vocab . n_vocab ;
}
int whisper_n_text_ctx ( struct whisper_context * ctx ) {
return ctx - > model . hparams . n_text_ctx ;
}
2022-12-31 07:55:33 +00:00
int whisper_n_audio_ctx ( struct whisper_context * ctx ) {
return ctx - > model . hparams . n_audio_ctx ;
}
2022-10-04 17:35:01 +00:00
int whisper_is_multilingual ( struct whisper_context * ctx ) {
return ctx - > vocab . is_multilingual ( ) ? 1 : 0 ;
}
2023-01-15 09:29:57 +00:00
float * whisper_get_logits ( struct whisper_context * ctx ) {
2023-03-05 19:42:19 +00:00
return ctx - > state - > logits . data ( ) ;
}
float * whisper_get_logits_from_state ( struct whisper_state * state ) {
return state - > logits . data ( ) ;
2022-10-04 17:35:01 +00:00
}
const char * whisper_token_to_str ( struct whisper_context * ctx , whisper_token token ) {
return ctx - > vocab . id_to_token . at ( token ) . c_str ( ) ;
}
// Accessor for vocab.token_eot.
whisper_token whisper_token_eot(struct whisper_context * ctx) {
    const auto & vocab = ctx->vocab;
    return vocab.token_eot;
}
// Accessor for vocab.token_sot.
whisper_token whisper_token_sot(struct whisper_context * ctx) {
    const auto & vocab = ctx->vocab;
    return vocab.token_sot;
}
2023-07-04 06:45:00 +00:00
// Accessor for vocab.token_solm.
whisper_token whisper_token_solm(struct whisper_context * ctx) {
    const auto & vocab = ctx->vocab;
    return vocab.token_solm;
}
2022-10-04 17:35:01 +00:00
// Accessor for vocab.token_prev.
whisper_token whisper_token_prev(struct whisper_context * ctx) {
    const auto & vocab = ctx->vocab;
    return vocab.token_prev;
}
2023-07-04 06:45:00 +00:00
// Accessor for vocab.token_nosp.
whisper_token whisper_token_nosp(struct whisper_context * ctx) {
    const auto & vocab = ctx->vocab;
    return vocab.token_nosp;
}
// Accessor for vocab.token_not.
whisper_token whisper_token_not(struct whisper_context * ctx) {
    const auto & vocab = ctx->vocab;
    return vocab.token_not;
}
// Accessor for vocab.token_beg.
whisper_token whisper_token_beg(struct whisper_context * ctx) {
    const auto & vocab = ctx->vocab;
    return vocab.token_beg;
}
2022-12-17 15:58:08 +00:00
// Token for language `lang_id`, computed as sot + 1 + lang_id.
whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id) {
    const whisper_token sot = whisper_token_sot(ctx);
    return sot + 1 + lang_id;
}
2023-07-04 06:45:00 +00:00
// Accessor for vocab.token_translate.
whisper_token whisper_token_translate(struct whisper_context * ctx) {
    const auto & vocab = ctx->vocab;
    return vocab.token_translate;
}
2023-07-04 06:45:00 +00:00
// Accessor for vocab.token_transcribe.
whisper_token whisper_token_transcribe(struct whisper_context * ctx) {
    const auto & vocab = ctx->vocab;
    return vocab.token_transcribe;
}
void whisper_print_timings ( struct whisper_context * ctx ) {
const int64_t t_end_us = ggml_time_us ( ) ;
2022-10-08 14:28:06 +00:00
fprintf ( stderr , " \n " ) ;
2023-03-05 19:42:19 +00:00
fprintf ( stderr , " %s: load time = %8.2f ms \n " , __func__ , ctx - > t_load_us / 1000.0f ) ;
if ( ctx - > state ! = nullptr ) {
const int32_t n_sample = std : : max ( 1 , ctx - > state - > n_sample ) ;
const int32_t n_encode = std : : max ( 1 , ctx - > state - > n_encode ) ;
const int32_t n_decode = std : : max ( 1 , ctx - > state - > n_decode ) ;
fprintf ( stderr , " %s: fallbacks = %3d p / %3d h \n " , __func__ , ctx - > state - > n_fail_p , ctx - > state - > n_fail_h ) ;
fprintf ( stderr , " %s: mel time = %8.2f ms \n " , __func__ , ctx - > state - > t_mel_us / 1000.0f ) ;
fprintf ( stderr , " %s: sample time = %8.2f ms / %5d runs (%8.2f ms per run) \n " , __func__ , 1e-3 f * ctx - > state - > t_sample_us , n_sample , 1e-3 f * ctx - > state - > t_sample_us / n_sample ) ;
fprintf ( stderr , " %s: encode time = %8.2f ms / %5d runs (%8.2f ms per run) \n " , __func__ , 1e-3 f * ctx - > state - > t_encode_us , n_encode , 1e-3 f * ctx - > state - > t_encode_us / n_encode ) ;
fprintf ( stderr , " %s: decode time = %8.2f ms / %5d runs (%8.2f ms per run) \n " , __func__ , 1e-3 f * ctx - > state - > t_decode_us , n_decode , 1e-3 f * ctx - > state - > t_decode_us / n_decode ) ;
}
2022-10-08 14:28:06 +00:00
fprintf ( stderr , " %s: total time = %8.2f ms \n " , __func__ , ( t_end_us - ctx - > t_start_us ) / 1000.0f ) ;
2022-10-04 17:35:01 +00:00
}
2022-10-04 19:43:37 +00:00
2022-11-25 21:07:42 +00:00
void whisper_reset_timings ( struct whisper_context * ctx ) {
2023-03-05 19:42:19 +00:00
if ( ctx - > state ! = nullptr ) {
ctx - > state - > t_sample_us = 0 ;
ctx - > state - > t_encode_us = 0 ;
ctx - > state - > t_decode_us = 0 ;
}
2022-11-25 21:07:42 +00:00
}
2023-04-15 10:21:27 +00:00
// 1 when this file was compiled with WHISPER_USE_COREML defined, 0 otherwise.
static int whisper_has_coreml(void) {
    int has_coreml = 0;
#ifdef WHISPER_USE_COREML
    has_coreml = 1;
#endif
    return has_coreml;
}
2022-11-26 15:28:28 +00:00
// Build a one-line summary of the available CPU/backend features.
//
// Each entry is "<NAME> = <0|1>" followed by a separator, where the value comes from the matching
// ggml_cpu_has_*() query (or whisper_has_coreml() for COREML).
//
// The returned pointer refers to the buffer of a function-local static std::string that is
// rebuilt on every call, so a later call overwrites the text returned by an earlier one.
// NOTE(review): the shared static buffer means concurrent calls would race - confirm callers
// only use this from one thread at a time.
const char * whisper_print_system_info ( void ) {
static std : : string s ;
// start from scratch on every call
s = " " ;
s + = " AVX = " + std : : to_string ( ggml_cpu_has_avx ( ) ) + " | " ;
s + = " AVX2 = " + std : : to_string ( ggml_cpu_has_avx2 ( ) ) + " | " ;
s + = " AVX512 = " + std : : to_string ( ggml_cpu_has_avx512 ( ) ) + " | " ;
2022-12-24 08:22:28 +00:00
s + = " FMA = " + std : : to_string ( ggml_cpu_has_fma ( ) ) + " | " ;
2022-11-26 15:28:28 +00:00
s + = " NEON = " + std : : to_string ( ggml_cpu_has_neon ( ) ) + " | " ;
2022-12-22 14:43:57 +00:00
s + = " ARM_FMA = " + std : : to_string ( ggml_cpu_has_arm_fma ( ) ) + " | " ;
2022-12-06 19:56:56 +00:00
s + = " F16C = " + std : : to_string ( ggml_cpu_has_f16c ( ) ) + " | " ;
2022-11-26 15:28:28 +00:00
s + = " FP16_VA = " + std : : to_string ( ggml_cpu_has_fp16_va ( ) ) + " | " ;
s + = " WASM_SIMD = " + std : : to_string ( ggml_cpu_has_wasm_simd ( ) ) + " | " ;
s + = " BLAS = " + std : : to_string ( ggml_cpu_has_blas ( ) ) + " | " ;
2023-01-06 16:45:59 +00:00
s + = " SSE3 = " + std : : to_string ( ggml_cpu_has_sse3 ( ) ) + " | " ;
2023-01-05 04:00:30 +00:00
s + = " VSX = " + std : : to_string ( ggml_cpu_has_vsx ( ) ) + " | " ;
2023-04-15 10:21:27 +00:00
s + = " COREML = " + std : : to_string ( whisper_has_coreml ( ) ) + " | " ;
2022-11-26 15:28:28 +00:00
// the static string outlives the call, so the pointer stays valid until the next call
return s . c_str ( ) ;
}
2022-10-04 19:43:37 +00:00
////////////////////////////////////////////////////////////////////////////
2023-05-28 23:38:58 +00:00
// Heap allocated copy of whisper_full_default_params(strategy).
// The result is created with `new`; whisper_free_params() releases it.
struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy) {
    return new whisper_full_params(whisper_full_default_params(strategy));
}
2022-10-18 15:17:24 +00:00
struct whisper_full_params whisper_full_default_params ( enum whisper_sampling_strategy strategy ) {
2023-01-15 09:29:57 +00:00
struct whisper_full_params result = {
2023-07-04 06:45:00 +00:00
/*.strategy =*/ strategy ,
/*.n_threads =*/ std : : min ( 4 , ( int32_t ) std : : thread : : hardware_concurrency ( ) ) ,
/*.n_max_text_ctx =*/ 16384 ,
/*.offset_ms =*/ 0 ,
/*.duration_ms =*/ 0 ,
/*.translate =*/ false ,
/*.no_context =*/ true ,
/*.single_segment =*/ false ,
/*.print_special =*/ false ,
/*.print_progress =*/ true ,
/*.print_realtime =*/ false ,
/*.print_timestamps =*/ true ,
/*.token_timestamps =*/ false ,
/*.thold_pt =*/ 0.01f ,
/*.thold_ptsum =*/ 0.01f ,
/*.max_len =*/ 0 ,
/*.split_on_word =*/ false ,
/*.max_tokens =*/ 0 ,
/*.speed_up =*/ false ,
/*.audio_ctx =*/ 0 ,
/*.tdrz_enable =*/ false ,
/*.initial_prompt =*/ nullptr ,
/*.prompt_tokens =*/ nullptr ,
/*.prompt_n_tokens =*/ 0 ,
/*.language =*/ " en " ,
/*.detect_language =*/ false ,
/*.suppress_blank =*/ true ,
2023-02-15 19:48:49 +00:00
/*.suppress_non_speech_tokens =*/ false ,
2023-01-15 09:29:57 +00:00
2023-07-04 06:45:00 +00:00
/*.temperature =*/ 0.0f ,
/*.max_initial_ts =*/ 1.0f ,
/*.length_penalty =*/ - 1.0f ,
2023-01-15 09:29:57 +00:00
2023-07-04 06:45:00 +00:00
/*.temperature_inc =*/ 0.4f ,
/*.entropy_thold =*/ 2.4f ,
/*.logprob_thold =*/ - 1.0f ,
/*.no_speech_thold =*/ 0.6f ,
2023-01-15 09:29:57 +00:00
2023-07-04 06:45:00 +00:00
/*.greedy =*/ {
2023-01-15 09:29:57 +00:00
/*.best_of =*/ - 1 ,
} ,
/*.beam_search =*/ {
/*.beam_size =*/ - 1 ,
/*.patience =*/ - 1.0f ,
} ,
/*.new_segment_callback =*/ nullptr ,
/*.new_segment_callback_user_data =*/ nullptr ,
2023-03-30 17:29:29 +00:00
/*.progress_callback =*/ nullptr ,
/*.progress_callback_user_data =*/ nullptr ,
2023-01-15 09:29:57 +00:00
/*.encoder_begin_callback =*/ nullptr ,
/*.encoder_begin_callback_user_data =*/ nullptr ,
2023-02-19 16:35:01 +00:00
/*.logits_filter_callback =*/ nullptr ,
/*.logits_filter_callback_user_data =*/ nullptr ,
2023-01-15 09:29:57 +00:00
} ;
2022-10-04 19:43:37 +00:00
switch ( strategy ) {
2022-10-18 15:17:24 +00:00
case WHISPER_SAMPLING_GREEDY :
2022-10-04 19:43:37 +00:00
{
2023-01-15 09:29:57 +00:00
result . greedy = {
2023-04-15 13:04:07 +00:00
/*.best_of =*/ 2 , // TODO: increase to 5 when we speed-up batch decoding
2022-10-04 19:43:37 +00:00
} ;
} break ;
2022-10-18 15:17:24 +00:00
case WHISPER_SAMPLING_BEAM_SEARCH :
2022-10-04 19:43:37 +00:00
{
2023-01-15 09:29:57 +00:00
result . beam_search = {
2023-04-15 13:04:07 +00:00
/*.beam_size =*/ 2 , // TODO: increase to 5 when we speed-up batch decoding
2023-01-15 09:29:57 +00:00
/*.patience =*/ - 1.0f ,
2022-10-04 19:43:37 +00:00
} ;
} break ;
}
return result ;
}
2022-10-07 19:30:44 +00:00
2022-11-02 19:18:20 +00:00
// forward declarations

static std::vector<float> get_signal_energy(const float * signal, int n_samples, int n_samples_per_half_window);

static void whisper_exp_compute_token_level_timestamps(
        struct whisper_context & ctx,
          struct whisper_state & state,
                           int   i_segment,
                         float   thold_pt,
                         float   thold_ptsum);
2022-11-02 19:18:20 +00:00
2023-02-05 12:44:23 +00:00
// Decide whether a segment may be split right before the token text `txt`.
//
// When `split_on_word` is false every token is an acceptable split point.
// Otherwise a split is only allowed before a token whose text starts with a
// space character.
static inline bool should_split_on_word(const char * txt, bool split_on_word) {
    return !split_on_word || txt[0] == ' ';
}
2022-11-02 19:18:20 +00:00
// wrap the last segment to max_len characters
// returns the number of new segments
2023-03-05 19:42:19 +00:00
static int whisper_wrap_segment ( struct whisper_context & ctx , struct whisper_state & state , int max_len , bool split_on_word ) {
auto segment = state . result_all . back ( ) ;
2022-11-02 19:18:20 +00:00
int res = 1 ;
int acc = 0 ;
std : : string text ;
for ( int i = 0 ; i < ( int ) segment . tokens . size ( ) ; i + + ) {
const auto & token = segment . tokens [ i ] ;
2023-01-15 09:29:57 +00:00
if ( token . id > = whisper_token_eot ( & ctx ) ) {
2022-11-02 19:18:20 +00:00
continue ;
}
2023-01-15 09:29:57 +00:00
const auto txt = whisper_token_to_str ( & ctx , token . id ) ;
2022-11-02 19:18:20 +00:00
const int cur = strlen ( txt ) ;
2023-02-05 12:44:23 +00:00
if ( acc + cur > max_len & & i > 0 & & should_split_on_word ( txt , split_on_word ) ) {
2023-03-05 19:42:19 +00:00
state . result_all . back ( ) . text = std : : move ( text ) ;
state . result_all . back ( ) . t1 = token . t0 ;
state . result_all . back ( ) . tokens . resize ( i ) ;
2023-07-04 06:45:00 +00:00
state . result_all . back ( ) . speaker_turn_next = false ;
2022-11-02 19:18:20 +00:00
2023-03-05 19:42:19 +00:00
state . result_all . push_back ( { } ) ;
state . result_all . back ( ) . t0 = token . t0 ;
state . result_all . back ( ) . t1 = segment . t1 ;
2022-11-02 19:18:20 +00:00
// add tokens [i, end] to the new segment
2023-03-05 19:42:19 +00:00
state . result_all . back ( ) . tokens . insert (
state . result_all . back ( ) . tokens . end ( ) ,
2022-11-02 19:18:20 +00:00
segment . tokens . begin ( ) + i ,
segment . tokens . end ( ) ) ;
2023-07-04 06:45:00 +00:00
state . result_all . back ( ) . speaker_turn_next = segment . speaker_turn_next ;
2022-11-02 19:18:20 +00:00
acc = 0 ;
text = " " ;
2023-03-05 19:42:19 +00:00
segment = state . result_all . back ( ) ;
2022-11-02 19:18:20 +00:00
i = - 1 ;
res + + ;
} else {
acc + = cur ;
text + = txt ;
}
}
2023-03-05 19:42:19 +00:00
state . result_all . back ( ) . text = std : : move ( text ) ;
2022-11-02 19:18:20 +00:00
return res ;
}
2023-02-15 19:48:49 +00:00
// Token texts that are masked out when params.suppress_non_speech_tokens is set
// (see whisper_process_logits, which also tries each entry with a " " prefix).
static const std::vector<std::string> non_speech_tokens = {
    // single punctuation / symbol characters
    " \" ", " # ", " ( ", " ) ", " * ", " + ", " / ", " : ", " ; ", " < ", " = ", " > ", " @ ",
    " [ ", " \\ ", " ] ", " ^ ", " _ ", " ` ", " { ", " | ", " } ", " ~ ",
    // CJK quotation brackets
    " 「 ", " 」 ", " 『 ", " 』 ",
    // repeated / combined symbols
    " << ", " >> ", " <<< ", " >>> ", " -- ", " --- ", " -( ", " -[ ", " (' ", " ( \" ",
    " (( ", " )) ", " ((( ", " ))) ", " [[ ", " ]] ", " {{ ", " }} ",
    // musical symbols
    " ♪♪ ", " ♪♪♪ ", " ♩ ", " ♪ ", " ♫ ", " ♬ ", " ♭ ", " ♮ ", " ♯ "
};
2023-01-15 09:29:57 +00:00
// process the logits for the selected decoder
// - applies logit filters
// - computes logprobs and probs
static void whisper_process_logits (
2023-02-19 16:35:01 +00:00
struct whisper_context & ctx ,
2023-03-05 19:42:19 +00:00
struct whisper_state & state ,
2023-01-15 09:29:57 +00:00
const struct whisper_full_params params ,
struct whisper_decoder & decoder ,
float temperature ) {
const auto & vocab = ctx . vocab ;
const auto & tokens_cur = decoder . sequence . tokens ;
const bool is_initial = tokens_cur . size ( ) = = 0 ;
const int n_logits = vocab . id_to_token . size ( ) ;
WHISPER_ASSERT ( n_logits = = ctx . vocab . n_vocab ) ;
// extract the logits for the last token
// we will be mutating and therefore we don't want to use the ctx.logits buffer directly
auto & probs = decoder . probs ;
auto & logits = decoder . logits ;
auto & logprobs = decoder . logprobs ;
{
logits . resize ( n_logits ) ;
2023-03-05 19:42:19 +00:00
memcpy ( logits . data ( ) , state . logits . data ( ) + ( state . logits . size ( ) - n_logits ) , n_logits * sizeof ( float ) ) ;
2022-10-15 06:55:28 +00:00
2023-01-15 09:29:57 +00:00
if ( temperature > 0.0f ) {
for ( int i = 0 ; i < n_logits ; i + + ) {
logits [ i ] / = temperature ;
}
2022-11-12 16:03:49 +00:00
}
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
// will be populated a bit later
probs . resize ( n_logits ) ;
logprobs . resize ( n_logits ) ;
}
2022-12-17 15:58:08 +00:00
2023-01-15 09:29:57 +00:00
// apply logit filters here
// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L480-L493
{
// suppress blank
// https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L388-L390
if ( params . suppress_blank ) {
if ( is_initial ) {
logits [ vocab . token_eot ] = - INFINITY ;
logits [ vocab . token_to_id . at ( " " ) ] = - INFINITY ;
}
2022-12-17 15:58:08 +00:00
}
2023-01-15 09:29:57 +00:00
// suppress <|notimestamps|> token
// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L410-L412
logits [ vocab . token_not ] = - INFINITY ;
2022-12-17 15:58:08 +00:00
2023-07-04 06:45:00 +00:00
// suppress sot and nosp tokens
2023-01-15 09:29:57 +00:00
logits [ vocab . token_sot ] = - INFINITY ;
2023-07-04 06:45:00 +00:00
logits [ vocab . token_nosp ] = - INFINITY ; // TODO: ignore this token for now
// [TDRZ] when tinydiarize is disabled, suppress solm token
if ( params . tdrz_enable = = false ) {
logits [ vocab . token_solm ] = - INFINITY ;
}
2023-01-15 09:29:57 +00:00
2023-02-04 07:03:14 +00:00
// suppress task tokens
logits [ vocab . token_translate ] = - INFINITY ;
logits [ vocab . token_transcribe ] = - INFINITY ;
2023-02-19 16:35:01 +00:00
if ( params . logits_filter_callback ) {
2023-03-05 19:42:19 +00:00
params . logits_filter_callback ( & ctx , & state , tokens_cur . data ( ) , tokens_cur . size ( ) , logits . data ( ) , params . logits_filter_callback_user_data ) ;
2023-02-19 16:35:01 +00:00
}
2023-02-08 07:05:34 +00:00
// suppress non-speech tokens
// ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
2023-02-15 19:48:49 +00:00
if ( params . suppress_non_speech_tokens ) {
for ( const std : : string & token : non_speech_tokens ) {
const std : : string suppress_tokens [ ] = { token , " " + token } ;
for ( const std : : string & suppress_token : suppress_tokens ) {
if ( vocab . token_to_id . find ( suppress_token ) ! = vocab . token_to_id . end ( ) ) {
2023-02-08 07:05:34 +00:00
logits [ vocab . token_to_id . at ( suppress_token ) ] = - INFINITY ;
}
}
}
2023-02-15 19:48:49 +00:00
2023-02-08 07:05:34 +00:00
// allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
2023-02-15 19:48:49 +00:00
if ( vocab . token_to_id . find ( " - " ) ! = vocab . token_to_id . end ( ) ) {
2023-02-08 07:05:34 +00:00
logits [ vocab . token_to_id . at ( " - " ) ] = - INFINITY ;
}
2023-02-15 19:48:49 +00:00
if ( vocab . token_to_id . find ( " ' " ) ! = vocab . token_to_id . end ( ) ) {
2023-02-08 07:05:34 +00:00
logits [ vocab . token_to_id . at ( " ' " ) ] = - INFINITY ;
}
}
2023-01-15 09:29:57 +00:00
// timestamps have to appear in pairs, except directly before EOT; mask logits accordingly
// https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L414-L424
{
const bool last_was_timestamp = tokens_cur . size ( ) > 0 & & tokens_cur . back ( ) . id > = vocab . token_beg ;
const bool penultimate_was_timestamp = tokens_cur . size ( ) < 2 | | tokens_cur [ tokens_cur . size ( ) - 2 ] . id > = vocab . token_beg ;
//fprintf(stderr, "last_was_timestamp=%d penultimate_was_timestamp=%d\n", last_was_timestamp, penultimate_was_timestamp);
if ( last_was_timestamp ) {
if ( penultimate_was_timestamp ) {
for ( int i = vocab . token_beg ; i < n_logits ; + + i ) {
logits [ i ] = - INFINITY ;
}
} else {
for ( int i = 0 ; i < vocab . token_eot ; + + i ) {
logits [ i ] = - INFINITY ;
}
}
}
}
// the initial timestamp cannot be larger than max_initial_ts
// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L426-L429
if ( is_initial & & params . max_initial_ts > 0.0f ) {
const float precision = float ( WHISPER_CHUNK_SIZE ) / ctx . model . hparams . n_audio_ctx ;
const int tid0 = std : : round ( params . max_initial_ts / precision ) ;
for ( int i = vocab . token_beg + tid0 + 1 ; i < n_logits ; + + i ) {
logits [ i ] = - INFINITY ;
}
}
2023-01-23 18:48:26 +00:00
// condition timestamp tokens to be increasing
// ref: https://github.com/openai/whisper/pull/831#issuecomment-1385910556
if ( decoder . has_ts ) {
const int tid0 = decoder . seek_delta / 2 ;
for ( int i = vocab . token_beg ; i < vocab . token_beg + tid0 ; + + i ) {
logits [ i ] = - INFINITY ;
}
}
2023-01-15 09:29:57 +00:00
// populate the logprobs array (log_softmax)
{
const float logit_max = * std : : max_element ( logits . begin ( ) , logits . end ( ) ) ;
float logsumexp = 0.0f ;
for ( int i = 0 ; i < n_logits ; + + i ) {
if ( logits [ i ] > - INFINITY ) {
logsumexp + = expf ( logits [ i ] - logit_max ) ;
}
}
logsumexp = logf ( logsumexp ) + logit_max ;
for ( int i = 0 ; i < n_logits ; + + i ) {
if ( logits [ i ] > - INFINITY ) {
logprobs [ i ] = logits [ i ] - logsumexp ;
} else {
logprobs [ i ] = - INFINITY ;
}
}
}
// if sum of probability over timestamps is above any other token, sample timestamp
// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L431-L437
{
// logsumexp over timestamps
float timestamp_logprob = - INFINITY ;
{
float logsumexp = 0.0f ;
const float logprob_max = * std : : max_element ( logprobs . begin ( ) + vocab . token_beg , logprobs . end ( ) ) ;
for ( int i = vocab . token_beg ; i < n_logits ; + + i ) {
if ( logprobs [ i ] > - INFINITY ) {
logsumexp + = expf ( logprobs [ i ] - logprob_max ) ;
}
}
if ( logsumexp > 0.0f ) {
timestamp_logprob = logf ( logsumexp ) + logprob_max ;
}
}
const float max_text_token_logprob = * std : : max_element ( logprobs . begin ( ) , logprobs . begin ( ) + vocab . token_beg ) ;
//fprintf(stderr, "timestamp_logprob=%f max_text_token_logprob=%f\n", timestamp_logprob, max_text_token_logprob);
if ( timestamp_logprob > max_text_token_logprob ) {
for ( int i = 0 ; i < vocab . token_beg ; + + i ) {
logits [ i ] = - INFINITY ;
logprobs [ i ] = - INFINITY ;
}
}
}
}
// compute probs
{
for ( int i = 0 ; i < n_logits ; + + i ) {
if ( logits [ i ] = = - INFINITY ) {
probs [ i ] = 0.0f ;
} else {
probs [ i ] = expf ( logprobs [ i ] ) ;
}
}
}
#if 0
// print first 100 logits - token string : logit
for ( int i = 0 ; i < 100 ; i + + ) {
const auto token = vocab . id_to_token . at ( i ) ;
const auto prob = probs [ i ] ;
const auto logit = logits [ i ] ;
const auto logprob = logprobs [ i ] ;
printf ( " %s : prob=%9.5f logit=%9.5f logprob=%9.5f \n " , token . c_str ( ) , prob , logit , logprob ) ;
}
// "And", "and", " And", " and"
printf ( " logits[ \" and \" ] = %f \n " , logits [ vocab . token_to_id . at ( " and " ) ] ) ;
printf ( " logits[ \" And \" ] = %f \n " , logits [ vocab . token_to_id . at ( " And " ) ] ) ;
printf ( " logits[ \" and \" ] = %f \n " , logits [ vocab . token_to_id . at ( " and " ) ] ) ;
printf ( " logits[ \" And \" ] = %f \n " , logits [ vocab . token_to_id . at ( " And " ) ] ) ;
printf ( " logits[ \" so \" ] = %f \n " , logits [ vocab . token_to_id . at ( " so " ) ] ) ;
printf ( " logprobs[ \" and \" ] = %f \n " , logprobs [ vocab . token_to_id . at ( " and " ) ] ) ;
printf ( " logprobs[ \" And \" ] = %f \n " , logprobs [ vocab . token_to_id . at ( " And " ) ] ) ;
printf ( " logprobs[ \" and \" ] = %f \n " , logprobs [ vocab . token_to_id . at ( " and " ) ] ) ;
printf ( " logprobs[ \" And \" ] = %f \n " , logprobs [ vocab . token_to_id . at ( " And " ) ] ) ;
printf ( " logprobs[ \" so \" ] = %f \n " , logprobs [ vocab . token_to_id . at ( " so " ) ] ) ;
printf ( " probs[ \" and \" ] = %f \n " , probs [ vocab . token_to_id . at ( " and " ) ] ) ;
printf ( " probs[ \" And \" ] = %f \n " , probs [ vocab . token_to_id . at ( " And " ) ] ) ;
printf ( " probs[ \" and \" ] = %f \n " , probs [ vocab . token_to_id . at ( " and " ) ] ) ;
printf ( " probs[ \" And \" ] = %f \n " , probs [ vocab . token_to_id . at ( " And " ) ] ) ;
printf ( " probs[ \" so \" ] = %f \n " , probs [ vocab . token_to_id . at ( " so " ) ] ) ;
# endif
}
// Pick the next token for `decoder` from its already computed probs/logprobs.
//
//   best == true  : take the token with the highest probability
//   best == false : draw a token from the discrete distribution defined by
//                   decoder.probs, using state.rng
//
// The result also carries timestamp statistics over the ids >= vocab.token_beg:
// `tid` (most probable timestamp token), `pt` (its probability relative to the
// total timestamp probability) and `ptsum` (that total). When the selected token
// is itself a timestamp token, `tid`/`pt` are replaced by the token's own id and
// probability.
//
// Increments state.n_sample.
static whisper_token_data whisper_sample_token(
            whisper_context & ctx,
              whisper_state & state,
      const whisper_decoder & decoder,
                       bool   best) {
    whisper_token_data result = {
        0, 0, 0.0f, 0.0f, 0.0f, 0.0f, -1, -1, 0.0f,
    };

    const auto & vocab    = ctx.vocab;
    const auto & probs    = decoder.probs;
    const auto & logprobs = decoder.logprobs;

    const int n_logits = vocab.n_vocab;

    // timestamp statistics
    {
        double ts_total = 0.0;
        double ts_peak  = 0.0;

        for (int i = vocab.token_beg; i < n_logits; i++) {
            if (probs[i] != -INFINITY) {
                ts_total += probs[i];
                if (ts_peak < probs[i]) {
                    ts_peak    = probs[i];
                    result.tid = i;
                }
            }
        }

        // the small epsilon keeps the division defined when no timestamp has any mass
        result.pt    = ts_peak/(ts_total + 1e-10);
        result.ptsum = ts_total;
    }

    if (!best) {
        // stochastic sampling
        std::discrete_distribution<> dist(probs.begin(), probs.end());

        result.id   = dist(state.rng);
        result.p    = probs[result.id];
        result.plog = logprobs[result.id];
    } else {
        // arg-max (the first of equally probable tokens wins)
        for (int i = 0; i < n_logits; ++i) {
            if (result.p < probs[i]) {
                result.id   = i;
                result.p    = probs[i];
                result.plog = logprobs[i];
            }
        }
    }

    // the selected token is a timestamp: report it as the timestamp token as well
    if (result.id >= vocab.token_beg) {
        result.tid = result.id;
        result.pt  = result.p;
    }

    state.n_sample++;

    return result;
}
static std : : vector < whisper_token_data > whisper_sample_token_topk (
whisper_context & ctx ,
2023-03-05 19:42:19 +00:00
whisper_state & state ,
2023-01-15 09:29:57 +00:00
const whisper_decoder & decoder ,
int k ) {
const auto & vocab = ctx . vocab ;
const auto & probs = decoder . probs ;
const auto & logits = decoder . logits ;
const auto & logprobs = decoder . logprobs ;
const int n_logits = vocab . n_vocab ;
2023-03-05 19:42:19 +00:00
auto & logits_id = state . logits_id ;
2023-01-15 09:29:57 +00:00
logits_id . clear ( ) ;
for ( int i = 0 ; i < n_logits ; + + i ) {
logits_id . push_back ( { logits [ i ] , i } ) ;
}
std : : partial_sort (
logits_id . begin ( ) ,
logits_id . begin ( ) + k , logits_id . end ( ) ,
[ ] ( const std : : pair < double , whisper_token > & a , const std : : pair < double , whisper_token > & b ) {
return a . first > b . first ;
} ) ;
std : : vector < whisper_token_data > result ;
result . reserve ( k ) ;
2023-01-16 19:44:40 +00:00
whisper_token tid = vocab . token_beg ;
2023-01-15 09:29:57 +00:00
2023-01-16 19:44:40 +00:00
float pt = 0.0 ;
float ptsum = 0.0 ;
2023-01-15 09:29:57 +00:00
{
double sum_ts = 0.0 ;
double max_ts = 0.0 ;
for ( int i = vocab . token_beg ; i < n_logits ; i + + ) {
if ( probs [ i ] = = - INFINITY ) {
continue ;
}
sum_ts + = probs [ i ] ;
if ( max_ts < probs [ i ] ) {
max_ts = probs [ i ] ;
tid = i ;
}
}
pt = max_ts / ( sum_ts + 1e-10 ) ;
ptsum = sum_ts ;
}
for ( int i = 0 ; i < k ; + + i ) {
const auto id = logits_id [ i ] . second ;
result . push_back ( { id , tid , probs [ id ] , logprobs [ id ] , pt , ptsum , - 1 , - 1 , 0.0f , } ) ;
if ( result [ i ] . id > = vocab . token_beg ) {
result [ i ] . tid = result [ i ] . id ;
result [ i ] . pt = result [ i ] . p ;
}
}
2023-03-05 19:42:19 +00:00
state . n_sample + + ;
2023-01-19 16:50:33 +00:00
2023-01-15 09:29:57 +00:00
return result ;
}
// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L178-L192
static void whisper_sequence_score (
const struct whisper_full_params & params ,
whisper_sequence & sequence ) {
if ( sequence . result_len = = 0 ) {
return ;
}
double result = 0.0f ;
for ( int i = 0 ; i < sequence . result_len ; + + i ) {
result + = sequence . tokens [ i ] . plog ;
}
sequence . sum_logprobs = result ;
sequence . avg_logprobs = result / sequence . result_len ;
double penalty = sequence . result_len ;
if ( params . length_penalty > 0.0f ) {
penalty = pow ( ( 5.0 + penalty ) / 6.0 , params . length_penalty ) ;
}
sequence . score = result / penalty ;
// compute the entropy of the sequence of the last 32 tokens
{
const int n = 32 ;
int cnt = 0 ;
double entropy = 0.0f ;
std : : map < whisper_token , int > token_counts ;
for ( int i = std : : max ( 0 , sequence . result_len - n ) ; i < sequence . result_len ; + + i ) {
token_counts [ sequence . tokens [ i ] . id ] + + ;
cnt + + ;
}
for ( const auto & kv : token_counts ) {
const auto p = kv . second / ( double ) cnt ;
entropy - = p * log ( p ) ;
//WHISPER_PRINT_DEBUG("entropy: %d %f %f, count %d\n", kv.first, p, log(p), kv.second);
}
sequence . entropy = entropy ;
}
}
2023-03-05 19:42:19 +00:00
int whisper_full_with_state (
2023-01-15 09:29:57 +00:00
struct whisper_context * ctx ,
2023-03-05 19:42:19 +00:00
struct whisper_state * state ,
struct whisper_full_params params ,
const float * samples ,
int n_samples ) {
2023-01-15 09:29:57 +00:00
// clear old results
2023-03-05 19:42:19 +00:00
auto & result_all = state - > result_all ;
2023-01-15 09:29:57 +00:00
result_all . clear ( ) ;
// compute log mel spectrogram
if ( params . speed_up ) {
2023-03-05 19:42:19 +00:00
if ( whisper_pcm_to_mel_phase_vocoder_with_state ( ctx , state , samples , n_samples , params . n_threads ) ! = 0 ) {
2023-01-15 09:29:57 +00:00
fprintf ( stderr , " %s: failed to compute log mel spectrogram \n " , __func__ ) ;
return - 1 ;
}
} else {
2023-03-05 19:42:19 +00:00
if ( whisper_pcm_to_mel_with_state ( ctx , state , samples , n_samples , params . n_threads ) ! = 0 ) {
2023-01-15 09:29:57 +00:00
fprintf ( stderr , " %s: failed to compute log mel spectrogram \n " , __func__ ) ;
return - 2 ;
}
}
// auto-detect language if not specified
2023-05-02 16:51:52 +00:00
if ( params . language = = nullptr | | strlen ( params . language ) = = 0 | | strcmp ( params . language , " auto " ) = = 0 | | params . detect_language ) {
2023-01-15 09:29:57 +00:00
std : : vector < float > probs ( whisper_lang_max_id ( ) + 1 , 0.0f ) ;
2023-03-05 19:42:19 +00:00
const auto lang_id = whisper_lang_auto_detect_with_state ( ctx , state , 0 , params . n_threads , probs . data ( ) ) ;
2023-01-15 09:29:57 +00:00
if ( lang_id < 0 ) {
fprintf ( stderr , " %s: failed to auto-detect language \n " , __func__ ) ;
return - 3 ;
}
2023-03-05 19:42:19 +00:00
state - > lang_id = lang_id ;
2023-01-15 09:29:57 +00:00
params . language = whisper_lang_str ( lang_id ) ;
fprintf ( stderr , " %s: auto-detected language: %s (p = %f) \n " , __func__ , params . language , probs [ whisper_lang_id ( params . language ) ] ) ;
2023-05-02 16:51:52 +00:00
if ( params . detect_language ) {
return 0 ;
}
2022-12-17 15:58:08 +00:00
}
2022-11-02 19:18:20 +00:00
if ( params . token_timestamps ) {
2023-03-05 19:42:19 +00:00
state - > t_beg = 0 ;
state - > t_last = 0 ;
state - > tid_last = 0 ;
state - > energy = get_signal_energy ( samples , n_samples , 32 ) ;
2022-11-02 19:18:20 +00:00
}
2022-10-29 06:27:08 +00:00
const int seek_start = params . offset_ms / 10 ;
2023-04-29 15:55:37 +00:00
const int seek_end = params . duration_ms = = 0 ? whisper_n_len_from_state ( state ) : seek_start + params . duration_ms / 10 ;
2022-10-29 06:27:08 +00:00
2022-10-11 17:49:48 +00:00
// if length of spectrogram is less than 1s (100 samples), then return
// basically don't process anything that is less than 1s
// see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
2023-01-15 10:41:42 +00:00
if ( seek_end < seek_start + ( params . speed_up ? 50 : 100 ) ) {
2022-10-11 17:49:48 +00:00
return 0 ;
}
2023-01-15 09:29:57 +00:00
// a set of temperatures to use
// [ t0, t0 + delta, t0 + 2*delta, ..., < 1.0f + 1e-6f ]
std : : vector < float > temperatures ;
if ( params . temperature_inc > 0.0f ) {
for ( float t = params . temperature ; t < 1.0f + 1e-6 f ; t + = params . temperature_inc ) {
temperatures . push_back ( t ) ;
}
} else {
temperatures . push_back ( params . temperature ) ;
}
// initialize the decoders
int n_decoders = 1 ;
switch ( params . strategy ) {
case WHISPER_SAMPLING_GREEDY :
{
n_decoders = params . greedy . best_of ;
} break ;
case WHISPER_SAMPLING_BEAM_SEARCH :
{
n_decoders = std : : max ( params . greedy . best_of , params . beam_search . beam_size ) ;
} break ;
} ;
n_decoders = std : : max ( 1 , n_decoders ) ;
// TAGS: WHISPER_DECODER_INIT
for ( int j = 1 ; j < n_decoders ; j + + ) {
2023-03-05 19:42:19 +00:00
auto & decoder = state - > decoders [ j ] ;
2023-01-15 09:29:57 +00:00
if ( decoder . kv_self . ctx = = nullptr ) {
2023-03-05 19:42:19 +00:00
decoder . kv_self = state - > decoders [ 0 ] . kv_self ;
2023-01-15 09:29:57 +00:00
if ( ! kv_cache_reinit ( decoder . kv_self ) ) {
fprintf ( stderr , " %s: kv_cache_reinit() failed for self-attention, decoder %d \n " , __func__ , j ) ;
return - 4 ;
}
WHISPER_PRINT_DEBUG ( " %s: initialized self-attention kv cache, decoder %d \n " , __func__ , j ) ;
2023-03-05 19:42:19 +00:00
decoder . sequence . tokens . reserve ( state - > decoders [ 0 ] . sequence . tokens . capacity ( ) ) ;
2023-01-15 09:29:57 +00:00
decoder . probs . resize ( ctx - > vocab . n_vocab ) ;
decoder . logits . resize ( ctx - > vocab . n_vocab ) ;
decoder . logprobs . resize ( ctx - > vocab . n_vocab ) ;
}
}
2022-10-04 19:43:37 +00:00
// the accumulated text context so far
2023-03-05 19:42:19 +00:00
auto & prompt_past = state - > prompt_past ;
2022-10-07 19:30:44 +00:00
if ( params . no_context ) {
prompt_past . clear ( ) ;
}
2022-10-04 19:43:37 +00:00
2023-04-14 16:16:34 +00:00
// prepare prompt
{
2023-03-29 20:23:23 +00:00
std : : vector < whisper_token > prompt_tokens ;
2023-04-14 16:16:34 +00:00
// initial prompt
if ( ! params . prompt_tokens & & params . initial_prompt ) {
prompt_tokens . resize ( 1024 ) ;
prompt_tokens . resize ( whisper_tokenize ( ctx , params . initial_prompt , prompt_tokens . data ( ) , prompt_tokens . size ( ) ) ) ;
params . prompt_tokens = prompt_tokens . data ( ) ;
params . prompt_n_tokens = prompt_tokens . size ( ) ;
}
// prepend the prompt tokens to the prompt_past
if ( params . prompt_tokens & & params . prompt_n_tokens > 0 ) {
// parse tokens from the pointer
for ( int i = 0 ; i < params . prompt_n_tokens ; i + + ) {
prompt_past . push_back ( params . prompt_tokens [ i ] ) ;
}
std : : rotate ( prompt_past . begin ( ) , prompt_past . end ( ) - params . prompt_n_tokens , prompt_past . end ( ) ) ;
2022-11-22 16:10:35 +00:00
}
}
2022-12-31 07:55:33 +00:00
// overwrite audio_ctx, max allowed is hparams.n_audio_ctx
if ( params . audio_ctx > whisper_n_audio_ctx ( ctx ) ) {
fprintf ( stderr , " %s: audio_ctx is larger than the maximum allowed (%d > %d) \n " , __func__ , params . audio_ctx , whisper_n_audio_ctx ( ctx ) ) ;
2023-01-15 09:29:57 +00:00
return - 5 ;
2022-12-31 07:55:33 +00:00
}
2023-03-05 19:42:19 +00:00
state - > exp_n_audio_ctx = params . audio_ctx ;
2022-11-20 19:12:01 +00:00
2022-10-04 19:43:37 +00:00
// these tokens determine the task that will be performed
std : : vector < whisper_token > prompt_init = { whisper_token_sot ( ctx ) } ;
if ( whisper_is_multilingual ( ctx ) ) {
2022-12-17 15:58:08 +00:00
const int lang_id = whisper_lang_id ( params . language ) ;
2023-03-05 19:42:19 +00:00
state - > lang_id = lang_id ;
2022-12-17 15:58:08 +00:00
prompt_init . push_back ( whisper_token_lang ( ctx , lang_id ) ) ;
2022-10-04 19:43:37 +00:00
if ( params . translate ) {
2023-07-04 06:45:00 +00:00
prompt_init . push_back ( whisper_token_translate ( ctx ) ) ;
2022-10-04 19:43:37 +00:00
} else {
2023-07-04 06:45:00 +00:00
prompt_init . push_back ( whisper_token_transcribe ( ctx ) ) ;
2022-10-04 19:43:37 +00:00
}
}
int progress_prev = 0 ;
int progress_step = 5 ;
2023-01-15 09:29:57 +00:00
int seek = seek_start ;
2022-10-29 06:27:08 +00:00
std : : vector < whisper_token > prompt ;
prompt . reserve ( whisper_n_text_ctx ( ctx ) ) ;
2023-01-15 09:29:57 +00:00
// beam-search helpers
struct kv_buf {
std : : vector < uint8_t > k ;
std : : vector < uint8_t > v ;
} ;
std : : vector < kv_buf > kv_bufs ;
struct beam_candidate {
int decoder_idx ;
int seek_delta ;
bool has_ts ;
whisper_sequence sequence ;
} ;
std : : vector < beam_candidate > beam_candidates ;
2022-10-04 19:43:37 +00:00
// main loop
while ( true ) {
2022-11-07 18:14:52 +00:00
const int progress_cur = ( 100 * ( seek - seek_start ) ) / ( seek_end - seek_start ) ;
2022-10-04 19:43:37 +00:00
while ( progress_cur > = progress_prev + progress_step ) {
progress_prev + = progress_step ;
if ( params . print_progress ) {
2022-10-08 14:28:06 +00:00
fprintf ( stderr , " %s: progress = %3d%% \n " , __func__ , progress_prev ) ;
2022-10-04 19:43:37 +00:00
}
}
2023-03-30 17:29:29 +00:00
if ( params . progress_callback ) {
params . progress_callback (
ctx , ctx - > state , progress_prev , params . progress_callback_user_data ) ;
}
2022-10-04 19:43:37 +00:00
2022-12-16 16:31:17 +00:00
// if only 1 second left, then stop
2022-11-07 18:14:52 +00:00
if ( seek + 100 > = seek_end ) {
2022-10-04 19:43:37 +00:00
break ;
}
2022-11-27 18:28:36 +00:00
if ( params . encoder_begin_callback ) {
2023-03-05 19:42:19 +00:00
if ( params . encoder_begin_callback ( ctx , state , params . encoder_begin_callback_user_data ) = = false ) {
2022-11-27 18:28:36 +00:00
fprintf ( stderr , " %s: encoder_begin_callback returned false - aborting \n " , __func__ ) ;
break ;
}
}
2022-10-04 19:43:37 +00:00
// encode audio features starting at offset seek
2023-03-05 19:42:19 +00:00
if ( ! whisper_encode_internal ( * ctx , * state , seek , params . n_threads ) ) {
2022-10-04 19:43:37 +00:00
fprintf ( stderr , " %s: failed to encode \n " , __func__ ) ;
2023-01-15 09:29:57 +00:00
return - 6 ;
}
// if there is a very short audio segment left to process, we remove any past prompt since it tends
// to confuse the decoder and often make it repeat or hallucinate stuff
if ( seek > seek_start & & seek + 500 > = seek_end ) {
prompt_past . clear ( ) ;
2022-10-04 19:43:37 +00:00
}
2023-01-15 09:29:57 +00:00
int best_decoder_id = 0 ;
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
for ( int it = 0 ; it < ( int ) temperatures . size ( ) ; + + it ) {
const float t_cur = temperatures [ it ] ;
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
int n_decoders_cur = 1 ;
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
switch ( params . strategy ) {
case whisper_sampling_strategy : : WHISPER_SAMPLING_GREEDY :
{
if ( t_cur > 0.0f ) {
n_decoders_cur = params . greedy . best_of ;
}
} break ;
case whisper_sampling_strategy : : WHISPER_SAMPLING_BEAM_SEARCH :
{
if ( t_cur > 0.0f ) {
n_decoders_cur = params . greedy . best_of ;
} else {
n_decoders_cur = params . beam_search . beam_size ;
}
} break ;
} ;
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
n_decoders_cur = std : : max ( 1 , n_decoders_cur ) ;
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
WHISPER_PRINT_DEBUG ( " \n %s: decoding with %d decoders, temperature = %.2f \n " , __func__ , n_decoders_cur , t_cur ) ;
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
// TAGS: WHISPER_DECODER_INIT
for ( int j = 0 ; j < n_decoders_cur ; + + j ) {
2023-03-05 19:42:19 +00:00
auto & decoder = state - > decoders [ j ] ;
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
decoder . kv_self . n = 0 ;
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
decoder . sequence . tokens . clear ( ) ;
decoder . sequence . result_len = 0 ;
decoder . sequence . sum_logprobs_all = 0.0 ;
decoder . sequence . sum_logprobs = - INFINITY ;
decoder . sequence . avg_logprobs = - INFINITY ;
decoder . sequence . entropy = 0.0 ;
decoder . sequence . score = - INFINITY ;
2022-12-02 19:51:50 +00:00
2023-01-15 09:29:57 +00:00
decoder . seek_delta = 100 * WHISPER_CHUNK_SIZE ;
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
decoder . failed = false ;
decoder . completed = false ;
decoder . has_ts = false ;
}
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
// init prompt and kv cache for the current iteration
// run whisper_decoder() only for decoder 0 and copy the results for the other decoders
2022-10-04 19:43:37 +00:00
{
2023-01-15 09:29:57 +00:00
prompt . clear ( ) ;
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
// if we have already generated some text, use it as a prompt to condition the next generation
2023-01-25 17:59:57 +00:00
if ( ! prompt_past . empty ( ) & & t_cur < 0.5f & & params . n_max_text_ctx > 0 ) {
2023-01-15 09:29:57 +00:00
int n_take = std : : min ( std : : min ( params . n_max_text_ctx , whisper_n_text_ctx ( ctx ) / 2 ) , int ( prompt_past . size ( ) ) ) ;
2022-12-02 19:51:50 +00:00
2023-01-15 09:29:57 +00:00
prompt = { whisper_token_prev ( ctx ) } ;
prompt . insert ( prompt . begin ( ) + 1 , prompt_past . end ( ) - n_take , prompt_past . end ( ) ) ;
}
// init new transcription with sot, language (opt) and task tokens
prompt . insert ( prompt . end ( ) , prompt_init . begin ( ) , prompt_init . end ( ) ) ;
// print the prompt
2023-01-25 17:59:57 +00:00
WHISPER_PRINT_DEBUG ( " \n \n " ) ;
for ( int i = 0 ; i < ( int ) prompt . size ( ) ; i + + ) {
WHISPER_PRINT_DEBUG ( " %s: prompt[%d] = %s \n " , __func__ , i , ctx - > vocab . id_to_token . at ( prompt [ i ] ) . c_str ( ) ) ;
}
WHISPER_PRINT_DEBUG ( " \n \n " ) ;
2023-01-15 09:29:57 +00:00
2023-03-05 19:42:19 +00:00
if ( ! whisper_decode_internal ( * ctx , * state , state - > decoders [ 0 ] , prompt . data ( ) , prompt . size ( ) , 0 , params . n_threads ) ) {
2023-01-15 09:29:57 +00:00
fprintf ( stderr , " %s: failed to decode \n " , __func__ ) ;
return - 7 ;
}
{
const int64_t t_start_sample_us = ggml_time_us ( ) ;
2023-03-05 19:42:19 +00:00
whisper_process_logits ( * ctx , * state , params , state - > decoders [ 0 ] , t_cur ) ;
2023-01-15 09:29:57 +00:00
2023-03-05 19:42:19 +00:00
state - > decoders [ 0 ] . kv_self . n + = prompt . size ( ) ;
2023-01-15 09:29:57 +00:00
for ( int j = 1 ; j < n_decoders_cur ; + + j ) {
2023-03-05 19:42:19 +00:00
auto & decoder = state - > decoders [ j ] ;
2023-01-15 09:29:57 +00:00
2023-03-05 19:42:19 +00:00
memcpy ( decoder . kv_self . k - > data , state - > decoders [ 0 ] . kv_self . k - > data , ggml_nbytes ( decoder . kv_self . k ) ) ;
memcpy ( decoder . kv_self . v - > data , state - > decoders [ 0 ] . kv_self . v - > data , ggml_nbytes ( decoder . kv_self . v ) ) ;
2023-01-15 09:29:57 +00:00
decoder . kv_self . n + = prompt . size ( ) ;
2023-03-05 19:42:19 +00:00
memcpy ( decoder . probs . data ( ) , state - > decoders [ 0 ] . probs . data ( ) , decoder . probs . size ( ) * sizeof ( decoder . probs [ 0 ] ) ) ;
memcpy ( decoder . logits . data ( ) , state - > decoders [ 0 ] . logits . data ( ) , decoder . logits . size ( ) * sizeof ( decoder . logits [ 0 ] ) ) ;
memcpy ( decoder . logprobs . data ( ) , state - > decoders [ 0 ] . logprobs . data ( ) , decoder . logprobs . size ( ) * sizeof ( decoder . logprobs [ 0 ] ) ) ;
2022-12-02 19:51:50 +00:00
}
2023-03-05 19:42:19 +00:00
state - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
2022-10-04 19:43:37 +00:00
}
2023-01-15 09:29:57 +00:00
}
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
for ( int i = 0 , n_max = whisper_n_text_ctx ( ctx ) / 2 - 4 ; i < n_max ; + + i ) {
const int64_t t_start_sample_us = ggml_time_us ( ) ;
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
// store the KV caches of all decoders when doing beam-search
if ( params . strategy = = whisper_sampling_strategy : : WHISPER_SAMPLING_BEAM_SEARCH ) {
kv_bufs . resize ( n_decoders_cur ) ;
for ( int j = 0 ; j < n_decoders_cur ; + + j ) {
2023-03-05 19:42:19 +00:00
auto & decoder = state - > decoders [ j ] ;
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
if ( decoder . completed | | decoder . failed ) {
continue ;
2022-10-17 21:53:06 +00:00
}
2022-11-11 20:33:10 +00:00
2023-01-15 09:29:57 +00:00
kv_bufs [ j ] . k . resize ( ggml_nbytes ( decoder . kv_self . k ) ) ;
kv_bufs [ j ] . v . resize ( ggml_nbytes ( decoder . kv_self . v ) ) ;
memcpy ( kv_bufs [ j ] . k . data ( ) , decoder . kv_self . k - > data , kv_bufs [ j ] . k . size ( ) ) ;
memcpy ( kv_bufs [ j ] . v . data ( ) , decoder . kv_self . v - > data , kv_bufs [ j ] . v . size ( ) ) ;
2022-11-20 18:45:10 +00:00
}
2022-11-11 20:33:10 +00:00
2023-01-15 09:29:57 +00:00
beam_candidates . clear ( ) ;
2022-10-04 19:43:37 +00:00
}
2022-10-21 12:57:20 +00:00
2023-01-15 09:29:57 +00:00
// generate new sequence candidates for each decoder
for ( int j = 0 ; j < n_decoders_cur ; + + j ) {
2023-03-05 19:42:19 +00:00
auto & decoder = state - > decoders [ j ] ;
2023-01-15 09:29:57 +00:00
if ( decoder . completed | | decoder . failed ) {
continue ;
}
switch ( params . strategy ) {
case whisper_sampling_strategy : : WHISPER_SAMPLING_GREEDY :
{
if ( t_cur < 1e-6 f ) {
2023-03-05 19:42:19 +00:00
decoder . sequence . tokens . push_back ( whisper_sample_token ( * ctx , * state , decoder , true ) ) ;
2023-01-15 09:29:57 +00:00
} else {
2023-03-05 19:42:19 +00:00
decoder . sequence . tokens . push_back ( whisper_sample_token ( * ctx , * state , decoder , false ) ) ;
2023-01-15 09:29:57 +00:00
}
decoder . sequence . sum_logprobs_all + = decoder . sequence . tokens . back ( ) . plog ;
} break ;
case whisper_sampling_strategy : : WHISPER_SAMPLING_BEAM_SEARCH :
{
2023-03-05 19:42:19 +00:00
const auto tokens_new = whisper_sample_token_topk ( * ctx , * state , decoder , params . beam_search . beam_size ) ;
2023-01-15 09:29:57 +00:00
for ( const auto & token : tokens_new ) {
beam_candidates . push_back ( { j , decoder . seek_delta , decoder . has_ts , decoder . sequence } ) ;
beam_candidates . back ( ) . sequence . tokens . push_back ( token ) ;
beam_candidates . back ( ) . sequence . sum_logprobs_all + = token . plog ;
//WHISPER_PRINT_DEBUG("%s: beam candidate: %s (%f, %f)\n", __func__, ctx->vocab.id_to_token.at(token.id).c_str(), token.plog, beam_candidates.back().sequence.sum_logprobs_all);
}
} break ;
} ;
2022-10-21 12:57:20 +00:00
}
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
// for beam-search, choose the top candidates and update the KV caches
if ( params . strategy = = whisper_sampling_strategy : : WHISPER_SAMPLING_BEAM_SEARCH ) {
std : : sort (
beam_candidates . begin ( ) ,
beam_candidates . end ( ) ,
[ ] ( const beam_candidate & a , const beam_candidate & b ) {
return a . sequence . sum_logprobs_all > b . sequence . sum_logprobs_all ;
} ) ;
2022-10-04 19:43:37 +00:00
2023-02-19 16:35:01 +00:00
uint32_t cur_c = 0 ;
2022-12-02 19:51:50 +00:00
2023-01-15 09:29:57 +00:00
for ( int j = 0 ; j < n_decoders_cur ; + + j ) {
2023-03-05 19:42:19 +00:00
auto & decoder = state - > decoders [ j ] ;
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
if ( decoder . completed | | decoder . failed ) {
continue ;
}
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
auto & cur = beam_candidates [ cur_c + + ] ;
2022-10-04 19:43:37 +00:00
2023-02-08 07:01:47 +00:00
while ( beam_candidates . size ( ) > cur_c & & beam_candidates [ cur_c ] . sequence . sum_logprobs_all = = cur . sequence . sum_logprobs_all & & i > 0 ) {
2023-01-15 09:29:57 +00:00
+ + cur_c ;
}
2022-10-04 19:43:37 +00:00
2023-01-15 09:29:57 +00:00
decoder . sequence = cur . sequence ;
decoder . seek_delta = cur . seek_delta ;
decoder . has_ts = cur . has_ts ;
2022-10-21 14:33:59 +00:00
2023-01-15 09:29:57 +00:00
memcpy ( decoder . kv_self . k - > data , kv_bufs [ cur . decoder_idx ] . k . data ( ) , kv_bufs [ cur . decoder_idx ] . k . size ( ) ) ;
memcpy ( decoder . kv_self . v - > data , kv_bufs [ cur . decoder_idx ] . v . data ( ) , kv_bufs [ cur . decoder_idx ] . v . size ( ) ) ;
WHISPER_PRINT_DEBUG ( " %s: beam search: decoder %d: from decoder %d: token = %10s, plog = %8.5f, sum_logprobs = %8.5f \n " ,
__func__ , j , cur . decoder_idx , ctx - > vocab . id_to_token . at ( decoder . sequence . tokens . back ( ) . id ) . c_str ( ) , decoder . sequence . tokens . back ( ) . plog , decoder . sequence . sum_logprobs_all ) ;
}
2022-10-04 19:43:37 +00:00
}
2023-01-15 09:29:57 +00:00
// update the decoder state
// - check if the sequence is completed
// - check if the sequence is failed
// - update sliding window based on timestamp tokens
for ( int j = 0 ; j < n_decoders_cur ; + + j ) {
2023-03-05 19:42:19 +00:00
auto & decoder = state - > decoders [ j ] ;
2023-01-15 09:29:57 +00:00
if ( decoder . completed | | decoder . failed ) {
continue ;
}
auto & has_ts = decoder . has_ts ;
auto & failed = decoder . failed ;
auto & completed = decoder . completed ;
auto & seek_delta = decoder . seek_delta ;
auto & result_len = decoder . sequence . result_len ;
{
const auto & token = decoder . sequence . tokens . back ( ) ;
// timestamp token - update sliding window
if ( token . id > whisper_token_beg ( ctx ) ) {
const int seek_delta_new = 2 * ( token . id - whisper_token_beg ( ctx ) ) ;
// do not allow to go back in time
if ( has_ts & & seek_delta > seek_delta_new & & result_len < i ) {
failed = true ; // TODO: maybe this is not a failure ?
continue ;
2022-10-04 20:16:33 +00:00
}
2023-01-15 09:29:57 +00:00
seek_delta = seek_delta_new ;
result_len = i + 1 ;
has_ts = true ;
2022-10-21 14:33:59 +00:00
}
2022-11-02 19:18:20 +00:00
2023-01-15 09:29:57 +00:00
# ifdef WHISPER_DEBUG
{
const auto tt = token . pt > 0.10 ? ctx - > vocab . id_to_token . at ( token . tid ) : " [?] " ;
WHISPER_PRINT_DEBUG ( " %s: id = %3d, decoder = %d, token = %6d, p = %6.3f, ts = %10s, %6.3f, result_len = %4d '%s' \n " ,
__func__ , i , j , token . id , token . p , tt . c_str ( ) , token . pt , result_len , ctx - > vocab . id_to_token . at ( token . id ) . c_str ( ) ) ;
}
# endif
2022-11-02 19:18:20 +00:00
2023-01-15 09:29:57 +00:00
// end of segment
if ( token . id = = whisper_token_eot ( ctx ) | | // end of text token
( params . max_tokens > 0 & & i > = params . max_tokens ) | | // max tokens per segment reached
( has_ts & & seek + seek_delta + 100 > = seek_end ) // end of audio reached
) {
if ( result_len = = 0 ) {
if ( seek + seek_delta + 100 > = seek_end ) {
result_len = i + 1 ;
} else {
failed = true ;
continue ;
}
}
2022-11-02 19:18:20 +00:00
2023-01-15 09:29:57 +00:00
if ( params . single_segment ) {
result_len = i + 1 ;
seek_delta = 100 * WHISPER_CHUNK_SIZE ;
2022-11-02 19:18:20 +00:00
}
2023-01-15 09:29:57 +00:00
completed = true ;
continue ;
2022-11-02 19:18:20 +00:00
}
2023-01-15 09:29:57 +00:00
// TESTS: if no tensors are loaded, it means we are running tests
if ( ctx - > model . n_loaded = = 0 ) {
seek_delta = 100 * WHISPER_CHUNK_SIZE ;
completed = true ;
continue ;
2022-10-22 18:06:50 +00:00
}
2022-10-04 19:43:37 +00:00
}
2023-01-15 09:29:57 +00:00
// sometimes, the decoding can get stuck in a repetition loop
// this is an attempt to mitigate such cases - we flag the decoding as failed and use a fallback strategy
if ( i = = n_max - 1 & & ( result_len = = 0 | | seek_delta < 100 * WHISPER_CHUNK_SIZE / 2 ) ) {
failed = true ;
continue ;
2022-10-04 19:43:37 +00:00
}
}
2023-01-15 09:29:57 +00:00
// check if all decoders have finished (i.e. completed or failed)
{
bool completed_all = true ;
2022-10-04 20:16:33 +00:00
2023-01-15 09:29:57 +00:00
for ( int j = 0 ; j < n_decoders_cur ; + + j ) {
2023-03-05 19:42:19 +00:00
auto & decoder = state - > decoders [ j ] ;
2022-11-12 16:03:49 +00:00
2023-01-15 09:29:57 +00:00
if ( decoder . completed | | decoder . failed ) {
continue ;
}
completed_all = false ;
}
if ( completed_all ) {
break ;
2022-10-04 20:16:33 +00:00
}
}
2023-03-05 19:42:19 +00:00
state - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
2023-01-15 09:29:57 +00:00
// obtain logits for the next token
for ( int j = 0 ; j < n_decoders_cur ; + + j ) {
2023-03-05 19:42:19 +00:00
auto & decoder = state - > decoders [ j ] ;
2023-01-15 09:29:57 +00:00
if ( decoder . failed | | decoder . completed ) {
continue ;
}
decoder . tokens_tmp . resize ( 1 ) ;
decoder . tokens_tmp [ 0 ] = decoder . sequence . tokens . back ( ) . id ;
//WHISPER_PRINT_DEBUG("%s: decoder %d: token %d, kv_self.n %d, seek_delta %d\n", __func__, j, decoder.tokens_tmp[0], decoder.kv_self.n, decoder.seek_delta);
2023-03-05 19:42:19 +00:00
if ( ! whisper_decode_internal ( * ctx , * state , decoder , decoder . tokens_tmp . data ( ) , decoder . tokens_tmp . size ( ) , decoder . kv_self . n , params . n_threads ) ) {
2023-01-15 09:29:57 +00:00
fprintf ( stderr , " %s: failed to decode \n " , __func__ ) ;
return - 8 ;
}
{
const int64_t t_start_sample_us = ggml_time_us ( ) ;
2023-03-05 19:42:19 +00:00
whisper_process_logits ( * ctx , * state , params , decoder , t_cur ) ;
2023-01-15 09:29:57 +00:00
+ + decoder . kv_self . n ;
2023-03-05 19:42:19 +00:00
state - > t_sample_us + = ggml_time_us ( ) - t_start_sample_us ;
2023-01-15 09:29:57 +00:00
}
2022-10-21 14:33:59 +00:00
}
2023-01-15 09:29:57 +00:00
}
// rank the resulting sequences and select the best one
{
double best_score = - INFINITY ;
for ( int j = 0 ; j < n_decoders_cur ; + + j ) {
2023-03-05 19:42:19 +00:00
auto & decoder = state - > decoders [ j ] ;
2023-01-15 09:29:57 +00:00
if ( decoder . failed ) {
continue ;
}
decoder . sequence . tokens . resize ( decoder . sequence . result_len ) ;
whisper_sequence_score ( params , decoder . sequence ) ;
2022-11-02 19:18:20 +00:00
2023-01-15 09:29:57 +00:00
WHISPER_PRINT_DEBUG ( " %s: decoder %2d: score = %8.5f, result_len = %3d, avg_logprobs = %8.5f, entropy = %8.5f \n " ,
__func__ , j , decoder . sequence . score , decoder . sequence . result_len , decoder . sequence . avg_logprobs , decoder . sequence . entropy ) ;
2022-11-02 19:18:20 +00:00
2023-01-18 20:52:18 +00:00
if ( decoder . sequence . result_len > 32 & & decoder . sequence . entropy < params . entropy_thold ) {
2023-01-15 09:29:57 +00:00
WHISPER_PRINT_DEBUG ( " %s: decoder %2d: failed due to entropy %8.5f < %8.5f \n " ,
__func__ , j , decoder . sequence . entropy , params . entropy_thold ) ;
2022-11-02 19:18:20 +00:00
2023-01-15 09:29:57 +00:00
decoder . failed = true ;
2023-03-05 19:42:19 +00:00
state - > n_fail_h + + ;
2023-01-15 09:29:57 +00:00
continue ;
}
if ( best_score < decoder . sequence . score ) {
best_score = decoder . sequence . score ;
best_decoder_id = j ;
2022-11-02 19:18:20 +00:00
}
}
2023-01-15 09:29:57 +00:00
WHISPER_PRINT_DEBUG ( " %s: best decoder = %d \n " , __func__ , best_decoder_id ) ;
}
// was the decoding successful for the current temperature?
2023-04-15 14:18:43 +00:00
// do fallback only if:
// - we are not at the last temperature
// - we are not at the end of the audio (3 sec)
if ( it ! = ( int ) temperatures . size ( ) - 1 & &
seek_end - seek > 10 * WHISPER_CHUNK_SIZE ) {
2023-01-15 09:29:57 +00:00
bool success = true ;
2023-03-05 19:42:19 +00:00
const auto & decoder = state - > decoders [ best_decoder_id ] ;
2023-01-15 09:29:57 +00:00
if ( decoder . failed | | decoder . sequence . avg_logprobs < params . logprob_thold ) {
success = false ;
2023-03-05 19:42:19 +00:00
state - > n_fail_p + + ;
2023-01-15 09:29:57 +00:00
}
if ( success ) {
//for (auto & token : ctx->decoders[best_decoder_id].sequence.tokens) {
// WHISPER_PRINT_DEBUG("%s: token = %d, p = %6.3f, pt = %6.3f, ts = %s, str = %s\n", __func__, token.id, token.p, token.pt, ctx->vocab.id_to_token.at(token.tid).c_str(), ctx->vocab.id_to_token.at(token.id).c_str());
//}
break ;
2022-10-22 18:06:50 +00:00
}
2022-10-04 19:43:37 +00:00
}
2023-01-15 09:29:57 +00:00
WHISPER_PRINT_DEBUG ( " \n %s: failed to decode with temperature = %.2f \n " , __func__ , t_cur ) ;
2022-10-04 19:43:37 +00:00
}
2023-01-15 09:29:57 +00:00
// output results through a user-provided callback
{
2023-03-05 19:42:19 +00:00
const auto & best_decoder = state - > decoders [ best_decoder_id ] ;
2023-01-15 09:29:57 +00:00
const auto seek_delta = best_decoder . seek_delta ;
const auto result_len = best_decoder . sequence . result_len ;
const auto & tokens_cur = best_decoder . sequence . tokens ;
//WHISPER_PRINT_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta);
// update prompt_past
prompt_past . clear ( ) ;
if ( prompt . front ( ) = = whisper_token_prev ( ctx ) ) {
prompt_past . insert ( prompt_past . end ( ) , prompt . begin ( ) + 1 , prompt . end ( ) - prompt_init . size ( ) ) ;
}
for ( int i = 0 ; i < result_len ; + + i ) {
prompt_past . push_back ( tokens_cur [ i ] . id ) ;
}
if ( ! tokens_cur . empty ( ) & & ctx - > model . n_loaded > 0 ) {
int i0 = 0 ;
auto t0 = seek + 2 * ( tokens_cur . front ( ) . tid - whisper_token_beg ( ctx ) ) ;
std : : string text ;
2023-07-04 06:45:00 +00:00
bool speaker_turn_next = false ;
2023-01-15 09:29:57 +00:00
for ( int i = 0 ; i < ( int ) tokens_cur . size ( ) ; i + + ) {
//printf("%s: %18s %6.3f %18s %6.3f\n", __func__,
// ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p,
// ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt);
2023-07-04 06:45:00 +00:00
if ( params . print_special | | tokens_cur [ i ] . id < whisper_token_eot ( ctx ) ) {
2023-01-15 09:29:57 +00:00
text + = whisper_token_to_str ( ctx , tokens_cur [ i ] . id ) ;
}
2023-07-04 06:45:00 +00:00
// [TDRZ] record if speaker turn was predicted after current segment
if ( params . tdrz_enable & & tokens_cur [ i ] . id = = whisper_token_solm ( ctx ) ) {
speaker_turn_next = true ;
}
2023-01-15 09:29:57 +00:00
if ( tokens_cur [ i ] . id > whisper_token_beg ( ctx ) & & ! params . single_segment ) {
const auto t1 = seek + 2 * ( tokens_cur [ i ] . tid - whisper_token_beg ( ctx ) ) ;
2023-01-18 19:00:41 +00:00
2023-01-15 09:29:57 +00:00
if ( ! text . empty ( ) ) {
const auto tt0 = params . speed_up ? 2 * t0 : t0 ;
const auto tt1 = params . speed_up ? 2 * t1 : t1 ;
if ( params . print_realtime ) {
if ( params . print_timestamps ) {
printf ( " [%s --> %s] %s \n " , to_timestamp ( tt0 ) . c_str ( ) , to_timestamp ( tt1 ) . c_str ( ) , text . c_str ( ) ) ;
} else {
printf ( " %s " , text . c_str ( ) ) ;
fflush ( stdout ) ;
}
}
//printf("tt0 = %d, tt1 = %d, text = %s, token = %s, token_id = %d, tid = %d\n", tt0, tt1, text.c_str(), ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].id, tokens_cur[i].tid);
2023-07-04 06:45:00 +00:00
result_all . push_back ( { tt0 , tt1 , text , { } , speaker_turn_next } ) ;
2023-01-15 09:29:57 +00:00
for ( int j = i0 ; j < = i ; j + + ) {
result_all . back ( ) . tokens . push_back ( tokens_cur [ j ] ) ;
}
int n_new = 1 ;
if ( params . token_timestamps ) {
whisper_exp_compute_token_level_timestamps (
2023-03-05 19:42:19 +00:00
* ctx , * state , result_all . size ( ) - 1 , params . thold_pt , params . thold_ptsum ) ;
2023-01-15 09:29:57 +00:00
if ( params . max_len > 0 ) {
2023-03-05 19:42:19 +00:00
n_new = whisper_wrap_segment ( * ctx , * state , params . max_len , params . split_on_word ) ;
2023-01-15 09:29:57 +00:00
}
}
if ( params . new_segment_callback ) {
2023-03-05 19:42:19 +00:00
params . new_segment_callback ( ctx , state , n_new , params . new_segment_callback_user_data ) ;
2023-01-15 09:29:57 +00:00
}
}
text = " " ;
while ( i < ( int ) tokens_cur . size ( ) & & tokens_cur [ i ] . id > whisper_token_beg ( ctx ) ) {
i + + ;
}
i - - ;
t0 = t1 ;
i0 = i + 1 ;
2023-07-04 06:45:00 +00:00
speaker_turn_next = false ;
2023-01-15 09:29:57 +00:00
}
}
if ( ! text . empty ( ) ) {
const auto t1 = seek + seek_delta ;
const auto tt0 = params . speed_up ? 2 * t0 : t0 ;
const auto tt1 = params . speed_up ? 2 * t1 : t1 ;
if ( params . print_realtime ) {
if ( params . print_timestamps ) {
printf ( " [%s --> %s] %s \n " , to_timestamp ( tt0 ) . c_str ( ) , to_timestamp ( tt1 ) . c_str ( ) , text . c_str ( ) ) ;
} else {
printf ( " %s " , text . c_str ( ) ) ;
fflush ( stdout ) ;
}
}
2023-07-04 06:45:00 +00:00
result_all . push_back ( { tt0 , tt1 , text , { } , speaker_turn_next } ) ;
2023-01-15 09:29:57 +00:00
for ( int j = i0 ; j < ( int ) tokens_cur . size ( ) ; j + + ) {
result_all . back ( ) . tokens . push_back ( tokens_cur [ j ] ) ;
}
int n_new = 1 ;
if ( params . token_timestamps ) {
whisper_exp_compute_token_level_timestamps (
2023-03-05 19:42:19 +00:00
* ctx , * state , result_all . size ( ) - 1 , params . thold_pt , params . thold_ptsum ) ;
2023-01-15 09:29:57 +00:00
if ( params . max_len > 0 ) {
2023-03-05 19:42:19 +00:00
n_new = whisper_wrap_segment ( * ctx , * state , params . max_len , params . split_on_word ) ;
2023-01-15 09:29:57 +00:00
}
}
if ( params . new_segment_callback ) {
2023-03-05 19:42:19 +00:00
params . new_segment_callback ( ctx , state , n_new , params . new_segment_callback_user_data ) ;
2023-01-15 09:29:57 +00:00
}
}
}
// update audio window
seek + = seek_delta ;
WHISPER_PRINT_DEBUG ( " seek = %d, seek_delta = %d \n " , seek , seek_delta ) ;
}
2022-10-04 19:43:37 +00:00
}
return 0 ;
}
2023-03-05 19:42:19 +00:00
int whisper_full (
struct whisper_context * ctx ,
struct whisper_full_params params ,
const float * samples ,
int n_samples ) {
return whisper_full_with_state ( ctx , ctx - > state , params , samples , n_samples ) ;
}
2022-10-29 09:24:02 +00:00
int whisper_full_parallel (
struct whisper_context * ctx ,
struct whisper_full_params params ,
const float * samples ,
int n_samples ,
2022-11-26 15:28:28 +00:00
int n_processors ) {
2022-10-29 09:24:02 +00:00
if ( n_processors = = 1 ) {
return whisper_full ( ctx , params , samples , n_samples ) ;
}
int ret = 0 ;
2023-03-05 19:42:19 +00:00
// prepare separate states for each thread
std : : vector < whisper_state * > states ;
2022-10-29 09:24:02 +00:00
const int offset_samples = ( WHISPER_SAMPLE_RATE * params . offset_ms ) / 1000 ;
const int n_samples_per_processor = ( n_samples - offset_samples ) / n_processors ;
// the calling thread will process the first chunk
// while the other threads will process the remaining chunks
std : : vector < std : : thread > workers ( n_processors - 1 ) ;
for ( int i = 0 ; i < n_processors - 1 ; + + i ) {
2023-03-05 19:42:19 +00:00
// create a new state for each thread
states . push_back ( whisper_init_state ( ctx ) ) ;
2022-10-29 09:24:02 +00:00
const int start_samples = offset_samples + ( i + 1 ) * n_samples_per_processor ;
const int n_samples_cur = ( i = = n_processors - 2 ) ? n_samples - start_samples : n_samples_per_processor ;
auto params_cur = params ;
params_cur . offset_ms = 0 ;
params_cur . print_progress = false ;
params_cur . print_realtime = false ;
params_cur . new_segment_callback = nullptr ;
params_cur . new_segment_callback_user_data = nullptr ;
2023-03-30 17:29:29 +00:00
params_cur . progress_callback = nullptr ;
params_cur . progress_callback_user_data = nullptr ;
2023-03-05 19:42:19 +00:00
workers [ i ] = std : : thread ( whisper_full_with_state , ctx , states [ i ] , std : : move ( params_cur ) , samples + start_samples , n_samples_cur ) ;
2022-10-29 09:24:02 +00:00
}
{
auto params_cur = params ;
2023-03-05 19:42:19 +00:00
// We need to disable the print real-time for this one as well, otherwise it will show only for the first chunk.
params_cur . print_realtime = false ;
// Run the first transformation using default state but only for the first chunk.
ret = whisper_full_with_state ( ctx , ctx - > state , std : : move ( params_cur ) , samples , offset_samples + n_samples_per_processor ) ;
2022-10-29 09:24:02 +00:00
}
for ( int i = 0 ; i < n_processors - 1 ; + + i ) {
workers [ i ] . join ( ) ;
}
const int64_t offset_t = ( int64_t ) params . offset_ms / 10.0 ;
2023-03-05 19:42:19 +00:00
// combine results into result_state->result_all from all other states
2022-10-29 09:24:02 +00:00
for ( int i = 0 ; i < n_processors - 1 ; + + i ) {
2023-03-05 19:42:19 +00:00
auto & results_i = states [ i ] - > result_all ;
2022-10-29 09:24:02 +00:00
2023-03-05 19:42:19 +00:00
for ( auto & result : results_i ) {
2022-10-29 11:08:23 +00:00
// correct the segment timestamp taking into account the offset
2023-03-05 19:42:19 +00:00
result . t0 + = 100 * ( ( i + 1 ) * n_samples_per_processor ) / WHISPER_SAMPLE_RATE + offset_t ;
result . t1 + = 100 * ( ( i + 1 ) * n_samples_per_processor ) / WHISPER_SAMPLE_RATE + offset_t ;
2022-10-29 09:24:02 +00:00
2022-10-29 11:08:23 +00:00
// make sure that segments are not overlapping
2023-03-05 19:42:19 +00:00
if ( ! ctx - > state - > result_all . empty ( ) ) {
result . t0 = std : : max ( result . t0 , ctx - > state - > result_all . back ( ) . t1 ) ;
2022-10-29 09:24:02 +00:00
}
2023-03-05 19:42:19 +00:00
ctx - > state - > result_all . push_back ( std : : move ( result ) ) ;
2022-10-29 09:24:02 +00:00
// call the new_segment_callback for each segment
if ( params . new_segment_callback ) {
2023-03-05 19:42:19 +00:00
params . new_segment_callback ( ctx , ctx - > state , 1 , params . new_segment_callback_user_data ) ;
2022-10-29 09:24:02 +00:00
}
}
2022-10-29 11:08:23 +00:00
2023-03-05 19:42:19 +00:00
ctx - > state - > t_mel_us + = states [ i ] - > t_mel_us ;
2023-01-15 09:29:57 +00:00
2023-03-05 19:42:19 +00:00
ctx - > state - > t_sample_us + = states [ i ] - > t_sample_us ;
ctx - > state - > t_encode_us + = states [ i ] - > t_encode_us ;
ctx - > state - > t_decode_us + = states [ i ] - > t_decode_us ;
2023-01-15 09:29:57 +00:00
2023-03-05 19:42:19 +00:00
whisper_free_state ( states [ i ] ) ;
2022-10-29 11:08:23 +00:00
}
// average the timings
2023-03-05 19:42:19 +00:00
ctx - > state - > t_mel_us / = n_processors ;
ctx - > state - > t_sample_us / = n_processors ;
ctx - > state - > t_encode_us / = n_processors ;
ctx - > state - > t_decode_us / = n_processors ;
2022-10-29 11:08:23 +00:00
// print information about the audio boundaries
fprintf ( stderr , " \n " ) ;
fprintf ( stderr , " %s: the audio has been split into %d chunks at the following times: \n " , __func__ , n_processors ) ;
for ( int i = 0 ; i < n_processors - 1 ; + + i ) {
fprintf ( stderr , " %s: split %d - %s \n " , __func__ , ( i + 1 ) , to_timestamp ( 100 * ( ( i + 1 ) * n_samples_per_processor ) / WHISPER_SAMPLE_RATE + offset_t ) . c_str ( ) ) ;
2022-10-29 09:24:02 +00:00
}
2022-10-29 11:08:23 +00:00
fprintf ( stderr , " %s: the transcription quality may be degraded near these boundaries \n " , __func__ ) ;
2022-10-29 09:24:02 +00:00
return ret ;
}
2023-03-05 19:42:19 +00:00
int whisper_full_n_segments_from_state ( struct whisper_state * state ) {
return state - > result_all . size ( ) ;
}
2022-10-04 19:43:37 +00:00
int whisper_full_n_segments ( struct whisper_context * ctx ) {
2023-03-05 19:42:19 +00:00
return ctx - > state - > result_all . size ( ) ;
}
int whisper_full_lang_id_from_state ( struct whisper_state * state ) {
return state - > lang_id ;
2022-10-04 19:43:37 +00:00
}
2023-02-05 12:46:26 +00:00
int whisper_full_lang_id ( struct whisper_context * ctx ) {
2023-03-05 19:42:19 +00:00
return ctx - > state - > lang_id ;
}
int64_t whisper_full_get_segment_t0_from_state ( struct whisper_state * state , int i_segment ) {
return state - > result_all [ i_segment ] . t0 ;
2023-02-05 12:46:26 +00:00
}
2022-10-04 19:43:37 +00:00
int64_t whisper_full_get_segment_t0 ( struct whisper_context * ctx , int i_segment ) {
2023-03-05 19:42:19 +00:00
return ctx - > state - > result_all [ i_segment ] . t0 ;
}
int64_t whisper_full_get_segment_t1_from_state ( struct whisper_state * state , int i_segment ) {
return state - > result_all [ i_segment ] . t1 ;
2022-10-04 19:43:37 +00:00
}
int64_t whisper_full_get_segment_t1 ( struct whisper_context * ctx , int i_segment ) {
2023-03-05 19:42:19 +00:00
return ctx - > state - > result_all [ i_segment ] . t1 ;
}
2023-07-04 06:45:00 +00:00
bool whisper_full_get_segment_speaker_turn_next ( struct whisper_context * ctx , int i_segment ) {
return ctx - > state - > result_all [ i_segment ] . speaker_turn_next ;
}
2023-03-05 19:42:19 +00:00
const char * whisper_full_get_segment_text_from_state ( struct whisper_state * state , int i_segment ) {
return state - > result_all [ i_segment ] . text . c_str ( ) ;
2022-10-04 19:43:37 +00:00
}
const char * whisper_full_get_segment_text ( struct whisper_context * ctx , int i_segment ) {
2023-03-05 19:42:19 +00:00
return ctx - > state - > result_all [ i_segment ] . text . c_str ( ) ;
}
int whisper_full_n_tokens_from_state ( struct whisper_state * state , int i_segment ) {
return state - > result_all [ i_segment ] . tokens . size ( ) ;
2022-10-04 19:43:37 +00:00
}
2022-10-21 14:33:59 +00:00
int whisper_full_n_tokens ( struct whisper_context * ctx , int i_segment ) {
2023-03-05 19:42:19 +00:00
return ctx - > state - > result_all [ i_segment ] . tokens . size ( ) ;
2022-10-21 14:33:59 +00:00
}
2023-03-05 19:42:19 +00:00
const char * whisper_full_get_token_text_from_state ( struct whisper_context * ctx , struct whisper_state * state , int i_segment , int i_token ) {
return ctx - > vocab . id_to_token [ state - > result_all [ i_segment ] . tokens [ i_token ] . id ] . c_str ( ) ;
}
const char * whisper_full_get_token_text ( struct whisper_context * ctx , int i_segment , int i_token ) {
return ctx - > vocab . id_to_token [ ctx - > state - > result_all [ i_segment ] . tokens [ i_token ] . id ] . c_str ( ) ;
}
// id field of token i_token of segment i_segment in state->result_all.
// Neither index is range-checked.
whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token) {
    const auto & token = state->result_all[i_segment].tokens[i_token];

    return token.id;
}
2022-10-22 18:06:50 +00:00
// id field of token i_token of segment i_segment in the result list of the context's own state.
// Neither index is range-checked.
whisper_token whisper_full_get_token_id(struct whisper_context * ctx, int i_segment, int i_token) {
    return whisper_full_get_token_id_from_state(ctx->state, i_segment, i_token);
}
struct whisper_token_data whisper_full_get_token_data_from_state ( struct whisper_state * state , int i_segment , int i_token ) {
return state - > result_all [ i_segment ] . tokens [ i_token ] ;
2022-10-22 18:06:50 +00:00
}
2022-10-30 08:05:58 +00:00
struct whisper_token_data whisper_full_get_token_data ( struct whisper_context * ctx , int i_segment , int i_token ) {
2023-03-05 19:42:19 +00:00
return ctx - > state - > result_all [ i_segment ] . tokens [ i_token ] ;
}
float whisper_full_get_token_p_from_state ( struct whisper_state * state , int i_segment , int i_token ) {
return state - > result_all [ i_segment ] . tokens [ i_token ] . p ;
2022-10-30 08:05:58 +00:00
}
2022-10-21 14:33:59 +00:00
float whisper_full_get_token_p ( struct whisper_context * ctx , int i_segment , int i_token ) {
2023-03-05 19:42:19 +00:00
return ctx - > state - > result_all [ i_segment ] . tokens [ i_token ] . p ;
2022-10-21 14:33:59 +00:00
}
2022-10-25 17:18:26 +00:00
2022-11-02 19:18:20 +00:00
// =================================================================================================
2023-01-18 19:00:41 +00:00
//
// Temporary interface needed for exposing ggml interface
// Will be removed in the future when ggml becomes a separate library
//
// Write the report produced by whisper_bench_memcpy_str() to stderr. Always returns 0.
WHISPER_API int whisper_bench_memcpy(int n_threads) {
    const char * report = whisper_bench_memcpy_str(n_threads);

    fputs(report, stderr);

    return 0;
}
// Measure single-threaded memcpy() throughput and return a textual report.
//
// Two heap buffers are allocated, one warm-up copy is made, and then 20 timed copies are
// performed; the report contains the resulting GB/s figure followed by a checksum of the
// destination buffer (the checksum exists only to keep the copies from being optimized away).
//
// n_threads - only used to keep the buffer size from being a compile-time constant:
//             any value > 0 selects a 1024*1024*1024 byte buffer
//
// The report is assembled in a function-local static std::string, so the returned pointer
// is overwritten by the next call and the function must not be called concurrently.
// If the buffers cannot be allocated (or the computed size is 0), the report contains a
// single failure line instead of the measurements.
WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
    static std::string s;
    s = " ";
    char strbuf[256];

    ggml_time_init();

    size_t n   = 20;
    size_t arr = n_threads > 0 ? 1024llu : n_threads; // trick to avoid compiler optimizations

    // 1 GiB array (when n_threads > 0)
    const size_t size = arr*1024llu*1024llu;

    // single-thread
    {
        char * src = (char *) malloc(size);
        char * dst = (char *) malloc(size);

        // bail out instead of dereferencing a null pointer or evaluating rand() % 0 below
        if (size == 0 || src == nullptr || dst == nullptr) {
            free(src);
            free(dst);

            snprintf(strbuf, sizeof(strbuf), "memcpy: failed to allocate 2 x %zu bytes\n", size);
            s += strbuf;

            return s.c_str();
        }

        for (size_t i = 0; i < size; i++) src[i] = i;

        memcpy(dst, src, size); // heat-up

        double tsum = 0.0;
        double sum  = 0.0;

        for (size_t i = 0; i < n; i++) {
            const int64_t t0 = ggml_time_us();

            memcpy(dst, src, size);

            const int64_t t1 = ggml_time_us();

            tsum += (t1 - t0)*1e-6;

            // perturb the source between iterations so that the copies are not identical
            src[rand() % size] = rand() % 256;
        }

        snprintf(strbuf, sizeof(strbuf), " memcpy: %.2f GB/s (1 thread) \n ", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
        s += strbuf;

        // needed to prevent the compiler from optimizing the memcpy away
        {
            for (size_t i = 0; i < size; i++) sum += dst[i];

            snprintf(strbuf, sizeof(strbuf), " sum: %f \n ", sum);
            s += strbuf;
        }

        free(src);
        free(dst);
    }

    return s.c_str();
}
// Runs the ggml_mul_mat benchmark and writes its textual report to stderr.
// Always returns 0.
WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
    const char * report = whisper_bench_ggml_mul_mat_str(n_threads);

    fputs(report, stderr);

    return 0;
}
// Benchmarks ggml_mul_mat on square N x N matrices for a fixed list of sizes and
// for every weight type Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, F16 and F32 (the second
// operand is always F32), using n_threads threads for the graph computation.
//
// For each size three lines are appended to the report, giving the measured
// GFLOPS and the number of timed runs per weight type.
//
// The returned pointer refers to a function-local static string: it stays valid
// until the next call of this function, which overwrites it.
WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
    static std::string report;
    report = "";

    char line[256];

    ggml_time_init();

    // upper bound on the number of timed runs per (size, type) pair
    const int max_runs = 128;

    const std::vector<size_t> sizes = {
        64, 128, 256, 512, 1024, 2048, 4096,
    };

    const size_t N_max = sizes.back();

    // a: N*N*sizeof(float)
    // b: N*N*sizeof(float)
    // c: N*N*sizeof(float)
    // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);

    // fill the buffer with a repeating byte pattern so the operands are not all zero
    for (size_t i = 0; i < buf.size(); i++) buf[i] = i;

    // weight types, in the order in which they are benchmarked
    enum { I_Q4_0 = 0, I_Q4_1, I_Q5_0, I_Q5_1, I_Q8_0, I_F16, I_F32, N_WTYPES };

    const ggml_type wtypes[N_WTYPES] = {
        GGML_TYPE_Q4_0,
        GGML_TYPE_Q4_1,
        GGML_TYPE_Q5_0,
        GGML_TYPE_Q5_1,
        GGML_TYPE_Q8_0,
        GGML_TYPE_F16,
        GGML_TYPE_F32,
    };

    for (size_t is = 0; is < sizes.size(); ++is) {
        const size_t N = sizes[is];

        int    n_runs[N_WTYPES] = {}; // timed runs per type
        double gflops[N_WTYPES] = {}; // GFLOPS per type

        for (int k = 0; k < N_WTYPES; ++k) {
            // every measurement gets a fresh context on top of the same pre-filled buffer
            struct ggml_init_params gparams = {
                /*.mem_size   =*/ buf.size(),
                /*.mem_buffer =*/ buf.data(),
                /*.no_alloc   =*/ false,
            };

            struct ggml_context * ctx0 = ggml_init(gparams);

            struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtypes[k],     N, N);
            struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);

            struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);

            struct ggml_cgraph gf = ggml_build_forward(c);

            gf.n_threads = n_threads;

            double elapsed = 0.0; // seconds spent in the timed runs

            // heat-up
            ggml_graph_compute(ctx0, &gf);

            int & runs = n_runs[k];

            for (int i = 0; i < max_runs; ++i) {
                const int64_t t_start = ggml_time_us();

                ggml_graph_compute(ctx0, &gf);

                const int64_t t_end = ggml_time_us();

                elapsed += (t_end - t_start)*1e-6;
                runs++;

                // stop once more than 1 second has been measured over at least 3 runs
                if (elapsed > 1.0 && runs >= 3) {
                    break;
                }
            }

            ggml_free(ctx0);

            // 2*N^3 floating point operations per multiplication
            gflops[k] = ((2.0*N*N*N*runs)/elapsed)*1e-9;
        }

        // Q4_0 | Q4_1
        snprintf(line, sizeof(line), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n",
                N, N, gflops[I_Q4_0], n_runs[I_Q4_0], gflops[I_Q4_1], n_runs[I_Q4_1]);
        report += line;

        // Q5_0 | Q5_1 | Q8_0
        snprintf(line, sizeof(line), "%4zu x %4zu: Q5_0 %7.1f GFLOPS (%3d runs) | Q5_1 %7.1f GFLOPS (%3d runs) | Q8_0 %7.1f GFLOPS (%3d runs)\n",
                N, N, gflops[I_Q5_0], n_runs[I_Q5_0], gflops[I_Q5_1], n_runs[I_Q5_1], gflops[I_Q8_0], n_runs[I_Q8_0]);
        report += line;

        // F16 | F32
        snprintf(line, sizeof(line), "%4zu x %4zu: F16  %7.1f GFLOPS (%3d runs) | F32  %7.1f GFLOPS (%3d runs)\n",
                N, N, gflops[I_F16], n_runs[I_F16], gflops[I_F32], n_runs[I_F32]);
        report += line;
    }

    return report.c_str();
}
// =================================================================================================
// =================================================================================================
2022-11-02 19:18:20 +00:00
//
// Experimental stuff below
//
// Not sure if these should be part of the library at all, because the quality of the results is not
// guaranteed. Might get removed at some point unless a robust algorithm implementation is found
//
// =================================================================================================
//
// token-level timestamps
//
static int timestamp_to_sample ( int64_t t , int n_samples ) {
return std : : max ( 0 , std : : min ( ( int ) n_samples - 1 , ( int ) ( ( t * WHISPER_SAMPLE_RATE ) / 100 ) ) ) ;
}
static int64_t sample_to_timestamp ( int i_sample ) {
2023-01-08 13:08:45 +00:00
return ( 100ll * i_sample ) / WHISPER_SAMPLE_RATE ;
2022-11-02 19:18:20 +00:00
}
// Heuristic cost that grows with the time it takes to pronounce `text`.
// Every character contributes a fixed weight:
//   space           -> 0.01
//   ','             -> 2
//   '.', '!', '?'   -> 3
//   digit '0'..'9'  -> 3
//   anything else   -> 1
// obviously, can be improved
static float voice_length(const std::string & text) {
    float total = 0.0f;

    for (const char ch : text) {
        switch (ch) {
            case ' ':
                total += 0.01f;
                break;
            case ',':
                total += 2.00f;
                break;
            case '.':
            case '!':
            case '?':
                total += 3.00f;
                break;
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
                total += 3.00f;
                break;
            default:
                total += 1.00f;
                break;
        }
    }

    return total;
}
// Moving average of |signal| over a window of 2*n_samples_per_half_window + 1
// samples centered on each sample.
//
// Samples that fall outside [0, n_samples) contribute nothing to the sum, but the
// divisor is always the full window length, so values near the edges are scaled down.
// Returns one value per input sample.
static std::vector<float> get_signal_energy(const float * signal, int n_samples, int n_samples_per_half_window) {
    const int hw = n_samples_per_half_window;

    std::vector<float> energy(n_samples);

    for (int i = 0; i < n_samples; i++) {
        // clamp the window to the valid sample range up front
        const int j_first = std::max(0, i - hw);
        const int j_last  = std::min(n_samples - 1, i + hw);

        float acc = 0;
        for (int j = j_first; j <= j_last; j++) {
            acc += fabs(signal[j]);
        }

        energy[i] = acc/(2*hw + 1);
    }

    return energy;
}
static void whisper_exp_compute_token_level_timestamps (
2023-01-15 09:29:57 +00:00
struct whisper_context & ctx ,
2023-03-05 19:42:19 +00:00
struct whisper_state & state ,
2023-01-15 09:29:57 +00:00
int i_segment ,
float thold_pt ,
float thold_ptsum ) {
2023-03-05 19:42:19 +00:00
auto & segment = state . result_all [ i_segment ] ;
2022-11-02 19:18:20 +00:00
auto & tokens = segment . tokens ;
2023-03-05 19:42:19 +00:00
const int n_samples = state . energy . size ( ) ;
2022-11-02 19:18:20 +00:00
if ( n_samples = = 0 ) {
fprintf ( stderr , " %s: no signal data available \n " , __func__ ) ;
return ;
}
const int64_t t0 = segment . t0 ;
const int64_t t1 = segment . t1 ;
const int n = tokens . size ( ) ;
if ( n = = 0 ) {
return ;
}
if ( n = = 1 ) {
tokens [ 0 ] . t0 = t0 ;
tokens [ 0 ] . t1 = t1 ;
return ;
}
2023-03-05 19:42:19 +00:00
auto & t_beg = state . t_beg ;
auto & t_last = state . t_last ;
auto & tid_last = state . tid_last ;
2022-11-02 19:18:20 +00:00
for ( int j = 0 ; j < n ; + + j ) {
auto & token = tokens [ j ] ;
if ( j = = 0 ) {
2023-01-15 09:29:57 +00:00
if ( token . id = = whisper_token_beg ( & ctx ) ) {
2022-11-02 19:18:20 +00:00
tokens [ j ] . t0 = t0 ;
tokens [ j ] . t1 = t0 ;
tokens [ j + 1 ] . t0 = t0 ;
t_beg = t0 ;
t_last = t0 ;
2023-01-15 09:29:57 +00:00
tid_last = whisper_token_beg ( & ctx ) ;
2022-11-02 19:18:20 +00:00
} else {
tokens [ j ] . t0 = t_last ;
}
}
2023-01-15 09:29:57 +00:00
const int64_t tt = t_beg + 2 * ( token . tid - whisper_token_beg ( & ctx ) ) ;
2022-11-02 19:18:20 +00:00
tokens [ j ] . id = token . id ;
tokens [ j ] . tid = token . tid ;
tokens [ j ] . p = token . p ;
tokens [ j ] . pt = token . pt ;
tokens [ j ] . ptsum = token . ptsum ;
2023-01-15 09:29:57 +00:00
tokens [ j ] . vlen = voice_length ( whisper_token_to_str ( & ctx , token . id ) ) ;
2022-11-02 19:18:20 +00:00
if ( token . pt > thold_pt & & token . ptsum > thold_ptsum & & token . tid > tid_last & & tt < = t1 ) {
if ( j > 0 ) {
tokens [ j - 1 ] . t1 = tt ;
}
tokens [ j ] . t0 = tt ;
tid_last = token . tid ;
}
}
tokens [ n - 2 ] . t1 = t1 ;
tokens [ n - 1 ] . t0 = t1 ;
tokens [ n - 1 ] . t1 = t1 ;
t_last = t1 ;
// find intervals of tokens with unknown timestamps
// fill the timestamps by proportionally splitting the interval based on the token voice lengths
{
int p0 = 0 ;
int p1 = 0 ;
while ( true ) {
while ( p1 < n & & tokens [ p1 ] . t1 < 0 ) {
p1 + + ;
}
if ( p1 > = n ) {
p1 - - ;
}
2023-01-15 09:29:57 +00:00
//printf("p0=%d p1=%d t0=%lld t1=%lld\n", p0, p1, tokens[p0].t0, tokens[p1].t1);
2022-11-02 19:18:20 +00:00
if ( p1 > p0 ) {
double psum = 0.0 ;
for ( int j = p0 ; j < = p1 ; j + + ) {
psum + = tokens [ j ] . vlen ;
}
//printf("analyzing %d - %d, psum = %f\n", p0, p1, psum);
const double dt = tokens [ p1 ] . t1 - tokens [ p0 ] . t0 ;
// split the time proportionally to the voice length
for ( int j = p0 + 1 ; j < = p1 ; j + + ) {
const double ct = tokens [ j - 1 ] . t0 + dt * tokens [ j - 1 ] . vlen / psum ;
tokens [ j - 1 ] . t1 = ct ;
tokens [ j ] . t0 = ct ;
}
}
p1 + + ;
p0 = p1 ;
if ( p1 > = n ) {
break ;
}
}
}
// fix up (just in case)
for ( int j = 0 ; j < n - 1 ; j + + ) {
if ( tokens [ j ] . t1 < 0 ) {
tokens [ j + 1 ] . t0 = tokens [ j ] . t1 ;
}
if ( j > 0 ) {
if ( tokens [ j - 1 ] . t1 > tokens [ j ] . t0 ) {
tokens [ j ] . t0 = tokens [ j - 1 ] . t1 ;
tokens [ j ] . t1 = std : : max ( tokens [ j ] . t0 , tokens [ j ] . t1 ) ;
}
}
}
// VAD
// expand or contract tokens based on voice activity
{
const int hw = WHISPER_SAMPLE_RATE / 8 ;
for ( int j = 0 ; j < n ; j + + ) {
2023-01-15 09:29:57 +00:00
if ( tokens [ j ] . id > = whisper_token_eot ( & ctx ) ) {
2022-11-02 19:18:20 +00:00
continue ;
}
int s0 = timestamp_to_sample ( tokens [ j ] . t0 , n_samples ) ;
int s1 = timestamp_to_sample ( tokens [ j ] . t1 , n_samples ) ;
const int ss0 = std : : max ( s0 - hw , 0 ) ;
const int ss1 = std : : min ( s1 + hw , n_samples ) ;
const int ns = ss1 - ss0 ;
float sum = 0.0f ;
for ( int k = ss0 ; k < ss1 ; k + + ) {
2023-03-05 19:42:19 +00:00
sum + = state . energy [ k ] ;
2022-11-02 19:18:20 +00:00
}
const float thold = 0.5 * sum / ns ;
{
int k = s0 ;
2023-03-05 19:42:19 +00:00
if ( state . energy [ k ] > thold & & j > 0 ) {
while ( k > 0 & & state . energy [ k ] > thold ) {
2022-11-02 19:18:20 +00:00
k - - ;
}
tokens [ j ] . t0 = sample_to_timestamp ( k ) ;
if ( tokens [ j ] . t0 < tokens [ j - 1 ] . t1 ) {
tokens [ j ] . t0 = tokens [ j - 1 ] . t1 ;
} else {
s0 = k ;
}
} else {
2023-03-05 19:42:19 +00:00
while ( state . energy [ k ] < thold & & k < s1 ) {
2022-11-02 19:18:20 +00:00
k + + ;
}
s0 = k ;
tokens [ j ] . t0 = sample_to_timestamp ( k ) ;
}
}
{
int k = s1 ;
2023-03-05 19:42:19 +00:00
if ( state . energy [ k ] > thold ) {
while ( k < n_samples - 1 & & state . energy [ k ] > thold ) {
2022-11-02 19:18:20 +00:00
k + + ;
}
tokens [ j ] . t1 = sample_to_timestamp ( k ) ;
if ( j < ns - 1 & & tokens [ j ] . t1 > tokens [ j + 1 ] . t0 ) {
tokens [ j ] . t1 = tokens [ j + 1 ] . t0 ;
} else {
s1 = k ;
}
} else {
2023-03-05 19:42:19 +00:00
while ( state . energy [ k ] < thold & & k > s0 ) {
2022-11-02 19:18:20 +00:00
k - - ;
}
s1 = k ;
tokens [ j ] . t1 = sample_to_timestamp ( k ) ;
}
}
}
}
// fixed token expand (optional)
//{
// const int t_expand = 0;
// for (int j = 0; j < n; j++) {
// if (j > 0) {
// tokens[j].t0 = std::max(0, (int) (tokens[j].t0 - t_expand));
// }
// if (j < n - 1) {
// tokens[j].t1 = tokens[j].t1 + t_expand;
// }
// }
//}
// debug info
//for (int j = 0; j < n; ++j) {
// const auto & token = tokens[j];
2023-01-15 09:29:57 +00:00
// const auto tt = token.pt > thold_pt && token.ptsum > 0.01 ? whisper_token_to_str(&ctx, token.tid) : "[?]";
2022-11-02 19:18:20 +00:00
// printf("%s: %10s %6.3f %6.3f %6.3f %6.3f %5d %5d '%s'\n", __func__,
2023-01-15 09:29:57 +00:00
// tt, token.p, token.pt, token.ptsum, token.vlen, (int) token.t0, (int) token.t1, whisper_token_to_str(&ctx, token.id));
2022-11-02 19:18:20 +00:00
2023-01-15 09:29:57 +00:00
// if (tokens[j].id >= whisper_token_eot(&ctx)) {
2022-11-02 19:18:20 +00:00
// continue;
// }
//}
}