whisper : add whisper_token_count helper

This commit is contained in:
Georgi Gerganov 2024-03-25 14:46:07 +02:00
parent 5c2c07d479
commit ba69578828
No known key found for this signature in database
GPG Key ID: BF970631944C16B7
2 changed files with 9 additions and 1 deletions

View File

@ -3731,6 +3731,10 @@ int whisper_tokenize(struct whisper_context * ctx, const char * text, whisper_to
return res.size(); return res.size();
} }
// Count the tokens that `text` produces, without storing any of them.
// whisper_tokenize() is called with no output buffer (NULL, capacity 0) and
// its result is negated to form the value returned here.
int whisper_token_count(struct whisper_context * ctx, const char * text) {
    const int res = whisper_tokenize(ctx, text, NULL, 0);

    return -res;
}
int whisper_lang_max_id() { int whisper_lang_max_id() {
auto max_id = 0; auto max_id = 0;
for (const auto & kv : g_lang) { for (const auto & kv : g_lang) {

View File

@ -345,6 +345,10 @@ extern "C" {
whisper_token * tokens, whisper_token * tokens,
int n_max_tokens); int n_max_tokens);
// Return the number of tokens in the provided text
// Equivalent to: -whisper_tokenize(ctx, text, NULL, 0)
// No token buffer is needed: the text is tokenized only to obtain the count.
// Marked WHISPER_API like the other public functions in this header so the
// symbol is exported when the library is built as a shared object / DLL.
WHISPER_API int whisper_token_count(struct whisper_context * ctx, const char * text);
// Largest language id (i.e. number of available languages - 1) // Largest language id (i.e. number of available languages - 1)
WHISPER_API int whisper_lang_max_id(); WHISPER_API int whisper_lang_max_id();
@ -504,7 +508,7 @@ extern "C" {
// tokens to provide to the whisper decoder as initial prompt // tokens to provide to the whisper decoder as initial prompt
// these are prepended to any existing text context from a previous call // these are prepended to any existing text context from a previous call
// use whisper_tokenize() to convert text to tokens // use whisper_tokenize() to convert text to tokens
// maximum of whisper_n_text_ctx()/2 tokens are used // maximum of whisper_n_text_ctx()/2 tokens are used (typically 224)
const char * initial_prompt; const char * initial_prompt;
const whisper_token * prompt_tokens; const whisper_token * prompt_tokens;
int prompt_n_tokens; int prompt_n_tokens;