mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-01-12 07:53:14 +00:00
talk-llama : sync llama.cpp
This commit is contained in:
parent
d3f6c34976
commit
dbf9c15e30
File diff suppressed because it is too large
Load Diff
@ -67,6 +67,7 @@ extern "C" {
|
|||||||
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
|
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
|
||||||
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
|
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
|
||||||
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
|
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
|
||||||
|
LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
|
||||||
};
|
};
|
||||||
|
|
||||||
// pre-tokenization types
|
// pre-tokenization types
|
||||||
@ -87,6 +88,10 @@ extern "C" {
|
|||||||
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
||||||
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
||||||
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
|
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
|
||||||
};
|
};
|
||||||
|
|
||||||
// note: these values should be synchronized with ggml_rope
|
// note: these values should be synchronized with ggml_rope
|
||||||
@ -177,6 +182,12 @@ extern "C" {
|
|||||||
LLAMA_POOLING_TYPE_LAST = 3,
|
LLAMA_POOLING_TYPE_LAST = 3,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum llama_attention_type {
|
||||||
|
LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
|
||||||
|
LLAMA_ATTENTION_TYPE_CAUSAL = 0,
|
||||||
|
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
|
||||||
|
};
|
||||||
|
|
||||||
enum llama_split_mode {
|
enum llama_split_mode {
|
||||||
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
|
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
|
||||||
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
|
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
|
||||||
@ -294,6 +305,7 @@ extern "C" {
|
|||||||
|
|
||||||
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
||||||
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
|
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
|
||||||
|
enum llama_attention_type attention_type; // attention type to use for embeddings
|
||||||
|
|
||||||
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
||||||
float rope_freq_base; // RoPE base frequency, 0 = from model
|
float rope_freq_base; // RoPE base frequency, 0 = from model
|
||||||
@ -482,6 +494,13 @@ extern "C" {
|
|||||||
// Get a llama model tensor
|
// Get a llama model tensor
|
||||||
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
|
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
|
||||||
|
|
||||||
|
// Returns true if the model contains an encoder that requires llama_encode() call
|
||||||
|
LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
|
||||||
|
|
||||||
|
// For encoder-decoder models, this function returns id of the token that must be provided
|
||||||
|
// to the decoder to start generating output sequence. For other models, it returns -1.
|
||||||
|
LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
|
||||||
|
|
||||||
// Returns 0 on success
|
// Returns 0 on success
|
||||||
LLAMA_API uint32_t llama_model_quantize(
|
LLAMA_API uint32_t llama_model_quantize(
|
||||||
const char * fname_inp,
|
const char * fname_inp,
|
||||||
@ -767,6 +786,14 @@ extern "C" {
|
|||||||
// Frees a batch of tokens allocated with llama_batch_init()
|
// Frees a batch of tokens allocated with llama_batch_init()
|
||||||
LLAMA_API void llama_batch_free(struct llama_batch batch);
|
LLAMA_API void llama_batch_free(struct llama_batch batch);
|
||||||
|
|
||||||
|
// Processes a batch of tokens with the encoder part of the encoder-decoder model.
|
||||||
|
// Stores the encoder output internally for later use by the decoder cross-attention layers.
|
||||||
|
// 0 - success
|
||||||
|
// < 0 - error
|
||||||
|
LLAMA_API int32_t llama_encode(
|
||||||
|
struct llama_context * ctx,
|
||||||
|
struct llama_batch batch);
|
||||||
|
|
||||||
// A positive return value does not mean a fatal error, but rather a warning.
|
// A positive return value does not mean a fatal error, but rather a warning.
|
||||||
// 0 - success
|
// 0 - success
|
||||||
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
|
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
|
||||||
@ -857,6 +884,7 @@ extern "C" {
|
|||||||
LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
|
LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
|
||||||
LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
|
LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
|
||||||
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
|
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
|
||||||
|
LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
|
||||||
|
|
||||||
// Returns -1 if unknown, 1 for true or 0 for false.
|
// Returns -1 if unknown, 1 for true or 0 for false.
|
||||||
LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
|
LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
|
||||||
@ -878,6 +906,7 @@ extern "C" {
|
|||||||
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
|
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
|
||||||
/// @return Returns the number of tokens on success, no more than n_tokens_max
|
/// @return Returns the number of tokens on success, no more than n_tokens_max
|
||||||
/// @return Returns a negative number on failure - the number of tokens that would have been returned
|
/// @return Returns a negative number on failure - the number of tokens that would have been returned
|
||||||
|
/// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
|
||||||
/// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
|
/// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
|
||||||
/// as plaintext. Does not insert a leading space.
|
/// as plaintext. Does not insert a leading space.
|
||||||
LLAMA_API int32_t llama_tokenize(
|
LLAMA_API int32_t llama_tokenize(
|
||||||
@ -892,15 +921,31 @@ extern "C" {
|
|||||||
// Token Id -> Piece.
|
// Token Id -> Piece.
|
||||||
// Uses the vocabulary in the provided context.
|
// Uses the vocabulary in the provided context.
|
||||||
// Does not write null terminator to the buffer.
|
// Does not write null terminator to the buffer.
|
||||||
// User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
|
// User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
|
||||||
// @param special If true, special tokens are rendered in the output.
|
// @param special If true, special tokens are rendered in the output.
|
||||||
LLAMA_API int32_t llama_token_to_piece(
|
LLAMA_API int32_t llama_token_to_piece(
|
||||||
const struct llama_model * model,
|
const struct llama_model * model,
|
||||||
llama_token token,
|
llama_token token,
|
||||||
char * buf,
|
char * buf,
|
||||||
int32_t length,
|
int32_t length,
|
||||||
|
int32_t lstrip,
|
||||||
bool special);
|
bool special);
|
||||||
|
|
||||||
|
/// @details Convert the provided tokens into text (inverse of llama_tokenize()).
|
||||||
|
/// @param text The char pointer must be large enough to hold the resulting text.
|
||||||
|
/// @return Returns the number of chars/bytes on success, no more than text_len_max.
|
||||||
|
/// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
|
||||||
|
/// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
|
||||||
|
/// @param unparse_special If true, special tokens are rendered in the output.
|
||||||
|
LLAMA_API int32_t llama_detokenize(
|
||||||
|
const struct llama_model * model,
|
||||||
|
const llama_token * tokens,
|
||||||
|
int32_t n_tokens,
|
||||||
|
char * text,
|
||||||
|
int32_t text_len_max,
|
||||||
|
bool remove_special,
|
||||||
|
bool unparse_special);
|
||||||
|
|
||||||
/// Apply chat template. Inspired by hf apply_chat_template() on python.
|
/// Apply chat template. Inspired by hf apply_chat_template() on python.
|
||||||
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
|
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
|
||||||
/// NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
|
/// NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
|
||||||
@ -924,6 +969,12 @@ extern "C" {
|
|||||||
// Grammar
|
// Grammar
|
||||||
//
|
//
|
||||||
|
|
||||||
|
/// Initialize a llama_grammar.
|
||||||
|
///
|
||||||
|
/// @param rules The rule elements of the grammar to initialize.
|
||||||
|
/// @param n_rules The number of rules.
|
||||||
|
/// @param start_rule_index The index of the root rule (the starting point of the grammar).
|
||||||
|
/// @return The initialized llama_grammar or nullptr if initialization failed.
|
||||||
LLAMA_API struct llama_grammar * llama_grammar_init(
|
LLAMA_API struct llama_grammar * llama_grammar_init(
|
||||||
const llama_grammar_element ** rules,
|
const llama_grammar_element ** rules,
|
||||||
size_t n_rules,
|
size_t n_rules,
|
||||||
|
@ -35,10 +35,10 @@ static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const
|
|||||||
|
|
||||||
static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
|
static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
|
||||||
std::vector<char> result(8, 0);
|
std::vector<char> result(8, 0);
|
||||||
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
|
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
|
||||||
if (n_tokens < 0) {
|
if (n_tokens < 0) {
|
||||||
result.resize(-n_tokens);
|
result.resize(-n_tokens);
|
||||||
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
|
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
|
||||||
GGML_ASSERT(check == -n_tokens);
|
GGML_ASSERT(check == -n_tokens);
|
||||||
} else {
|
} else {
|
||||||
result.resize(n_tokens);
|
result.resize(n_tokens);
|
||||||
|
@ -7030,4 +7030,3 @@ const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd
|
|||||||
{0x02FA1C, 0x02FA1C, 0x009F3B},
|
{0x02FA1C, 0x02FA1C, 0x009F3B},
|
||||||
{0x02FA1D, 0x02FA1D, 0x02A600},
|
{0x02FA1D, 0x02FA1D, 0x02A600},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -23,7 +23,7 @@ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
|
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
|
||||||
assert(offset < utf8.size());
|
assert(offset < utf8.size());
|
||||||
if (!(utf8[offset + 0] & 0x80)) {
|
if (!(utf8[offset + 0] & 0x80)) {
|
||||||
auto result = utf8[offset + 0];
|
auto result = utf8[offset + 0];
|
||||||
@ -232,8 +232,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
|||||||
};
|
};
|
||||||
|
|
||||||
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
||||||
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
|
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
|
||||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
size_t _prev_end = offset_ini;
|
size_t _prev_end = offset_ini;
|
||||||
@ -295,9 +294,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// regex: <space>?[^\s\p{L}\p{N}]+
|
// regex: <space>?[^\s\p{L}\p{N}]+
|
||||||
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
|
||||||
pos += (cpt == ' ');
|
pos += (cpt == ' ');
|
||||||
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
|
||||||
flags2 = _get_flags(++pos);
|
flags2 = _get_flags(++pos);
|
||||||
}
|
}
|
||||||
_add_token(pos);
|
_add_token(pos);
|
||||||
@ -351,8 +350,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|||||||
};
|
};
|
||||||
|
|
||||||
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
||||||
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
|
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
|
||||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
size_t _prev_end = offset_ini;
|
size_t _prev_end = offset_ini;
|
||||||
@ -394,8 +392,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
|
// regex: [^\r\n\p{L}\p{N}]?\p{L}+
|
||||||
if (!(cpt == '\r' || cpt == '\n' || /*flags.is_letter |*/ flags.is_number)) {
|
if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
|
||||||
if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
|
if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
|
||||||
pos++;
|
pos++;
|
||||||
while (_get_flags(pos).is_letter) {
|
while (_get_flags(pos).is_letter) {
|
||||||
@ -421,9 +419,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|||||||
|
|
||||||
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
|
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
|
||||||
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
|
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
|
||||||
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) {
|
||||||
pos += (cpt == ' ');
|
pos += (cpt == ' ');
|
||||||
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
|
||||||
flags2 = _get_flags(++pos);
|
flags2 = _get_flags(++pos);
|
||||||
}
|
}
|
||||||
uint32_t cpt2 = _get_cpt(pos);
|
uint32_t cpt2 = _get_cpt(pos);
|
||||||
|
@ -48,6 +48,7 @@ struct codepoint_flags {
|
|||||||
|
|
||||||
|
|
||||||
std::string unicode_cpt_to_utf8(uint32_t cp);
|
std::string unicode_cpt_to_utf8(uint32_t cp);
|
||||||
|
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
|
||||||
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
|
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
|
||||||
|
|
||||||
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
|
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
|
||||||
|
@ -2949,7 +2949,7 @@ struct whisper_global_cache {
|
|||||||
// Mel spectrogram
|
// Mel spectrogram
|
||||||
|
|
||||||
void whisper_mel_init(whisper_mel & mel, ggml_backend_t backend, int n_len, int n_len_org, int n_mel) {
|
void whisper_mel_init(whisper_mel & mel, ggml_backend_t backend, int n_len, int n_len_org, int n_mel) {
|
||||||
WHISPER_LOG_INFO("%s: n_len = %d, n_len_org = %d, n_mel = %d\n", __func__, n_len, n_len_org, n_mel);
|
//WHISPER_LOG_INFO("%s: n_len = %d, n_len_org = %d, n_mel = %d\n", __func__, n_len, n_len_org, n_mel);
|
||||||
mel.n_len_org = n_len_org;
|
mel.n_len_org = n_len_org;
|
||||||
assert(!mel.ctx);
|
assert(!mel.ctx);
|
||||||
mel.ctx = ggml_init({ggml_tensor_overhead(), nullptr, true});
|
mel.ctx = ggml_init({ggml_tensor_overhead(), nullptr, true});
|
||||||
|
Loading…
Reference in New Issue
Block a user