mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-01-18 10:46:28 +00:00
talk-llama : sync llama.cpp
This commit is contained in:
parent
9d754a56cf
commit
da9809f243
@ -24,3 +24,24 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
|
|||||||
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
||||||
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
||||||
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||||
|
|
||||||
|
//
|
||||||
|
// helpers
|
||||||
|
//
|
||||||
|
|
||||||
|
// Rewrites `s` in place so that every non-overlapping occurrence of `search`
// (found scanning left to right) is substituted with `replace`.
// An empty `search` leaves `s` untouched.
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }

    const size_t search_len = search.length();

    // assemble the output in a separate buffer, then move it into `s` at the end
    std::string out;
    out.reserve(s.length());

    size_t cursor = 0;
    for (size_t hit = s.find(search, cursor); hit != std::string::npos; hit = s.find(search, cursor)) {
        out.append(s, cursor, hit - cursor); // untouched text before the match
        out.append(replace);
        cursor = hit + search_len;           // resume right after the match
    }

    // tail after the last match (the whole string when nothing matched)
    out.append(s, cursor, std::string::npos);

    s = std::move(out);
}
|
||||||
|
@ -85,14 +85,14 @@ void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_arra
|
|||||||
constexpr float bucket_low = -10.0f;
|
constexpr float bucket_low = -10.0f;
|
||||||
constexpr float bucket_high = 10.0f;
|
constexpr float bucket_high = 10.0f;
|
||||||
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
|
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
|
||||||
constexpr float bucker_inter = -bucket_low * bucket_scale;
|
constexpr float bucket_inter = -bucket_low * bucket_scale;
|
||||||
|
|
||||||
std::vector<int> bucket_idx(candidates->size);
|
std::vector<int> bucket_idx(candidates->size);
|
||||||
std::vector<int> histo(nbuckets, 0);
|
std::vector<int> histo(nbuckets, 0);
|
||||||
|
|
||||||
for (int i = 0; i < (int)candidates->size; ++i) {
|
for (int i = 0; i < (int)candidates->size; ++i) {
|
||||||
const float val = candidates->data[i].logit;
|
const float val = candidates->data[i].logit;
|
||||||
int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
|
int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
|
||||||
ib = std::max(0, std::min(nbuckets-1, ib));
|
ib = std::max(0, std::min(nbuckets-1, ib));
|
||||||
bucket_idx[i] = ib;
|
bucket_idx[i] = ib;
|
||||||
++histo[ib];
|
++histo[ib];
|
||||||
|
@ -16,20 +16,6 @@
|
|||||||
// helpers
|
// helpers
|
||||||
//
|
//
|
||||||
|
|
||||||
// Replaces every non-overlapping occurrence of `search` in `s` with `replace`,
// scanning left to right; `s` is modified in place.
// If `search` is empty, `s` is left unchanged.
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    // an empty pattern matches at every position without advancing, which would
    // make the loop below append `replace` forever
    if (search.empty()) {
        return;
    }

    std::string result;
    result.reserve(s.size());

    size_t pos = 0;
    while (true) {
        const size_t new_pos = s.find(search, pos);
        if (new_pos == std::string::npos) {
            // no further match: copy the remaining tail and stop
            result.append(s, pos, std::string::npos);
            break;
        }
        // copy the text in front of the match, then the substitution
        result.append(s, pos, new_pos - pos);
        result += replace;
        // continue searching right after the matched text
        pos = new_pos + search.length();
    }

    s = std::move(result);
}
|
|
||||||
|
|
||||||
LLAMA_ATTRIBUTE_FORMAT(1, 2)
|
LLAMA_ATTRIBUTE_FORMAT(1, 2)
|
||||||
static std::string format(const char * fmt, ...) {
|
static std::string format(const char * fmt, ...) {
|
||||||
va_list ap;
|
va_list ap;
|
||||||
@ -335,6 +321,21 @@ private:
|
|||||||
|
|
||||||
// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
|
// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
|
||||||
|
|
||||||
|
// std::priority_queue variant whose top element is taken out by move instead of by copy.
// The usual top()/pop() pair is replaced by pop_move(); pop() itself is deleted.
template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>>
class llama_priority_queue : public std::priority_queue<T, Container, Compare> {
    using base_queue = std::priority_queue<T, Container, Compare>;

public:
    // inherit all std::priority_queue constructors
    using base_queue::base_queue;

    // Removes the top element and returns it by value.
    // Like top()/pop(), this must not be called on an empty queue.
    T pop_move() {
        auto & storage = this->c;

        // move the top element out first; pop_heap then moves the vacated slot to the back
        T top_item = std::move(storage.front());
        std::pop_heap(storage.begin(), storage.end(), this->comp);
        storage.pop_back();

        return top_item;
    }

    void pop() = delete;
};
|
||||||
|
|
||||||
struct llm_bigram_bpe {
|
struct llm_bigram_bpe {
|
||||||
struct comparator {
|
struct comparator {
|
||||||
bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
|
bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
|
||||||
@ -343,7 +344,7 @@ struct llm_bigram_bpe {
|
|||||||
};
|
};
|
||||||
|
|
||||||
using queue_storage = std::vector<llm_bigram_bpe>;
|
using queue_storage = std::vector<llm_bigram_bpe>;
|
||||||
using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
|
using queue = llama_priority_queue<llm_bigram_bpe, queue_storage, comparator>;
|
||||||
llm_symbol::index left;
|
llm_symbol::index left;
|
||||||
llm_symbol::index right;
|
llm_symbol::index right;
|
||||||
std::string text;
|
std::string text;
|
||||||
@ -402,6 +403,7 @@ struct llm_tokenizer_bpe {
|
|||||||
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
|
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
|
||||||
case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
|
case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
|
||||||
case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
|
case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
|
||||||
|
case LLAMA_VOCAB_PRE_TYPE_EXAONE:
|
||||||
regex_exprs = {
|
regex_exprs = {
|
||||||
"\\p{N}",
|
"\\p{N}",
|
||||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||||
@ -424,6 +426,8 @@ struct llm_tokenizer_bpe {
|
|||||||
};
|
};
|
||||||
break;
|
break;
|
||||||
case LLAMA_VOCAB_PRE_TYPE_PORO:
|
case LLAMA_VOCAB_PRE_TYPE_PORO:
|
||||||
|
case LLAMA_VOCAB_PRE_TYPE_BLOOM:
|
||||||
|
case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
|
||||||
regex_exprs = {
|
regex_exprs = {
|
||||||
" ?[^(\\s|.,!?…。,、।۔،)]+",
|
" ?[^(\\s|.,!?…。,、।۔،)]+",
|
||||||
};
|
};
|
||||||
@ -531,8 +535,7 @@ struct llm_tokenizer_bpe {
|
|||||||
|
|
||||||
// build token(s)
|
// build token(s)
|
||||||
while (!work_queue.empty()) {
|
while (!work_queue.empty()) {
|
||||||
auto bigram = work_queue.top();
|
auto bigram = work_queue.pop_move();
|
||||||
work_queue.pop();
|
|
||||||
|
|
||||||
auto & left_symbol = symbols[bigram.left];
|
auto & left_symbol = symbols[bigram.left];
|
||||||
auto & right_symbol = symbols[bigram.right];
|
auto & right_symbol = symbols[bigram.right];
|
||||||
@ -1480,11 +1483,11 @@ llama_token llama_token_pad_impl(const struct llama_vocab & vocab) {
|
|||||||
return vocab.special_pad_id;
|
return vocab.special_pad_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab) {
|
bool llama_add_bos_token_impl(const struct llama_vocab & vocab) {
|
||||||
return vocab.tokenizer_add_bos;
|
return vocab.tokenizer_add_bos;
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab) {
|
bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
|
||||||
return vocab.tokenizer_add_eos;
|
return vocab.tokenizer_add_eos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -95,8 +95,8 @@ llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
|
|||||||
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
|
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
|
||||||
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
|
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
|
||||||
|
|
||||||
int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab);
|
bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
|
||||||
int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab);
|
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
|
||||||
|
|
||||||
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
|
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
|
||||||
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
|
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -93,15 +93,15 @@ extern "C" {
|
|||||||
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
||||||
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
||||||
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
||||||
};
|
};
|
||||||
|
|
||||||
// note: these values should be synchronized with ggml_rope
|
|
||||||
// TODO: maybe move this enum to ggml.h (ggml_rope_type)
|
|
||||||
enum llama_rope_type {
|
enum llama_rope_type {
|
||||||
LLAMA_ROPE_TYPE_NONE = -1,
|
LLAMA_ROPE_TYPE_NONE = -1,
|
||||||
LLAMA_ROPE_TYPE_NORM = 0,
|
LLAMA_ROPE_TYPE_NORM = 0,
|
||||||
LLAMA_ROPE_TYPE_NEOX = 2,
|
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
|
||||||
LLAMA_ROPE_TYPE_GLM = 4,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
|
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
|
||||||
@ -504,10 +504,16 @@ extern "C" {
|
|||||||
// Returns true if the model contains an encoder that requires llama_encode() call
|
// Returns true if the model contains an encoder that requires llama_encode() call
|
||||||
LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
|
LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
|
||||||
|
|
||||||
|
// Returns true if the model contains a decoder that requires llama_decode() call
|
||||||
|
LLAMA_API bool llama_model_has_decoder(const struct llama_model * model);
|
||||||
|
|
||||||
// For encoder-decoder models, this function returns id of the token that must be provided
|
// For encoder-decoder models, this function returns id of the token that must be provided
|
||||||
// to the decoder to start generating output sequence. For other models, it returns -1.
|
// to the decoder to start generating output sequence. For other models, it returns -1.
|
||||||
LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
|
LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
|
||||||
|
|
||||||
|
// Returns true if the model is recurrent (like Mamba, RWKV, etc.)
|
||||||
|
LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
|
||||||
|
|
||||||
// Returns 0 on success
|
// Returns 0 on success
|
||||||
LLAMA_API uint32_t llama_model_quantize(
|
LLAMA_API uint32_t llama_model_quantize(
|
||||||
const char * fname_inp,
|
const char * fname_inp,
|
||||||
@ -912,11 +918,8 @@ extern "C" {
|
|||||||
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
|
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
|
||||||
LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
|
LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
|
||||||
|
|
||||||
// Returns -1 if unknown, 1 for true or 0 for false.
|
LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
|
||||||
LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
|
LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
|
||||||
|
|
||||||
// Returns -1 if unknown, 1 for true or 0 for false.
|
|
||||||
LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
|
|
||||||
|
|
||||||
// Codellama infill tokens
|
// Codellama infill tokens
|
||||||
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
|
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
|
||||||
|
Loading…
Reference in New Issue
Block a user