mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2024-12-18 20:27:53 +00:00
talk-llama : sync llama.cpp
This commit is contained in:
parent
eb97b257eb
commit
61edb117a0
@ -1396,19 +1396,15 @@ struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab
|
||||
// penalties
|
||||
|
||||
struct llama_sampler_penalties {
|
||||
const int32_t n_vocab;
|
||||
const llama_token special_eos_id;
|
||||
const llama_token linefeed_id;
|
||||
|
||||
const int32_t penalty_last_n;
|
||||
const float penalty_repeat;
|
||||
const float penalty_freq;
|
||||
const float penalty_present;
|
||||
|
||||
const bool penalize_nl;
|
||||
const bool ignore_eos;
|
||||
|
||||
ring_buffer<llama_token> prev;
|
||||
|
||||
// a frequency map to count token occurrences
|
||||
std::unordered_map<llama_token, int> token_count;
|
||||
};
|
||||
|
||||
static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) {
|
||||
@ -1421,76 +1417,50 @@ static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_to
|
||||
return;
|
||||
}
|
||||
|
||||
ctx->token_count[token]++;
|
||||
|
||||
// if the ring buffer is full, remove the oldest token
|
||||
if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) {
|
||||
const auto old = ctx->prev.front();
|
||||
|
||||
ctx->token_count[old]--;
|
||||
if (ctx->token_count[old] == 0) {
|
||||
ctx->token_count.erase(old);
|
||||
}
|
||||
}
|
||||
|
||||
ctx->prev.push_back(token);
|
||||
|
||||
#if 0
|
||||
// sanity check
|
||||
std::unordered_map<llama_token, int> tmp;
|
||||
for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
|
||||
tmp[ctx->prev.rat(i)]++;
|
||||
}
|
||||
|
||||
assert(ctx->token_count == tmp);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
auto * ctx = (llama_sampler_penalties *) smpl->ctx;
|
||||
|
||||
if (ctx->ignore_eos) {
|
||||
assert(ctx->special_eos_id >= 0);
|
||||
|
||||
// optimistically check if the candidates are not yet sorted/shuffled/truncated
|
||||
if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) {
|
||||
cur_p->data[ctx->special_eos_id].logit = -INFINITY;
|
||||
} else {
|
||||
// else, search for the special EOS token
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
if (cur_p->data[i].id == ctx->special_eos_id) {
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ((ctx->penalty_last_n == 0) ||
|
||||
(ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
|
||||
return;
|
||||
}
|
||||
|
||||
bool nl_found = false;
|
||||
size_t nl_idx = 0;
|
||||
float nl_logit = -INFINITY;
|
||||
if (!ctx->penalize_nl) {
|
||||
assert(ctx->linefeed_id >= 0);
|
||||
|
||||
// optimistically check if the candidates are not yet sorted/shuffled/truncated
|
||||
if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) {
|
||||
nl_found = true;
|
||||
nl_idx = ctx->linefeed_id;
|
||||
nl_logit = cur_p->data[ctx->linefeed_id].logit;
|
||||
} else {
|
||||
// else, search for the linefeed token
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
if (cur_p->data[i].id == ctx->linefeed_id) {
|
||||
nl_found = true;
|
||||
nl_idx = i;
|
||||
nl_logit = cur_p->data[i].logit;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create a frequency map to count occurrences of each token in last_tokens
|
||||
// TODO: optimize this by maintaining the token count in the sampler context
|
||||
using llama_token_cnt = std::unordered_map<llama_token, int>;
|
||||
llama_token_cnt token_count;
|
||||
|
||||
for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
|
||||
token_count[ctx->prev.rat(i)]++;
|
||||
}
|
||||
|
||||
// Apply frequency and presence penalties to the cur_p
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
const auto token_iter = token_count.find(cur_p->data[i].id);
|
||||
if (token_iter == token_count.end()) {
|
||||
const auto token_iter = ctx->token_count.find(cur_p->data[i].id);
|
||||
if (token_iter == ctx->token_count.end()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const int count = token_iter->second;
|
||||
|
||||
assert(count > 0 && count <= ctx->penalty_last_n);
|
||||
|
||||
// The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
|
||||
// This is common fix for this problem, which is to multiply by the penalty instead of dividing.
|
||||
if (cur_p->data[i].logit <= 0) {
|
||||
@ -1503,30 +1473,21 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok
|
||||
}
|
||||
|
||||
cur_p->sorted = false;
|
||||
|
||||
if (!ctx->penalize_nl && nl_found) {
|
||||
// restore the logit of the newline token if it was penalized
|
||||
cur_p->data[nl_idx].logit = nl_logit;
|
||||
}
|
||||
}
|
||||
|
||||
static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
|
||||
auto * ctx = (llama_sampler_penalties *) smpl->ctx;
|
||||
ctx->prev.clear();
|
||||
ctx->token_count.clear();
|
||||
}
|
||||
|
||||
static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) {
|
||||
const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
|
||||
auto * result = llama_sampler_init_penalties(
|
||||
ctx->n_vocab,
|
||||
ctx->special_eos_id,
|
||||
ctx->linefeed_id,
|
||||
ctx->penalty_last_n,
|
||||
ctx->penalty_repeat,
|
||||
ctx->penalty_freq,
|
||||
ctx->penalty_present,
|
||||
ctx->penalize_nl,
|
||||
ctx->ignore_eos);
|
||||
ctx->penalty_present);
|
||||
|
||||
// copy the state
|
||||
{
|
||||
@ -1552,38 +1513,21 @@ static struct llama_sampler_i llama_sampler_penalties_i = {
|
||||
};
|
||||
|
||||
struct llama_sampler * llama_sampler_init_penalties(
|
||||
int32_t n_vocab,
|
||||
llama_token special_eos_id,
|
||||
llama_token linefeed_id,
|
||||
int32_t penalty_last_n,
|
||||
float penalty_repeat,
|
||||
float penalty_freq,
|
||||
float penalty_present,
|
||||
bool penalize_nl,
|
||||
bool ignore_eos) {
|
||||
if (linefeed_id == LLAMA_TOKEN_NULL) {
|
||||
penalize_nl = true;
|
||||
}
|
||||
|
||||
if (special_eos_id == LLAMA_TOKEN_NULL) {
|
||||
ignore_eos = false;
|
||||
}
|
||||
|
||||
float penalty_present) {
|
||||
penalty_last_n = std::max(penalty_last_n, 0);
|
||||
|
||||
return new llama_sampler {
|
||||
/* .iface = */ &llama_sampler_penalties_i,
|
||||
/* .ctx = */ new llama_sampler_penalties {
|
||||
/* .n_vocab = */ n_vocab,
|
||||
/* .special_eos_id = */ special_eos_id,
|
||||
/* .linefeed_id = */ linefeed_id,
|
||||
/* .penalty_last_n = */ penalty_last_n,
|
||||
/* .penalty_repeat = */ penalty_repeat,
|
||||
/* .penalty_freq = */ penalty_freq,
|
||||
/* .penalty_present = */ penalty_present,
|
||||
/* .penalize_nl = */ penalize_nl,
|
||||
/* .ignore_eos = */ ignore_eos,
|
||||
/* .prev = */ ring_buffer<llama_token>(penalty_last_n),
|
||||
/* .token_count = */ {},
|
||||
},
|
||||
};
|
||||
}
|
||||
@ -1611,7 +1555,8 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
|
||||
if (word.find(str) != std::string::npos) {
|
||||
token_sequences.emplace(token_id, std::vector<llama_token>());
|
||||
} else {
|
||||
size_t word_len = word.size(), str_len = str.size();
|
||||
size_t word_len = word.size();
|
||||
size_t str_len = str.size();
|
||||
size_t pos = -1;
|
||||
while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
|
||||
bool match = true;
|
||||
|
@ -418,6 +418,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
|
||||
case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
|
||||
case LLAMA_VOCAB_PRE_TYPE_EXAONE:
|
||||
case LLAMA_VOCAB_PRE_TYPE_MINERVA:
|
||||
regex_exprs = {
|
||||
"\\p{N}",
|
||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||
@ -737,7 +738,7 @@ struct llm_tokenizer_wpm_session {
|
||||
std::vector<std::string> words(1, "");
|
||||
|
||||
for (const uint32_t cpt : cpts_nfd) {
|
||||
const auto flags = unicode_cpt_flags(cpt);
|
||||
const auto flags = unicode_cpt_flags_from_cpt(cpt);
|
||||
|
||||
if (flags.is_whitespace) {
|
||||
if (words.back().size()) { // finish previous word if any
|
||||
|
@ -163,6 +163,7 @@ enum llm_arch {
|
||||
LLM_ARCH_QWEN,
|
||||
LLM_ARCH_QWEN2,
|
||||
LLM_ARCH_QWEN2MOE,
|
||||
LLM_ARCH_QWEN2VL,
|
||||
LLM_ARCH_PHI2,
|
||||
LLM_ARCH_PHI3,
|
||||
LLM_ARCH_PLAMO,
|
||||
@ -183,6 +184,7 @@ enum llm_arch {
|
||||
LLM_ARCH_OLMOE,
|
||||
LLM_ARCH_OPENELM,
|
||||
LLM_ARCH_ARCTIC,
|
||||
LLM_ARCH_DEEPSEEK,
|
||||
LLM_ARCH_DEEPSEEK2,
|
||||
LLM_ARCH_CHATGLM,
|
||||
LLM_ARCH_BITNET,
|
||||
@ -217,6 +219,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_QWEN, "qwen" },
|
||||
{ LLM_ARCH_QWEN2, "qwen2" },
|
||||
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
|
||||
{ LLM_ARCH_QWEN2VL, "qwen2vl" },
|
||||
{ LLM_ARCH_PHI2, "phi2" },
|
||||
{ LLM_ARCH_PHI3, "phi3" },
|
||||
{ LLM_ARCH_PLAMO, "plamo" },
|
||||
@ -237,6 +240,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_OLMOE, "olmoe" },
|
||||
{ LLM_ARCH_OPENELM, "openelm" },
|
||||
{ LLM_ARCH_ARCTIC, "arctic" },
|
||||
{ LLM_ARCH_DEEPSEEK, "deepseek" },
|
||||
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
|
||||
{ LLM_ARCH_CHATGLM, "chatglm" },
|
||||
{ LLM_ARCH_BITNET, "bitnet" },
|
||||
@ -308,6 +312,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_SCALE,
|
||||
|
||||
LLM_KV_ROPE_DIMENSION_COUNT,
|
||||
LLM_KV_ROPE_DIMENSION_SECTIONS,
|
||||
LLM_KV_ROPE_FREQ_BASE,
|
||||
LLM_KV_ROPE_SCALE_LINEAR,
|
||||
LLM_KV_ROPE_SCALING_TYPE,
|
||||
@ -424,6 +429,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
|
||||
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
||||
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
|
||||
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
|
||||
@ -898,6 +904,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_QWEN2VL,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_QWEN2MOE,
|
||||
{
|
||||
@ -1288,6 +1311,33 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_DEEPSEEK,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
||||
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
||||
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_DEEPSEEK2,
|
||||
{
|
||||
@ -1562,6 +1612,7 @@ enum llm_chat_template {
|
||||
LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
|
||||
LLM_CHAT_TEMPLATE_MISTRAL_V7,
|
||||
LLM_CHAT_TEMPLATE_PHI_3,
|
||||
LLM_CHAT_TEMPLATE_FALCON_3,
|
||||
LLM_CHAT_TEMPLATE_ZEPHYR,
|
||||
LLM_CHAT_TEMPLATE_MONARCH,
|
||||
LLM_CHAT_TEMPLATE_GEMMA,
|
||||
@ -1579,6 +1630,7 @@ enum llm_chat_template {
|
||||
LLM_CHAT_TEMPLATE_EXAONE_3,
|
||||
LLM_CHAT_TEMPLATE_RWKV_WORLD,
|
||||
LLM_CHAT_TEMPLATE_GRANITE,
|
||||
LLM_CHAT_TEMPLATE_GIGACHAT,
|
||||
LLM_CHAT_TEMPLATE_UNKNOWN,
|
||||
};
|
||||
|
||||
@ -1593,6 +1645,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
|
||||
{ "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
|
||||
{ "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
|
||||
{ "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
|
||||
{ "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
|
||||
{ "monarch", LLM_CHAT_TEMPLATE_MONARCH },
|
||||
{ "gemma", LLM_CHAT_TEMPLATE_GEMMA },
|
||||
@ -1610,6 +1663,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
|
||||
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
|
||||
{ "granite", LLM_CHAT_TEMPLATE_GRANITE },
|
||||
{ "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
|
||||
};
|
||||
|
||||
static llm_arch llm_arch_from_string(const std::string & name) {
|
||||
@ -1794,7 +1848,7 @@ private:
|
||||
DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
|
||||
NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
|
||||
if (!bufLen) {
|
||||
ret = format("Win32 error code: %s", error_code);
|
||||
ret = format("Win32 error code: %lx", error_code);
|
||||
} else {
|
||||
ret = lpMsgBuf;
|
||||
LocalFree(lpMsgBuf);
|
||||
@ -2132,7 +2186,7 @@ struct llama_mmap {
|
||||
HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
|
||||
|
||||
// may fail on pre-Windows 8 systems
|
||||
pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
|
||||
pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
|
||||
|
||||
if (pPrefetchVirtualMemory) {
|
||||
// advise the kernel to preload the mapped memory
|
||||
@ -2474,11 +2528,12 @@ struct llama_hparams {
|
||||
uint32_t time_decay_extra_dim = 0;
|
||||
uint32_t wkv_head_size = 0;
|
||||
|
||||
float rope_attn_factor = 1.0f;
|
||||
float rope_freq_base_train;
|
||||
float rope_freq_scale_train;
|
||||
uint32_t n_ctx_orig_yarn;
|
||||
float rope_yarn_log_mul;
|
||||
float rope_attn_factor = 1.0f;
|
||||
float rope_freq_base_train;
|
||||
float rope_freq_scale_train;
|
||||
uint32_t n_ctx_orig_yarn;
|
||||
float rope_yarn_log_mul;
|
||||
int rope_sections[4];
|
||||
|
||||
// for State Space Models
|
||||
uint32_t ssm_d_conv = 0;
|
||||
@ -2535,6 +2590,9 @@ struct llama_hparams {
|
||||
|
||||
if (this->rope_finetuned != other.rope_finetuned) return true;
|
||||
if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
|
||||
if (std::equal(std::begin(this->rope_sections),
|
||||
std::end(this->rope_sections),
|
||||
std::begin(other.rope_sections))) return true;
|
||||
|
||||
if (this->ssm_d_conv != other.ssm_d_conv) return true;
|
||||
if (this->ssm_d_inner != other.ssm_d_inner) return true;
|
||||
@ -3378,6 +3436,11 @@ struct llama_context {
|
||||
// whether we are computing encoder output or decoder output
|
||||
bool is_encoding = false;
|
||||
|
||||
// TODO: find a better way to accommodate mutli-dimension position encoding methods
|
||||
// number of position id each token get, 1 for each token in most cases.
|
||||
// when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
|
||||
int n_pos_per_token = 1;
|
||||
|
||||
// output of the encoder part of the encoder-decoder models
|
||||
std::vector<float> embd_enc;
|
||||
std::vector<std::set<llama_seq_id>> seq_ids_enc;
|
||||
@ -4578,9 +4641,6 @@ struct llama_model_loader {
|
||||
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
|
||||
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
|
||||
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
|
||||
case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
|
||||
case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
|
||||
case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
|
||||
default:
|
||||
{
|
||||
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
||||
@ -5344,9 +5404,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
||||
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
|
||||
|
||||
default: return "unknown, may not work";
|
||||
}
|
||||
@ -5753,6 +5810,13 @@ static void llm_load_hparams(
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_QWEN2VL:
|
||||
{
|
||||
std::array<int, 4> section_dims;
|
||||
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true);
|
||||
std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections));
|
||||
}
|
||||
// fall through
|
||||
case LLM_ARCH_QWEN2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
@ -6063,6 +6127,19 @@ static void llm_load_hparams(
|
||||
model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DEEPSEEK:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 28: model.type = e_model::MODEL_20B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DEEPSEEK2:
|
||||
{
|
||||
bool is_lite = (hparams.n_layer == 27);
|
||||
@ -6398,6 +6475,11 @@ static void llm_load_vocab(
|
||||
} else if (
|
||||
tokenizer_pre == "falcon") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
|
||||
} else if (
|
||||
tokenizer_pre == "falcon3") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
|
||||
vocab.tokenizer_ignore_merges = true;
|
||||
vocab.tokenizer_add_bos = true;
|
||||
} else if (
|
||||
tokenizer_pre == "mpt") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
|
||||
@ -6409,6 +6491,7 @@ static void llm_load_vocab(
|
||||
tokenizer_pre == "phi-2" ||
|
||||
tokenizer_pre == "jina-es" ||
|
||||
tokenizer_pre == "jina-de" ||
|
||||
tokenizer_pre == "gigachat" ||
|
||||
tokenizer_pre == "jina-v1-en" ||
|
||||
tokenizer_pre == "jina-v2-es" ||
|
||||
tokenizer_pre == "jina-v2-de" ||
|
||||
@ -6479,6 +6562,9 @@ static void llm_load_vocab(
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
|
||||
vocab.tokenizer_add_bos = true;
|
||||
vocab.tokenizer_clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "minerva-7b") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
|
||||
} else {
|
||||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
}
|
||||
@ -7057,6 +7143,13 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
|
||||
LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
|
||||
|
||||
if (model.arch == LLM_ARCH_DEEPSEEK) {
|
||||
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
||||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
||||
}
|
||||
|
||||
if (model.arch == LLM_ARCH_DEEPSEEK2) {
|
||||
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
||||
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
|
||||
@ -8170,6 +8263,7 @@ static bool llm_load_tensors(
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_QWEN2:
|
||||
case LLM_ARCH_QWEN2VL:
|
||||
{
|
||||
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
@ -8830,6 +8924,55 @@ static bool llm_load_tensors(
|
||||
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DEEPSEEK:
|
||||
{
|
||||
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp;
|
||||
const int64_t n_expert_shared = hparams.n_expert_shared;
|
||||
|
||||
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
// output
|
||||
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
if (i < (int) hparams.n_layer_dense_lead) {
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
} else {
|
||||
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
||||
|
||||
if (n_expert == 0) {
|
||||
throw std::runtime_error("n_expert must be > 0");
|
||||
}
|
||||
if (n_expert_used == 0) {
|
||||
throw std::runtime_error("n_expert_used must be > 0");
|
||||
}
|
||||
|
||||
// MoE branch
|
||||
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
||||
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
||||
|
||||
// Shared expert branch
|
||||
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
|
||||
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DEEPSEEK2:
|
||||
{
|
||||
const bool is_lite = (hparams.n_layer == 27);
|
||||
@ -12559,6 +12702,124 @@ struct llm_build_context {
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_qwen2vl() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4);
|
||||
cb(lctx.inp_pos, "inp_pos", -1);
|
||||
ggml_set_input(lctx.inp_pos);
|
||||
struct ggml_tensor * inp_pos = lctx.inp_pos;
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||
int sections[4];
|
||||
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_rope_multi(
|
||||
ctx0,
|
||||
ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
||||
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_multi(
|
||||
ctx0,
|
||||
ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
||||
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = llm_build_ffn(ctx0, lctx, cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
model.layers[il].ffn_gate, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_qwen2moe() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
@ -15066,6 +15327,161 @@ struct llm_build_context {
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_deepseek() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
||||
int32_t n_tokens = this->n_tokens;
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
struct ggml_tensor * rope_factors = build_rope_factors(il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
n_tokens = n_outputs;
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
if ((uint32_t) il < hparams.n_layer_dense_lead) {
|
||||
cur = llm_build_ffn(ctx0, lctx, cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
model.layers[il].ffn_gate, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
} else {
|
||||
// MoE branch
|
||||
ggml_tensor * moe_out =
|
||||
llm_build_moe_ffn(ctx0, lctx, cur,
|
||||
model.layers[il].ffn_gate_inp,
|
||||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, false,
|
||||
false, hparams.expert_weights_scale,
|
||||
cb, il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
||||
// FFN shared expert
|
||||
{
|
||||
ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
|
||||
model.layers[il].ffn_up_shexp, NULL, NULL,
|
||||
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
||||
model.layers[il].ffn_down_shexp, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(ffn_shexp, "ffn_shexp", il);
|
||||
|
||||
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_deepseek2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
@ -16660,6 +17076,11 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_qwen2();
|
||||
} break;
|
||||
case LLM_ARCH_QWEN2VL:
|
||||
{
|
||||
lctx.n_pos_per_token = 4;
|
||||
result = llm.build_qwen2vl();
|
||||
} break;
|
||||
case LLM_ARCH_QWEN2MOE:
|
||||
{
|
||||
result = llm.build_qwen2moe();
|
||||
@ -16748,6 +17169,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_arctic();
|
||||
} break;
|
||||
case LLM_ARCH_DEEPSEEK:
|
||||
{
|
||||
result = llm.build_deepseek();
|
||||
} break;
|
||||
case LLM_ARCH_DEEPSEEK2:
|
||||
{
|
||||
result = llm.build_deepseek2();
|
||||
@ -16878,8 +17303,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
|
||||
|
||||
if (ubatch.pos && lctx.inp_pos) {
|
||||
const int64_t n_tokens = ubatch.n_tokens;
|
||||
|
||||
ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
|
||||
auto n_pos = lctx.n_pos_per_token;
|
||||
ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos));
|
||||
}
|
||||
|
||||
if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||
@ -18364,10 +18789,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
new_type = GGML_TYPE_IQ3_S;
|
||||
}
|
||||
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
|
||||
new_type == GGML_TYPE_Q4_0_8_8) {
|
||||
new_type = GGML_TYPE_Q4_0;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
@ -18690,9 +19111,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
|
||||
|
||||
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
||||
}
|
||||
@ -19031,14 +19449,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
f32_data = (float *) f32_conv_buf.data();
|
||||
}
|
||||
|
||||
int chunk_size_multiplier = 1;
|
||||
if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
|
||||
if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
|
||||
else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
|
||||
if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
|
||||
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
|
||||
fflush(stdout);
|
||||
|
||||
@ -19051,8 +19461,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
const int64_t nrows = tensor->ne[1];
|
||||
|
||||
static const int64_t min_chunk_size = 32 * 512;
|
||||
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
|
||||
chunk_size_multiplier;
|
||||
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
|
||||
|
||||
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
||||
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
||||
@ -19995,6 +20404,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
case LLM_ARCH_COMMAND_R:
|
||||
case LLM_ARCH_OLMO:
|
||||
case LLM_ARCH_ARCTIC:
|
||||
case LLM_ARCH_DEEPSEEK:
|
||||
case LLM_ARCH_DEEPSEEK2:
|
||||
case LLM_ARCH_CHATGLM:
|
||||
case LLM_ARCH_GRANITE:
|
||||
@ -20028,6 +20438,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
case LLM_ARCH_MINICPM3:
|
||||
return LLAMA_ROPE_TYPE_NEOX;
|
||||
|
||||
case LLM_ARCH_QWEN2VL:
|
||||
return LLAMA_ROPE_TYPE_MROPE;
|
||||
|
||||
// all model arches should be listed explicitly here
|
||||
case LLM_ARCH_UNKNOWN:
|
||||
GGML_ABORT("unknown architecture");
|
||||
@ -21596,7 +22009,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
|
||||
throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
|
||||
}
|
||||
} else if ((size_t) i >= ctx->output_ids.size()) {
|
||||
throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
|
||||
throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
|
||||
} else {
|
||||
j = ctx->output_ids[i];
|
||||
}
|
||||
@ -21813,6 +22226,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
|
||||
}
|
||||
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
|
||||
return LLM_CHAT_TEMPLATE_PHI_3;
|
||||
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
|
||||
return LLM_CHAT_TEMPLATE_FALCON_3;
|
||||
} else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
|
||||
return LLM_CHAT_TEMPLATE_ZEPHYR;
|
||||
} else if (tmpl_contains("bos_token + message['role']")) {
|
||||
@ -21857,6 +22272,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
|
||||
return LLM_CHAT_TEMPLATE_RWKV_WORLD;
|
||||
} else if (tmpl_contains("<|start_of_role|>")) {
|
||||
return LLM_CHAT_TEMPLATE_GRANITE;
|
||||
} else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
|
||||
return LLM_CHAT_TEMPLATE_GIGACHAT;
|
||||
}
|
||||
return LLM_CHAT_TEMPLATE_UNKNOWN;
|
||||
}
|
||||
@ -21963,6 +22380,15 @@ static int32_t llama_chat_apply_template_internal(
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>\n";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
|
||||
// Falcon 3
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
ss << "<|" << role << "|>\n" << message->content << "\n";
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>\n";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
|
||||
// zephyr template
|
||||
for (auto message : chat) {
|
||||
@ -22180,6 +22606,32 @@ static int32_t llama_chat_apply_template_internal(
|
||||
if (add_ass) {
|
||||
ss << "<|start_of_role|>assistant<|end_of_role|>\n";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
|
||||
// GigaChat template
|
||||
bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
|
||||
|
||||
// Handle system message if present
|
||||
if (has_system) {
|
||||
ss << "<s>" << chat[0]->content << "<|message_sep|>";
|
||||
} else {
|
||||
ss << "<s>";
|
||||
}
|
||||
|
||||
// Process remaining messages
|
||||
for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
|
||||
std::string role(chat[i]->role);
|
||||
if (role == "user") {
|
||||
ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
|
||||
<< "available functions<|role_sep|>[]<|message_sep|>";
|
||||
} else if (role == "assistant") {
|
||||
ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
|
||||
}
|
||||
}
|
||||
|
||||
// Add generation prompt if needed
|
||||
if (add_ass) {
|
||||
ss << "assistant<|role_sep|>";
|
||||
}
|
||||
} else {
|
||||
// template not supported
|
||||
return -1;
|
||||
|
@ -104,12 +104,15 @@ extern "C" {
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
||||
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
|
||||
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
|
||||
};
|
||||
|
||||
enum llama_rope_type {
|
||||
LLAMA_ROPE_TYPE_NONE = -1,
|
||||
LLAMA_ROPE_TYPE_NORM = 0,
|
||||
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
|
||||
LLAMA_ROPE_TYPE_NONE = -1,
|
||||
LLAMA_ROPE_TYPE_NORM = 0,
|
||||
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
|
||||
LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE,
|
||||
LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
|
||||
};
|
||||
|
||||
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
|
||||
@ -171,9 +174,9 @@ extern "C" {
|
||||
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
|
||||
//LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack
|
||||
//LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack
|
||||
//LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
|
||||
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
|
||||
|
||||
@ -455,6 +458,7 @@ extern "C" {
|
||||
// Functions to access the model's GGUF metadata scalar values
|
||||
// - The functions return the length of the string on success, or -1 on failure
|
||||
// - The output string is always null-terminated and cleared on failure
|
||||
// - When retrieving a string, an extra byte must be allocated to account for the null terminator
|
||||
// - GGUF array values are not supported by these functions
|
||||
|
||||
// Get metadata value as a string by key name
|
||||
@ -1135,16 +1139,12 @@ extern "C" {
|
||||
const char * grammar_str,
|
||||
const char * grammar_root);
|
||||
|
||||
/// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
|
||||
int32_t n_vocab, // llama_n_vocab()
|
||||
llama_token special_eos_id, // llama_token_eos()
|
||||
llama_token linefeed_id, // llama_token_nl()
|
||||
int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat, // 1.0 = disabled
|
||||
float penalty_freq, // 0.0 = disabled
|
||||
float penalty_present, // 0.0 = disabled
|
||||
bool penalize_nl, // consider newlines as a repeatable token
|
||||
bool ignore_eos); // ignore the end-of-sequence token
|
||||
int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat, // 1.0 = disabled
|
||||
float penalty_freq, // 0.0 = disabled
|
||||
float penalty_present); // 0.0 = disabled
|
||||
|
||||
/// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_dry(
|
||||
|
@ -71,15 +71,15 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
|
||||
throw std::invalid_argument("failed to convert utf8 to codepoint");
|
||||
}
|
||||
|
||||
//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
|
||||
//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cpt) {
|
||||
// std::vector<uint16_t> result;
|
||||
// if (/* 0x0000 <= cp && */ cp <= 0xffff) {
|
||||
// result.emplace_back(cp);
|
||||
// if (/* 0x0000 <= cpt && */ cpt <= 0xffff) {
|
||||
// result.emplace_back(cpt);
|
||||
// return result;
|
||||
// }
|
||||
// if (0x10000 <= cp && cp <= 0x10ffff) {
|
||||
// result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
|
||||
// result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
|
||||
// if (0x10000 <= cpt && cpt <= 0x10ffff) {
|
||||
// result.emplace_back(0xd800 | ((cpt - 0x10000) >> 10));
|
||||
// result.emplace_back(0xdc00 | ((cpt - 0x10000) & 0x03ff));
|
||||
// return result;
|
||||
// }
|
||||
// throw std::invalid_argument("failed to convert codepoint to utf16");
|
||||
@ -120,8 +120,8 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
|
||||
// return result;
|
||||
//}
|
||||
|
||||
static std::vector<codepoint_flags> unicode_cpt_flags_array() {
|
||||
std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
|
||||
static std::vector<unicode_cpt_flags> unicode_cpt_flags_array() {
|
||||
std::vector<unicode_cpt_flags> cpt_flags(MAX_CODEPOINTS, unicode_cpt_flags::UNDEFINED);
|
||||
|
||||
assert (unicode_ranges_flags.begin()[0].first == 0);
|
||||
assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
|
||||
@ -253,8 +253,8 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
|
||||
};
|
||||
|
||||
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
|
||||
auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
|
||||
};
|
||||
|
||||
size_t _prev_end = offset_ini;
|
||||
@ -371,8 +371,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
|
||||
};
|
||||
|
||||
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
|
||||
auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
|
||||
};
|
||||
|
||||
size_t _prev_end = offset_ini;
|
||||
@ -572,29 +572,29 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
|
||||
// interface
|
||||
//
|
||||
|
||||
std::string unicode_cpt_to_utf8(uint32_t cp) {
|
||||
std::string unicode_cpt_to_utf8(uint32_t cpt) {
|
||||
std::string result;
|
||||
|
||||
if (/* 0x00 <= cp && */ cp <= 0x7f) {
|
||||
result.push_back(cp);
|
||||
if (/* 0x00 <= cpt && */ cpt <= 0x7f) {
|
||||
result.push_back(cpt);
|
||||
return result;
|
||||
}
|
||||
if (0x80 <= cp && cp <= 0x7ff) {
|
||||
result.push_back(0xc0 | ((cp >> 6) & 0x1f));
|
||||
result.push_back(0x80 | (cp & 0x3f));
|
||||
if (0x80 <= cpt && cpt <= 0x7ff) {
|
||||
result.push_back(0xc0 | ((cpt >> 6) & 0x1f));
|
||||
result.push_back(0x80 | (cpt & 0x3f));
|
||||
return result;
|
||||
}
|
||||
if (0x800 <= cp && cp <= 0xffff) {
|
||||
result.push_back(0xe0 | ((cp >> 12) & 0x0f));
|
||||
result.push_back(0x80 | ((cp >> 6) & 0x3f));
|
||||
result.push_back(0x80 | (cp & 0x3f));
|
||||
if (0x800 <= cpt && cpt <= 0xffff) {
|
||||
result.push_back(0xe0 | ((cpt >> 12) & 0x0f));
|
||||
result.push_back(0x80 | ((cpt >> 6) & 0x3f));
|
||||
result.push_back(0x80 | (cpt & 0x3f));
|
||||
return result;
|
||||
}
|
||||
if (0x10000 <= cp && cp <= 0x10ffff) {
|
||||
result.push_back(0xf0 | ((cp >> 18) & 0x07));
|
||||
result.push_back(0x80 | ((cp >> 12) & 0x3f));
|
||||
result.push_back(0x80 | ((cp >> 6) & 0x3f));
|
||||
result.push_back(0x80 | (cp & 0x3f));
|
||||
if (0x10000 <= cpt && cpt <= 0x10ffff) {
|
||||
result.push_back(0xf0 | ((cpt >> 18) & 0x07));
|
||||
result.push_back(0x80 | ((cpt >> 12) & 0x3f));
|
||||
result.push_back(0x80 | ((cpt >> 6) & 0x3f));
|
||||
result.push_back(0x80 | (cpt & 0x3f));
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -624,19 +624,19 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
|
||||
return result;
|
||||
}
|
||||
|
||||
codepoint_flags unicode_cpt_flags(const uint32_t cp) {
|
||||
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
|
||||
unicode_cpt_flags unicode_cpt_flags_from_cpt(const uint32_t cpt) {
|
||||
static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
|
||||
static const auto cpt_flags = unicode_cpt_flags_array();
|
||||
return cp < cpt_flags.size() ? cpt_flags[cp] : undef;
|
||||
return cpt < cpt_flags.size() ? cpt_flags[cpt] : undef;
|
||||
}
|
||||
|
||||
codepoint_flags unicode_cpt_flags(const std::string & utf8) {
|
||||
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
|
||||
unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8) {
|
||||
static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
|
||||
if (utf8.empty()) {
|
||||
return undef; // undefined
|
||||
}
|
||||
size_t offset = 0;
|
||||
return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset));
|
||||
return unicode_cpt_flags_from_cpt(unicode_cpt_from_utf8(utf8, offset));
|
||||
}
|
||||
|
||||
std::string unicode_byte_to_utf8(uint8_t byte) {
|
||||
@ -649,41 +649,41 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
|
||||
return map.at(utf8);
|
||||
}
|
||||
|
||||
uint32_t unicode_tolower(uint32_t cp) {
|
||||
uint32_t unicode_tolower(uint32_t cpt) {
|
||||
// binary search
|
||||
auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp,
|
||||
auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cpt,
|
||||
[](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
|
||||
return pair.first < value;
|
||||
});
|
||||
if (it != unicode_map_lowercase.end() && it->first == cp) {
|
||||
if (it != unicode_map_lowercase.end() && it->first == cpt) {
|
||||
return it->second;
|
||||
}
|
||||
return cp; // Return the original code point if no lowercase mapping is found
|
||||
return cpt; // Return the original code point if no lowercase mapping is found
|
||||
}
|
||||
|
||||
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
|
||||
// unicode categories
|
||||
static const std::map<std::string, int> k_ucat_enum = {
|
||||
{ "\\p{N}", codepoint_flags::NUMBER },
|
||||
{ "\\p{L}", codepoint_flags::LETTER },
|
||||
{ "\\p{P}", codepoint_flags::PUNCTUATION },
|
||||
{ "\\p{N}", unicode_cpt_flags::NUMBER },
|
||||
{ "\\p{L}", unicode_cpt_flags::LETTER },
|
||||
{ "\\p{P}", unicode_cpt_flags::PUNCTUATION },
|
||||
};
|
||||
|
||||
static const std::map<int, int> k_ucat_cpt = {
|
||||
{ codepoint_flags::NUMBER, 0xD1 },
|
||||
{ codepoint_flags::LETTER, 0xD2 },
|
||||
{ codepoint_flags::PUNCTUATION, 0xD3 },
|
||||
{ unicode_cpt_flags::NUMBER, 0xD1 },
|
||||
{ unicode_cpt_flags::LETTER, 0xD2 },
|
||||
{ unicode_cpt_flags::PUNCTUATION, 0xD3 },
|
||||
};
|
||||
|
||||
static const std::map<int, std::string> k_ucat_map = {
|
||||
{ codepoint_flags::NUMBER, "\x30-\x39" }, // 0-9
|
||||
{ codepoint_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
|
||||
{ codepoint_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
|
||||
{ unicode_cpt_flags::NUMBER, "\x30-\x39" }, // 0-9
|
||||
{ unicode_cpt_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
|
||||
{ unicode_cpt_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
|
||||
};
|
||||
|
||||
// compute collapsed codepoints only if needed by at least one regex
|
||||
bool need_collapse = false;
|
||||
for (auto & regex_expr : regex_exprs) {
|
||||
for (const auto & regex_expr : regex_exprs) {
|
||||
// search for unicode categories
|
||||
for (const auto & ucat : k_ucat_enum) {
|
||||
if (std::string::npos != regex_expr.find(ucat.first)) {
|
||||
@ -709,7 +709,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||
continue;
|
||||
}
|
||||
|
||||
const auto flags = unicode_cpt_flags(cpts[i]);
|
||||
const auto flags = unicode_cpt_flags_from_cpt(cpts[i]);
|
||||
|
||||
if (flags.is_whitespace) {
|
||||
//NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does.
|
||||
@ -725,7 +725,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||
|
||||
std::vector<size_t> bpe_offsets = { cpts.size() };
|
||||
|
||||
for (auto & regex_expr : regex_exprs) {
|
||||
for (const auto & regex_expr : regex_exprs) {
|
||||
// first, see if we have an efficient custom regex implementation
|
||||
auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);
|
||||
|
||||
@ -739,7 +739,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||
// if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
|
||||
// with the corresponding collapsed representation
|
||||
bool use_collapsed = false;
|
||||
for (auto & ucat : k_ucat_enum) {
|
||||
for (const auto & ucat : k_ucat_enum) {
|
||||
if (std::string::npos != regex_expr.find(ucat.first)) {
|
||||
use_collapsed = true;
|
||||
break;
|
||||
@ -805,7 +805,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||
// std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback
|
||||
std::wstring wtext(cpts.begin(), cpts.end());
|
||||
for (size_t i = 0; i < wtext.size(); ++i) {
|
||||
if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) {
|
||||
if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt(wtext[i]).is_whitespace) {
|
||||
wtext[i] = 0x0B;
|
||||
}
|
||||
}
|
||||
|
@ -4,9 +4,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// TODO: prefix all symbols with "llama_"
|
||||
|
||||
struct codepoint_flags {
|
||||
struct unicode_cpt_flags {
|
||||
enum {
|
||||
UNDEFINED = 0x0001,
|
||||
NUMBER = 0x0002, // regex: \p{N}
|
||||
@ -35,7 +33,7 @@ struct codepoint_flags {
|
||||
uint16_t is_nfd : 1;
|
||||
|
||||
// decode from uint16
|
||||
inline codepoint_flags(const uint16_t flags=0) {
|
||||
inline unicode_cpt_flags(const uint16_t flags = 0) {
|
||||
*reinterpret_cast<uint16_t*>(this) = flags;
|
||||
}
|
||||
|
||||
@ -50,18 +48,19 @@ struct codepoint_flags {
|
||||
|
||||
size_t unicode_len_utf8(char src);
|
||||
|
||||
std::string unicode_cpt_to_utf8(uint32_t cp);
|
||||
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
|
||||
std::string unicode_cpt_to_utf8 (uint32_t cpt);
|
||||
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
|
||||
|
||||
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
|
||||
|
||||
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
|
||||
|
||||
codepoint_flags unicode_cpt_flags(const uint32_t cp);
|
||||
codepoint_flags unicode_cpt_flags(const std::string & utf8);
|
||||
unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
|
||||
unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);
|
||||
|
||||
std::string unicode_byte_to_utf8(uint8_t byte);
|
||||
uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
||||
uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
||||
|
||||
uint32_t unicode_tolower(uint32_t cp);
|
||||
uint32_t unicode_tolower(uint32_t cpt);
|
||||
|
||||
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
|
||||
|
Loading…
Reference in New Issue
Block a user