talk-llama : sync llama.cpp (#3084)

ggml-ci
Georgi Gerganov
2025-04-28 16:40:23 +03:00
committed by GitHub
parent 28dcdff4c5
commit f3c42399a3
36 changed files with 16940 additions and 12400 deletions

@@ -16,6 +16,7 @@
#include <queue>
#include <set>
#include <unordered_map>
#include <cctype>

//
// helpers
@@ -341,6 +342,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_MPT:
case LLAMA_VOCAB_PRE_TYPE_OLMO:
case LLAMA_VOCAB_PRE_TYPE_JAIS:
case LLAMA_VOCAB_PRE_TYPE_TRILLION:
regex_exprs = {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
};
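The TRILLION pre-tokenizer added above reuses the GPT-2-style split pattern already shared by MPT, OLMO and JAIS. As a rough illustration of what that pattern does, here is a self-contained sketch that uses an ASCII-only stand-in for the \p{L}/\p{N} classes, since std::regex has no Unicode property support (llama.cpp applies the real pattern through its own Unicode-aware regex splitter):

    #include <iostream>
    #include <regex>
    #include <string>

    int main() {
        // ASCII approximation of the GPT-2-style pre-tokenization regex listed above.
        const std::regex re("'s|'t|'re|'ve|'m|'ll|'d| ?[A-Za-z]+| ?[0-9]+| ?[^\\sA-Za-z0-9]+|\\s+(?!\\S)");
        const std::string text = "The 3 models aren't identical.";
        for (auto it = std::sregex_iterator(text.begin(), text.end(), re); it != std::sregex_iterator(); ++it) {
            std::cout << "[" << it->str() << "]";  // [The][ 3][ models][ aren]['t][ identical][.]
        }
        std::cout << "\n";
    }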
@@ -392,6 +394,27 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
};
break;
case LLAMA_VOCAB_PRE_TYPE_GPT4O:
regex_exprs = {
// original regex from tokenizer.json
// "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
regex_exprs = {
"\\p{N}+",
"(?=(\\d{3})+(?!\\d))",
};
break;
case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE:
regex_exprs = {
// original regex from tokenizer.json
// "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
// FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
"'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
};
break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
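A note on the SUPERBPE rules in this hunk: the first expression (\p{N}+) isolates digit runs, and the second is a zero-width lookahead expected to fire at every position that is followed by a multiple-of-three run of digits, cutting long numbers into groups of three from the right (assuming the splitter treats zero-width matches as split points). A tiny stand-alone check of where that lookahead matches, marking each match position:

    #include <iostream>
    #include <regex>
    #include <string>

    int main() {
        // The zero-width lookahead from the SUPERBPE rule above: it matches wherever
        // the digits that follow form a run whose length is a multiple of three.
        const std::regex boundary("(?=(\\d{3})+(?!\\d))");
        // Inserting a marker at every match shows the intended grouping boundaries.
        std::cout << std::regex_replace(std::string("1234567"), boundary, "|") << "\n";  // prints 1|234|567
    }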
@@ -1483,7 +1506,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "llama3" ||
tokenizer_pre == "llama-v3" ||
tokenizer_pre == "llama-bpe"||
tokenizer_pre == "falcon3") {
tokenizer_pre == "falcon3" ||
tokenizer_pre == "pixtral") {
pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
ignore_merges = true;
add_bos = true;
@@ -1549,6 +1573,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
clean_spaces = false;
} else if (
tokenizer_pre == "glm4" ||
tokenizer_pre == "chatglm-bpe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
special_bos_id = LLAMA_TOKEN_NULL;
@@ -1592,6 +1617,23 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
} else if (
tokenizer_pre == "megrez") {
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else if (
tokenizer_pre == "gpt-4o" ||
tokenizer_pre == "llama4") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
clean_spaces = false;
} else if (
tokenizer_pre == "superbpe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
clean_spaces = false;
} else if (
tokenizer_pre == "trillion") {
pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
clean_spaces = false;
} else if (
tokenizer_pre == "bailingmoe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
clean_spaces = false;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
@@ -1769,6 +1811,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<end_of_turn>"
|| t.first == "<|endoftext|>"
|| t.first == "<EOT>"
|| t.first == "_<EOT>"
|| t.first == "<end▁of▁sentence>" // DeepSeek
) {
special_eot_id = t.second;
@@ -1799,8 +1842,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (false
|| t.first == "<|fim_prefix|>" // Qwen
|| t.first == "<fim-prefix>"
|| t.first == "<fim_prefix>" // Granite
|| t.first == "<fim▁begin>" // DeepSeek
|| t.first == "<PRE>"
|| t.first == "▁<PRE>" // CodeLlama
) {
special_fim_pre_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1816,8 +1861,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (false
|| t.first == "<|fim_suffix|>" // Qwen
|| t.first == "<fim-suffix>"
|| t.first == "<fim_suffix>" // Granite
|| t.first == "<fim▁hole>" // DeepSeek
|| t.first == "<SUF>"
|| t.first == "▁<SUF>" // CodeLlama
) {
special_fim_suf_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1833,8 +1880,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (false
|| t.first == "<|fim_middle|>" // Qwen
|| t.first == "<fim-middle>"
|| t.first == "<fim_middle>" // Granite
|| t.first == "<fim▁end>" // DeepSeek
|| t.first == "<MID>"
|| t.first == "▁<MID>" // CodeLlama
) {
special_fim_mid_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1850,6 +1899,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (false
|| t.first == "<|fim_pad|>" // Qwen
|| t.first == "<fim-pad>"
|| t.first == "<fim_pad>" // Granite
|| t.first == "<PAD>"
) {
special_fim_pad_id = t.second;
@@ -1868,6 +1918,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|repo_name|>"
|| t.first == "<fim-repo>"
|| t.first == "<REPO>"
|| t.first == "<reponame>" // Granite
) {
special_fim_rep_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1919,6 +1970,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|endoftext|>"
|| t.first == "<|eom_id|>"
|| t.first == "<EOT>"
|| t.first == "_<EOT>"
) {
special_eog_ids.insert(t.second);
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2177,14 +2229,12 @@ void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer
// find the first occurrence of a given special token in this fragment
// passing offset argument only limit the "search area" but match coordinates
// are still relative to the source full raw_text
auto match = raw_text.find(text, raw_text_base_offset);
// string_view begins at pos 0 for the same reason
auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(text, raw_text_base_offset);
// no occurrences found, stop processing this fragment for a given special token
if (match == std::string::npos) break;
// check if match is within bounds of offset <-> length
if (match + text.length() > raw_text_base_offset + raw_text_base_length) break;
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
#endif
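The bounded search introduced in this last hunk constrains the haystack itself: the string_view still starts at position 0 of raw_text, so the returned position stays relative to the full string, but its length is capped at raw_text_base_offset + raw_text_base_length, so a match can never extend past the fragment. A self-contained sketch with hypothetical values (not llama.cpp data) showing both properties:

    #include <cassert>
    #include <string>
    #include <string_view>

    int main() {
        const std::string raw_text = "abc <EOT> tail <EOT>";   // full source text
        const std::string text     = "<EOT>";                  // special token to find
        const size_t raw_text_base_offset = 4;                 // fragment covers raw_text[4..9)
        const size_t raw_text_base_length = 5;

        // View starts at 0 (positions stay relative to raw_text) but is capped at
        // offset + length, so the second "<EOT>" outside the fragment is not found.
        const auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length)
                               .find(text, raw_text_base_offset);
        assert(match == 4);                                                              // in-fragment occurrence
        assert(raw_text.find(text, raw_text_base_offset + raw_text_base_length) == 15);  // the one outside
        (void)match;
        return 0;
    }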