#include "llama-vocab.h"

#include "llama-impl.h"
#include "llama-model-loader.h"

#include "unicode.h"

#include <algorithm>
#include <cassert>
#include <cfloat>
#include <climits>
#include <cstdarg>
#include <cstring>
#include <forward_list>
#include <map>
#include <queue>
#include <set>
#include <unordered_map>
//
// helpers
//

struct naive_trie {
    naive_trie() : has_value(false), value(0) {
    }
    void insert(const char * key, size_t len, int32_t value = 0) {
        if (len == 0) {
            this->has_value = true;
            this->value = value;
            return;
        }
        char c = key[0];
        auto res = children.find(c);
        if (res != children.end()) {
            res->second.insert(key + 1, len - 1, value);
        } else {
            auto res = children.insert(std::make_pair(c, naive_trie()));
            res.first->second.insert(key + 1, len - 1, value);
        }
    }
    std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
        if (len == 0 || offset == len) {
            return std::make_pair(key, offset);
        }
        char c = key[offset];
        auto res = children.find(c);
        if (res != children.end()) {
            return res->second.get_longest_prefix(key, len, offset + 1);
        }

        return std::make_pair(key, offset);
    }
    const struct naive_trie * traverse(const char c) const {
        auto res = children.find(c);
        if (res != children.end()) {
            return &res->second;
        }

        return NULL;
    }
    std::map<char, struct naive_trie> children;
    bool has_value;
    llama_token value;
};
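
// Illustrative usage of naive_trie (a sketch, not called anywhere as-is):
// keys are inserted byte by byte, and get_longest_prefix() reports how far a
// query can be matched against the stored paths.
//
//     naive_trie trie;
//     trie.insert("foo",    3, /*token id*/ 40);
//     trie.insert("foobar", 6, /*token id*/ 41);
//     auto m = trie.get_longest_prefix("food", 4); // m.second == 3 ("foo")
//     const naive_trie * n = trie.traverse('f');   // step one byte at a time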
//
// tokenizers
//

struct llm_tokenizer {
    llm_tokenizer() {}
    virtual ~llm_tokenizer() = default;
};

struct llm_symbol {
    using index = int;
    index prev;
    index next;
    const char * text;
    size_t n;
};

static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");
//
// SPM tokenizer
// original implementation:
// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
//

struct llm_bigram_spm {
    struct comparator {
        bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) {
            return (l.score < r.score) || (l.score == r.score && l.left > r.left);
        }
    };
    using queue_storage = std::vector<llm_bigram_spm>;
    using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
    llm_symbol::index left;
    llm_symbol::index right;
    float score;
    size_t size;
};
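
// Note: std::priority_queue is a max-heap, so with this less-than comparator
// the queue pops the highest-scoring bigram first; on equal scores the leftmost
// pair wins, which mirrors SentencePiece's tie-breaking during merges.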

struct llm_tokenizer_spm : llm_tokenizer {
    llm_tokenizer_spm(const llama_vocab & /*vocab*/) {}
};
struct llm_tokenizer_spm_session {
    llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}

    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        // split string into utf8 chars
        int index = 0;
        size_t offs = 0;
        while (offs < text.size()) {
            llm_symbol sym;
            size_t len = unicode_len_utf8(text[offs]);
            sym.text = text.c_str() + offs;
            sym.n = std::min(len, text.size() - offs);
            offs += sym.n;
            sym.prev = index - 1;
            sym.next = offs == text.size() ? -1 : index + 1;
            index++;
            symbols.emplace_back(sym);
        }

        // seed the work queue with all possible 2-character tokens.
        for (int i = 1; i < (int) symbols.size(); ++i) {
            try_add_bigram(i - 1, i);
        }

        // keep substituting the highest frequency pairs for as long as we can.
        while (!work_queue.empty()) {
            auto bigram = work_queue.top();
            work_queue.pop();

            auto & left_sym = symbols[bigram.left];
            auto & right_sym = symbols[bigram.right];

            // if one of the symbols already got merged, skip it.
            if (left_sym.n == 0 || right_sym.n == 0 ||
                left_sym.n + right_sym.n != bigram.size) {
                continue;
            }

            // merge the right sym into the left one
            left_sym.n += right_sym.n;
            right_sym.n = 0;

            //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);

            // remove the right sym from the chain
            left_sym.next = right_sym.next;
            if (right_sym.next >= 0) {
                symbols[right_sym.next].prev = bigram.left;
            }

            // find more substitutions
            try_add_bigram(left_sym.prev, bigram.left);
            try_add_bigram(bigram.left, left_sym.next);
        }

        for (int i = 0; i != -1; i = symbols[i].next) {
            auto & symbol = symbols[i];
            resegment(symbol, output);
        }
    }

private:
    void resegment(llm_symbol & symbol, std::vector<llama_token> & output) {
        auto text = std::string(symbol.text, symbol.n);
        auto token = vocab.text_to_token(text);

        // Do we need to support is_unused?
        if (token != LLAMA_TOKEN_NULL) {
            output.push_back(token);
            return;
        }

        const auto p = rev_merge.find(text);

        if (p == rev_merge.end()) {
            // output any symbols that did not form tokens as bytes.
            output.reserve(output.size() + symbol.n);
            for (int j = 0; j < (int) symbol.n; ++j) {
                llama_token id = vocab.byte_to_token(symbol.text[j]);
                output.push_back(id);
            }
            return;
        }

        resegment(symbols[p->second.first],  output);
        resegment(symbols[p->second.second], output);
    }

    void try_add_bigram(int left, int right) {
        if (left == -1 || right == -1) {
            return;
        }
        const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
        auto token = vocab.text_to_token(text);

        if (token == LLAMA_TOKEN_NULL) {
            return;
        }

        if (static_cast<uint32_t>(token) >= vocab.n_tokens()) {
            return;
        }

        const auto & tok_data = vocab.get_token_data(token);

        llm_bigram_spm bigram;
        bigram.left  = left;
        bigram.right = right;
        bigram.score = tok_data.score;
        bigram.size  = text.size();

        work_queue.push(bigram);

        // Do we need to support is_unused?
        rev_merge[text] = std::make_pair(left, right);
    }

    const llama_vocab & vocab;
    // currently unused
    // const llm_tokenizer_spm * spm_tokenizer;

    std::vector<llm_symbol> symbols;
    llm_bigram_spm::queue work_queue;
    std::map<std::string, std::pair<int, int>> rev_merge;
};
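
// Usage sketch (assuming a loaded SPM-style llama_vocab named `vocab`):
//
//     llm_tokenizer_spm_session session(vocab);
//     std::vector<llama_token> out;
//     session.tokenize("Hello world", out);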
//
// BPE tokenizer
// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
// tried to simplify unicode stuff, so most likely does not work 100% correctly!
//

// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused

template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>>
class llama_priority_queue : public std::priority_queue<T, Container, Compare> {
public:
    using std::priority_queue<T, Container, Compare>::priority_queue;

    T pop_move() {
        T item = std::move(this->c.front());
        std::pop_heap(this->c.begin(), this->c.end(), this->comp);
        this->c.pop_back();
        return item;
    }

    void pop() = delete;
};
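
// Design note: std::priority_queue::top() only exposes a const reference, so the
// usual top()/pop() pattern copies the element - costly here because llm_bigram_bpe
// carries a std::string. pop_move() moves the root out of the underlying container
// before re-heapifying, and the base-class pop() is deleted so callers cannot fall
// back to the copying path by accident.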
2024-08-08 14:16:50 +03:00
struct llm_bigram_bpe {
struct comparator {
bool operator ( ) ( const llm_bigram_bpe & l , const llm_bigram_bpe & r ) const {
return l . rank > r . rank | | ( l . rank = = r . rank & & l . left > r . left ) ;
}
} ;
using queue_storage = std : : vector < llm_bigram_bpe > ;
2024-08-28 11:04:02 +03:00
using queue = llama_priority_queue < llm_bigram_bpe , queue_storage , comparator > ;
2024-08-08 14:16:50 +03:00
llm_symbol : : index left ;
llm_symbol : : index right ;
std : : string text ;
int rank ;
size_t size ;
} ;

struct llm_tokenizer_bpe : llm_tokenizer {
    llm_tokenizer_bpe(const llama_vocab & vocab) {
        GGML_ASSERT(vocab.get_type() == LLAMA_VOCAB_TYPE_BPE);
        switch (vocab.get_pre_type()) {
            case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
                regex_exprs = {
                    // original regex from tokenizer.json
                    //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",

                    // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DBRX:
            case LLAMA_VOCAB_PRE_TYPE_SMAUG:
                regex_exprs = {
                    // same as llama3
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                regex_exprs = {
                    "[\r\n]",
                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿﬀ-ﬆﬓ-ﬗＡ-Ｚａ-ｚ𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
                    "\\s?[!-/:-~！-／：-～‘-‟　-。]+",
                    "\\s+$",
                    "[一-龥ࠀ-一가-퟿]+",
                    "\\p{N}+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
                regex_exprs = {
                    "\\p{N}{1,3}",
                    "[一-龥぀-ゟ゠-ヿ]+",
                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
                regex_exprs = {
                    "[\r\n]",
                    "\\s?\\p{L}+",
                    "\\s?\\p{P}+",
                    "[一-龥ࠀ-一가-퟿]+",
                    "\\p{N}",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_FALCON:
                regex_exprs = {
                    "[\\p{P}\\$\\+<=>\\^~\\|`]+",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                    "[0-9][0-9][0-9]",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_STARCODER:
            case LLAMA_VOCAB_PRE_TYPE_REFACT:
            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
            case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
            case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
            case LLAMA_VOCAB_PRE_TYPE_EXAONE:
            case LLAMA_VOCAB_PRE_TYPE_MINERVA:
                regex_exprs = {
                    "\\p{N}",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GPT2:
            case LLAMA_VOCAB_PRE_TYPE_MPT:
            case LLAMA_VOCAB_PRE_TYPE_OLMO:
            case LLAMA_VOCAB_PRE_TYPE_JAIS:
                regex_exprs = {
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
            case LLAMA_VOCAB_PRE_TYPE_QWEN2:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_PORO:
            case LLAMA_VOCAB_PRE_TYPE_BLOOM:
            case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
                regex_exprs = {
                    " ?[^(\\s|.,!?…。，、।۔،)]+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_CHATGLM4:
                regex_exprs = {
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_VIKING:
                regex_exprs = {
                    " ?[^(\\s|.,!?…。，、।۔،)]+",
                    "\\p{N}",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
                // original regex from tokenizer.json
                // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                regex_exprs = {
                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_CHAMELEON:
                // Note: in theory, the special token (sentinel and image token) regex_exprs below
                // are unnecessary, as they are split in `tokenizer_st_partition` anyway.
                // However, since the upstream pre-tokenizer uses them, they are also
                // included here (see https://huggingface.co/facebook/chameleon-7b).
                regex_exprs = {
                    "<sentinel:[0-9]+>",  // Sentinel tokens
                    "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z",  // Image tokens
                    "([\\t\\n]|    |  )",  // directly from tokenizer.json
                    "\\p{N}", // Individual digits
                    "[\\p{P}!-/:-@\\[-`{-~]",  // Punctuation, Isolated
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                    "\\p{N}+",
                    "[0-9][0-9][0-9]",
                };
                break;
        }
    }

    std::vector<std::string> regex_exprs;
};
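
// The expressions above are applied in order by unicode_regex_split() inside
// llm_tokenizer_bpe_session::tokenize() below: each pattern further splits the
// fragments produced by the previous one, approximating the pre-tokenizer of
// the original model.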
struct llm_tokenizer_bpe_session {
    llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}

    static void append(const llama_token token_id, std::vector<llama_token> & output) {
        output.push_back(token_id);
    }

    bool append_bos(std::vector<llama_token> & output) const {
        if (vocab.get_add_bos()) {
            GGML_ASSERT(vocab.token_bos() != LLAMA_TOKEN_NULL);
            output.push_back(vocab.token_bos());
            return true;
        }
        return false;
    }

    bool append_eos(std::vector<llama_token> & output) const {
        if (vocab.get_add_eos()) {
            GGML_ASSERT(vocab.token_eos() != LLAMA_TOKEN_NULL);
            output.push_back(vocab.token_eos());
            return true;
        }
        return false;
    }

    void check_double_bos_eos(const std::vector<llama_token> & output) const {
        if (vocab.get_add_bos() && output.size() >= 2 && output[1] == vocab.token_bos()) {
            LLAMA_LOG_WARN(
                "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                "Are you sure this is what you want?\n", __FUNCTION__);
        }
        if (vocab.get_add_eos() && output.size() >= 2 && *(output.end() - 2) == vocab.token_eos()) {
            LLAMA_LOG_WARN(
                "%s: Added an EOS token to the prompt as specified by the model but the prompt "
                "also ends with an EOS token. So now the final prompt ends with 2 EOS tokens. "
                "Are you sure this is what you want?\n", __FUNCTION__);
        }
    }
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        int final_prev_index = -1;
        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);

        symbols_final.clear();

        for (const auto & word : word_collection) {
            work_queue = llm_bigram_bpe::queue();
            symbols.clear();

            int index = 0;
            size_t offset = 0;

            //if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
            if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
                offset = word.size();
            }

            while (offset < word.size()) {
                llm_symbol sym;
                size_t char_len = std::min(word.size() - offset, (size_t) unicode_len_utf8(word[offset]));
                sym.text = word.c_str() + offset;
                sym.n = char_len;
                offset += sym.n;
                sym.prev = index - 1;
                sym.next = offset == word.size() ? -1 : index + 1;
                index++;
                symbols.emplace_back(sym);
            }
            for (int i = 1; i < (int) symbols.size(); ++i) {
                add_new_bigram(i - 1, i);
            }

            // build token(s)
            while (!work_queue.empty()) {
                auto bigram = work_queue.pop_move();

                auto & left_symbol = symbols[bigram.left];
                auto & right_symbol = symbols[bigram.right];

                if (left_symbol.n == 0 || right_symbol.n == 0) {
                    continue;
                }
                std::string left_token = std::string(left_symbol.text, left_symbol.n);
                std::string right_token = std::string(right_symbol.text, right_symbol.n);
                if (left_token + right_token != bigram.text) {
                    continue;  // Skip this bigram if it's outdated
                }

                // merge the right sym into the left one
                left_symbol.n += right_symbol.n;
                right_symbol.n = 0;

                // remove the right sym from the chain
                left_symbol.next = right_symbol.next;
                if (right_symbol.next >= 0) {
                    symbols[right_symbol.next].prev = bigram.left;
                }

                add_new_bigram(left_symbol.prev, bigram.left);  // left side of current symbol
                add_new_bigram(bigram.left, left_symbol.next);  // right side of current symbol
            }

            // add the finished tokens to the final list keeping correct order for next and prev
            for (auto & sym : symbols) {
                if (sym.n > 0) {
                    sym.prev = final_prev_index;
                    sym.next = -1;
                    if (final_prev_index != -1) {
                        symbols_final[final_prev_index].next = symbols_final.size();
                    }
                    symbols_final.emplace_back(sym);
                    final_prev_index = symbols_final.size() - 1;
                }
            }
        }

        symbols = symbols_final;

        if (!symbols.empty()) {
            for (int i = 0; i != -1; i = symbols[i].next) {
                auto & symbol = symbols[i];
                if (symbol.n == 0) {
                    continue;
                }

                const std::string str = std::string(symbol.text, symbol.n);
                const auto token = vocab.text_to_token(str);

                if (token == LLAMA_TOKEN_NULL) {
                    for (auto j = str.begin(); j != str.end(); ++j) {
                        std::string byte_str(1, *j);
                        auto token_multibyte = vocab.text_to_token(byte_str);
                        if (token_multibyte != LLAMA_TOKEN_NULL) {
                            output.push_back(token_multibyte);
                        }
                    }
                } else {
                    output.push_back(token);
                }
            }
        }
    }

private:
    void add_new_bigram(int left, int right) {
        if (left == -1 || right == -1) {
            return;
        }
        std::string left_token  = std::string(symbols[left].text,  symbols[left].n);
        std::string right_token = std::string(symbols[right].text, symbols[right].n);

        int rank_found = -1;

        rank_found = vocab.find_bpe_rank(left_token, right_token);

        if (rank_found < 0) {
            return;
        }

        llm_bigram_bpe bigram;

        bigram.left  = left;
        bigram.right = right;
        bigram.text  = left_token + right_token;
        bigram.size  = left_token.size() + right_token.size();
        bigram.rank  = rank_found;

        work_queue.push(bigram);
    }

    const llama_vocab & vocab;
    const llm_tokenizer_bpe & tokenizer;

    std::vector<llm_symbol> symbols;
    std::vector<llm_symbol> symbols_final;

    llm_bigram_bpe::queue work_queue;
};
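
// Usage sketch (assuming a loaded BPE-style llama_vocab named `vocab`):
//
//     llm_tokenizer_bpe tokenizer(vocab);
//     llm_tokenizer_bpe_session session(vocab, tokenizer);
//     std::vector<llama_token> out;
//     session.append_bos(out);
//     session.tokenize("Hello world", out);
//     session.check_double_bos_eos(out);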
//
// WPM tokenizer
//

struct llm_tokenizer_wpm : llm_tokenizer {
    llm_tokenizer_wpm(const llama_vocab & /*vocab*/) {}
};

struct llm_tokenizer_wpm_session {
    llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        // normalize and split by whitespace
        std::vector<std::string> words = preprocess(text);
        // bos token prepended already

        // find the longest tokens that form the words
        for (const std::string & word : words) {
            // skip empty words
            if (word.size() == 0) {
                continue;
            }

            // prepend phantom space
            const std::string word1 = "\xe2\x96\x81" + word;
            const int n = word1.size();

            const size_t current_tokens = output.size();

            // we're at the start of a new word
            // move through character position in word
            for (int i = 0; i < n; ++i) {
                // loop through possible match length
                bool match = false;
                for (int j = std::min(n, i + vocab.max_token_len() + 1); j > i; j--) {
                    auto id = vocab.text_to_token(word1.substr(i, j - i));
                    if (id != LLAMA_TOKEN_NULL) {
                        output.push_back(id);
                        match = true;
                        i = j - 1;
                        break;
                    }
                }

                if (!match) { // discard all
                    output.resize(current_tokens);
                    break;  // and discard next tokens
                }
            }

            // we didn't find any matches for this word
            if (current_tokens == output.size()) {
                output.push_back(vocab.token_unk());
            }
        }
    }

    // TODO: reduce string copies by using cpts_offs array
    static std::vector<std::string> preprocess(const std::string & text) {
        const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
        std::vector<std::string> words(1, "");

        for (const uint32_t cpt : cpts_nfd) {
            const auto flags = unicode_cpt_flags_from_cpt(cpt);

            if (flags.is_whitespace) {
                if (words.back().size()) {  // finish previous word if any
                    words.emplace_back();
                }
                continue;
            }

            assert(!flags.is_separator);
            if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
                continue;
            }

            const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
            if (flags.is_punctuation || (cpt < 0x7F && flags.is_symbol) || is_chinese_char(cpt)) {
                if (words.back().size()) {  // finish previous word if any
                    words.emplace_back();
                }
                words.back() = s;       // single char word
                words.emplace_back();   // start a new word
            } else {
                words.back() += s;  // append char to word
            }
        }

        if (!words.back().size()) {
            words.pop_back();
        }

        return words;
    }

    static bool is_chinese_char(uint32_t cpt) {
        return
            (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
            (cpt >= 0x03400 && cpt <= 0x04DBF) ||
            (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
            (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
            (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
            (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
            (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
            (cpt >= 0x2F800 && cpt <= 0x2FA1F);
            //(cpt >= 0x3000  && cpt <= 0x303F)  ||
            //(cpt >= 0xFF00  && cpt <= 0xFFEF);
    }

private:
    const llama_vocab & vocab;
    // currently unused
    // const llm_tokenizer_wpm * wpm_tokenizer;
};
//
// UGM tokenizer
//

struct llm_tokenizer_ugm : llm_tokenizer {
    llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) {
        if (precompiled_charsmap.size() > 0) {
            size_t charsmap_offset = 0;

            // First four bytes of precompiled_charsmap contains length of binary
            // blob containing XOR-compressed compact double array (XCDA) entries
            uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
            charsmap_offset += sizeof(xcda_blob_size);
            if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
            }

            // Next xcda_blob_size bytes contain entries of XOR-compressed compact
            // double array (XCDA). Each entry is bit-packed into a 32-bit integer.
            xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset];
            xcda_array_size = xcda_blob_size / sizeof(uint32_t);
            charsmap_offset += xcda_blob_size;

            // Remaining bytes of precompiled charsmap contain null-terminated
            // replacement strings for prefixes matched by the XCDA.
            prefix_replacements = &precompiled_charsmap[charsmap_offset];
            prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset;
        }

        for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
            const auto & token_data = vocab.get_token_data(id);

            if (vocab.is_normal(id)) {
                min_score = std::min<float>(min_score, token_data.score);
                max_score = std::max<float>(max_score, token_data.score);
            }

            if (vocab.is_normal(id) ||
                vocab.is_user_defined(id) ||
                vocab.is_unused(id)) {
                token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
            }

            if (vocab.is_user_defined(id)) {
                user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
            }
        }

        unknown_token_score = min_score - unknown_token_score_penalty;
    }

    // escaped space symbol - U+2581 (Lower One Eighth Block)
    const std::string escaped_space = "\xE2\x96\x81";

    const char * prefix_replacements = NULL;
    size_t prefix_replacements_size = 0;

    const uint32_t * xcda_array = NULL;
    size_t xcda_array_size = 0;

    struct naive_trie user_defined_token_matcher;

    float min_score = FLT_MAX;
    float max_score = -FLT_MAX;

    float unknown_token_score_penalty = 10.0;
    float unknown_token_score;

    struct naive_trie token_matcher;
};
struct llm_tokenizer_ugm_session {
    llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}

    /* This implementation is based on SentencePiece optimized Viterbi algorithm for
     * unigram language models. The general idea is to:
     * - move along the input sequence in steps of one UTF code point,
     * - at each step find all possible tokenizations of the prefix by
     *   traversing the tokens trie,
     * - for each tokenization store the best one so far (by higher score)
     * - use the position in sequence after given token as an index to store
     *   results
     * - if there was no valid tokenization of the current UTF code point
     *   then use unknown token with additional score penalty
     * After processing the whole sequence we backtrack from the end to get
     * the best tokenization.
    */
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        // get current size of output (for reversal later)
        size_t output_size = output.size();

        // normalize the input first
        std::string normalized;
        normalize(text, &normalized);
        size_t input_len = normalized.size();
        if (input_len == 0) {
            return;
        }

        // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
        // at the beginning tokenization score is zero
        tokenization_results[0] = { vocab.token_unk(), 0, 0 };

        for (size_t input_offset = 0; input_offset < input_len;) {
            size_t prefix_offset = input_offset;
            // calculate how many code units are in the currently processed UTF code point
            size_t n_utf8_code_units = std::min<size_t>(unicode_len_utf8(normalized[input_offset]), input_len - input_offset);

            // traverse the token matcher trie to find a matching token
            bool single_codepoint_token_found = false;
            const struct best_tokenization & current_best = tokenization_results[input_offset];
            const struct naive_trie * node = tokenizer.token_matcher.traverse(normalized[prefix_offset++]);

            while (prefix_offset <= input_len && node != NULL) {
                // check if we found valid token in prefix
                if (node->has_value) {
                    // check if it corresponds to the whole UTF code point
                    if (prefix_offset - input_offset == n_utf8_code_units) {
                        single_codepoint_token_found = true;
                    }
                    llama_token token_id = node->value;
                    const auto & token_data = vocab.get_token_data(token_id);

                    // we set the user-defined token scores to 0 to make them more likely to be selected
                    // (normal token scores are log probabilities, so they are negative)
                    // score type is double here to make tokenization results exactly
                    // the same as in the HF tokenizer using SentencePiece
                    const double token_score = vocab.is_user_defined(token_id) ? 0.0 : token_data.score;
                    const double challenger_score = current_best.score_sum + token_score;
                    struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                    if (challenger_score > current_champ.score_sum) {
                        struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
                        current_champ = challenger;
                    }
                }
                node = node->traverse(normalized[prefix_offset++]);
            }

            // if we didn't find a valid token corresponding to the whole UTF code point
            // then use unknown token as the tokenization of this UTF code point
            if (!single_codepoint_token_found) {
                const double challenger_score = current_best.score_sum + tokenizer.unknown_token_score;
                prefix_offset = input_offset + n_utf8_code_units;
                struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                if (challenger_score > current_champ.score_sum) {
                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
                    current_champ = challenger;
                }
            }

            // move to the next UTF code point
            input_offset += n_utf8_code_units;
        }

        // now backtrack from the end to gather token ids of the best tokenization
        // merge sequences of consecutive unknown tokens into single unknown tokens
        bool is_prev_unknown = false;
        for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
            bool is_unknown = tokenization.token_id == vocab.token_unk();
            if (!(is_prev_unknown && is_unknown)) {
                output.push_back(tokenization.token_id);
            }
            if (tokenization.input_offset == 0) {
                break;
            }
            is_prev_unknown = is_unknown;
        }

        // reverse the output since we added tokens starting from the end of the input
        std::reverse(output.begin() + output_size, output.end());
    }
private:
    // helper structure for returning normalization results
    struct normalization_result {
        const char * normalized;
        size_t normalized_len;
        size_t consumed_input;
    };

    void normalize(const std::string & input, std::string * normalized) {
        normalized->clear();
        normalized->reserve(input.size() * 3);

        const std::string space = vocab.get_escape_whitespaces() ? tokenizer.escaped_space : " ";

        const bool shall_prepend_space = !vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
        const bool shall_append_space  =  vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
        const bool shall_merge_spaces  =  vocab.get_remove_extra_whitespaces();

        bool is_space_prepended = false;
        bool processing_non_ws = false;

        size_t input_len = input.size();

        for (size_t input_offset = 0; input_offset < input_len; ) {
            auto norm_res = normalize_prefix(input, input_offset);
            for (size_t i = 0; i < norm_res.normalized_len; i++) {
                char c = norm_res.normalized[i];
                if (c != ' ') {
                    if (!processing_non_ws) {
                        processing_non_ws = true;
                        if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) {
                            normalized->append(space);
                            is_space_prepended = true;
                        }
                    }
                    normalized->push_back(c);
                } else {
                    if (processing_non_ws) {
                        processing_non_ws = false;
                    }
                    if (!shall_merge_spaces) {
                        normalized->append(space);
                    }
                }
            }

            input_offset += norm_res.consumed_input;
        }

        if (shall_append_space) {
            normalized->append(space);
        }
    }
    /*
     * This structure is a view wrapper for XOR-compressed double array (XCDA)
     * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
     * Each bit-packed entry contains:
     * - BASE array value in bits 10-30
     * - LCHECK array value in bits 0-7
     * - LEAF array value in bit 9
     * Entries containing indexes of replacement sequences have set bit 31
     */
    struct xcda_array_view {
    public:
        xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) {
        }
        uint32_t get_base(size_t index) {
            uint32_t packed_node = get_node(index);
            return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6);
        }
        uint32_t get_lcheck(size_t index) {
            uint32_t packed_node = get_node(index);
            return packed_node & ((1U << 31) | 0xff);
        }
        bool get_leaf(size_t index) {
            uint32_t packed_node = get_node(index);
            return (packed_node >> 8) & 1;
        }
        uint32_t get_value(size_t index) {
            uint32_t packed_node = get_node(index);
            return packed_node & ((1U << 31) - 1);
        }
    private:
        uint32_t get_node(size_t index) {
            if (index > xcda_array_size) {
                throw std::runtime_error("Index out of array bounds in XCDA array!");
            }
            return xcda_array[index];
        }
        const uint32_t * xcda_array;
        size_t xcda_array_size;
    };
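
    // Worked example of the unpacking above (values chosen for illustration):
    // for packed_node = 0x00000941 the fields decode as
    //   LCHECK = packed_node & 0xff     = 0x41 ('A')
    //   LEAF   = (packed_node >> 8) & 1 = 1
    //   BASE   = packed_node >> 10      = 2 (bit 9 clear, so no extra shift)
    // and since bit 31 is clear, get_lcheck() compares equal to 'A' during
    // traversal.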
    // this structure stores the best tokenization so far at input_offset
    struct best_tokenization {
        llama_token token_id;
        size_t input_offset;
        float score_sum;
    };

    struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
        if (input_offset == input.size()) {
            return { &input[input_offset], 0, 0 };
        }

        // if input prefix matches some user-defined token return this token as normalization result
        auto user_defined_token_match =
           tokenizer.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
        if (user_defined_token_match.second > 0) {
            return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
        }

        size_t longest_prefix_length = 0;
        size_t longest_prefix_offset = 0;

        if (tokenizer.xcda_array_size > 0) {
            struct xcda_array_view xcda_view(tokenizer.xcda_array, tokenizer.xcda_array_size);

            // Find the longest normalized sequence matching the input prefix by walking
            // the XOR-compressed compact double array (XCDA) starting from the root node
            // We find the index of the next node by calculating BASE[s] ^ c where s is
            // the index of the previous node and c is a numerical character value
            uint32_t node_index = 0;
            // get BASE of the root node
            node_index = xcda_view.get_base(node_index);
            for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) {
                unsigned char c = input[prefix_offset];
                if (c == 0) {
                    break;
                }
                node_index ^= c;
                // if value of LCHECK is not c it means that this is not a child of
                // the previous node, so we stop matching
                if (xcda_view.get_lcheck(node_index) != c) {
                    break;
                }
                bool is_leaf = xcda_view.get_leaf(node_index);
                // get BASE of the current node
                node_index ^= xcda_view.get_base(node_index);
                // if LEAF of the current node is true, it means that its BASE points to the node
                // containing index of replacement sequence for currently matched input prefix
                if (is_leaf) {
                    longest_prefix_length = prefix_offset - input_offset + 1;
                    // get index of replacement sequence for currently matched input prefix
                    longest_prefix_offset = xcda_view.get_value(node_index);
                }
            }
        }

        if (longest_prefix_length > 0) {
            // we have a match, so return the replacement sequence
            if (longest_prefix_offset >= tokenizer.prefix_replacements_size) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
            }
            const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset];
            return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
        }

        // check if the input prefix contains a valid sequence of UTF-8 code units
        try {
            // if yes, return this sequence unmodified
            size_t prefix_offset = input_offset;
            unicode_cpt_from_utf8(input, prefix_offset);
            return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
        } catch (std::invalid_argument & /*ex*/) {
            // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
            return { "\xEF\xBF\xBD", 3, 1 };
        }
    }

    const llama_vocab & vocab;
    const llm_tokenizer_ugm & tokenizer;
};
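
// Usage sketch (assuming a UGM/T5-style llama_vocab named `vocab` and the
// model's precompiled charsmap bytes):
//
//     llm_tokenizer_ugm tokenizer(vocab, precompiled_charsmap);
//     llm_tokenizer_ugm_session session(vocab, tokenizer);
//     std::vector<llama_token> out;
//     session.tokenize("Hello world", out);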
//
// RWKV tokenizer
//

static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escaped) {
    std::vector<uint8_t> output;
    output.reserve(escaped.size());

    // Parser state
    bool escaping = false;
    uint8_t hex_remaining = 0;
    uint8_t hex_acc = 0;

    // Step through characters, performing parsing
    for (const char & c : escaped) {
        // If we're parsing a hex code, interpret the next character
        if (hex_remaining != 0) {
            uint8_t value = (c >= 'a') ? (c - 'a' + 10) : (c - '0');
            hex_acc = (hex_acc << 4) + value;

            hex_remaining -= 1;
            if (hex_remaining == 0) {
                output.push_back(hex_acc);
                hex_acc = 0;
            }

            continue;
        }

        // If we got an escape character, interpret it
        if (escaping) {
            if (c == 't') {
                output.push_back('\t');
            } else if (c == 'n') {
                output.push_back('\n');
            } else if (c == 'r') {
                output.push_back('\r');
            } else if (c == 'x') {
                hex_remaining = 2;
            } else {
                output.push_back(c);
            }

            escaping = false;
            continue;
        }

        if (c == '\\') {
            escaping = true;
            continue;
        }

        output.push_back(c);
    }

    return output;
}
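
// For example, the escaped vocab entry `a\x0a` decodes to the two bytes
// {'a', 0x0a}: '\' arms escape mode, 'x' starts a two-digit hex parse, and
// "0a" accumulates into a single byte. Note the hex parser assumes lowercase
// hex digits.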
struct llm_tokenizer_rwkv : llm_tokenizer {
    llm_tokenizer_rwkv(const llama_vocab & vocab) {
        // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
        // For now, we decode the vocab here into the lookup we'll use for tokenization.

        // build trie
        for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
            const auto & data = vocab.get_token_data(id);
            const auto text = llama_unescape_rwkv_token(data.text);
            token_matcher.insert((const char *) text.data(), text.size(), id);
        }
    }

    struct naive_trie token_matcher;
};

struct llm_tokenizer_rwkv_session {
    llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}

    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        uint32_t position = 0;
        while (position < text.size()) {
            const struct naive_trie * node = tokenizer.token_matcher.traverse(text[position]);
            if (node == NULL) {
                // no matching token found, add unknown token
                output.push_back(vocab.token_unk());
                position += 1;
                continue;
            }

            // traverse the trie to find the longest matching token
            uint32_t token_id = 0;
            uint32_t token_length = 0;
            while (node != NULL) {
                if (node->has_value) {
                    token_id = node->value;
                    token_length = position + 1;
                }
                node = node->traverse(text[++position]);
            }

            // add the longest matching token
            output.push_back(token_id);
            position = token_length;
        }
    }

private:
    const llama_vocab & vocab;
    const llm_tokenizer_rwkv & tokenizer;
};
//
// impl
//

typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
} FRAGMENT_BUFFER_VARIANT_TYPE;

struct fragment_buffer_variant {
    fragment_buffer_variant(llama_token _token)
    :
        type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
        token(_token),
        raw_text(_dummy),
        offset(0),
        length(0) {}

    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
    :
        type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
        token((llama_token) - 1),
        raw_text(_raw_text),
        offset(_offset),
        length(_length) {
            GGML_ASSERT(_offset >= 0);
            GGML_ASSERT(_length >= 1);
            GGML_ASSERT(offset + length <= raw_text.length());
        }

    const FRAGMENT_BUFFER_VARIANT_TYPE type;
    const llama_token token;
    const std::string _dummy;
    const std::string & raw_text;
    const uint64_t offset;
    const uint64_t length;
};
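
// Fragments are consumed by tokenizer_st_partition() (declared below), which
// splits raw input into an interleaved list of special-token fragments and
// raw-text fragments so that special tokens are matched verbatim and never
// re-tokenized by the sub-word tokenizers above.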
struct llama_vocab::impl {
    uint32_t n_token_types = 0; // for BERT-style token types

    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
    enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

    int max_token_len = 0; // used for optimizing longest token search

    // default LLaMA special tokens
    // TODO: should we set all of these to LLAMA_TOKEN_NULL?
    llama_token special_bos_id  = 1;
    llama_token special_eos_id  = 2;
    llama_token special_eot_id  = LLAMA_TOKEN_NULL;
    llama_token special_eom_id  = LLAMA_TOKEN_NULL;
    llama_token special_unk_id  = 0;
    llama_token special_sep_id  = LLAMA_TOKEN_NULL;
    llama_token special_pad_id  = LLAMA_TOKEN_NULL;
    llama_token special_mask_id = LLAMA_TOKEN_NULL;

    llama_token linefeed_id = 13;

    // fim tokens
    llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
    llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
    llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
    llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
    llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
    llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator

    // tokenizer flags
    bool add_space_prefix           = false;
    bool add_bos                    = false;
    bool add_eos                    = false;
    bool ignore_merges              = false;
    bool clean_spaces               = false; // clean_up_tokenization_spaces
    bool remove_extra_whitespaces   = false;
    bool escape_whitespaces         = true;
    bool treat_whitespace_as_suffix = false;

    std::unordered_map<std::string, llama_token> token_to_id;
    std::vector<token_data>                      id_to_token;

    std::vector<llama_token> cache_special_tokens;
    std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);

    std::map<std::pair<std::string, std::string>, int> bpe_ranks;

    // set of all tokens that cause "end of generation"
    std::set<llama_token> special_eog_ids;

    std::unique_ptr<llm_tokenizer> tokenizer;

    std::vector<char> precompiled_charsmap;

    impl(const llama_vocab & vocab) : vocab(vocab) {
    }

    ~impl() = default;

    void load(llama_model_loader & ml, const LLM_KV & kv);

    enum llama_vocab_type get_type() const;

    std::string type_name() const;

    bool is_normal      (llama_token id) const;
    bool is_unknown     (llama_token id) const;
    bool is_control     (llama_token id) const;
    bool is_byte        (llama_token id) const;
    bool is_user_defined(llama_token id) const;
    bool is_unused      (llama_token id) const;
    bool is_eog         (llama_token id) const;

    uint8_t token_to_byte(llama_token id) const;

    llama_token_attr token_get_attr(llama_token id) const;

    void init_tokenizer(enum llama_vocab_type type);

    void tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const;

    std::string token_to_piece_for_cache(
            llama_token token,
            bool special) const;

    std::vector<llama_token> tokenize(
            const std::string & raw_text,
            bool add_special,
            bool parse_special = false) const;

    int32_t tokenize(
            const char * text,
            int32_t text_len,
            llama_token * tokens,
            int32_t n_tokens_max,
            bool add_special,
            bool parse_special) const;

    // does not write null-terminator to buf
    int32_t token_to_piece(
            llama_token token,
            char * buf,
            int32_t length,
            int32_t lstrip,
            bool special) const;

    // use cached data
    const std::string & token_to_piece(llama_token token) const;

    int32_t detokenize(
            const llama_token * tokens,
            int32_t n_tokens,
            char * text,
            int32_t text_len_max,
            bool remove_special,
            bool unparse_special) const;

    std::string detokenize(
            const std::vector<llama_token> & tokens,
            bool special) const;

    void print_info() const;

private:
    const llama_vocab & vocab;
};
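
// llama_vocab::impl follows the pimpl idiom: the public llama_vocab class
// (declared in llama-vocab.h) holds this struct behind a pointer, keeping the
// tokenizer internals out of the public header.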
void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
    struct gguf_context * ctx = ml.meta.get();

    // determine vocab type
    {
        std::string tokenizer_model;
        std::string tokenizer_pre;

        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
        ml.get_key(LLM_KV_TOKENIZER_PRE,   tokenizer_pre, false);

        ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false);

        if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
            type = LLAMA_VOCAB_TYPE_NONE;

            // default special tokens
            special_bos_id  = LLAMA_TOKEN_NULL;
            special_eos_id  = LLAMA_TOKEN_NULL;
            special_unk_id  = LLAMA_TOKEN_NULL;
            special_sep_id  = LLAMA_TOKEN_NULL;
            special_pad_id  = LLAMA_TOKEN_NULL;
            special_mask_id = LLAMA_TOKEN_NULL;
            linefeed_id     = LLAMA_TOKEN_NULL;

            // read vocab size from metadata
            uint32_t n_tokens = 0;
            if (!ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) {
                LLAMA_LOG_WARN("%s: there is no vocab_size in metadata\n", __func__);
            }

            return;
        }

        if (tokenizer_model == "llama") {
            type = LLAMA_VOCAB_TYPE_SPM;

            // default special tokens
            special_bos_id  = 1;
            special_eos_id  = 2;
            special_unk_id  = 0;
            special_sep_id  = LLAMA_TOKEN_NULL;
            special_pad_id  = LLAMA_TOKEN_NULL;
            special_mask_id = LLAMA_TOKEN_NULL;
        } else if (tokenizer_model == "bert") {
            type = LLAMA_VOCAB_TYPE_WPM;

            // default special tokens
            special_bos_id  = 101;
            special_eos_id  = LLAMA_TOKEN_NULL;
            special_unk_id  = 100;
            special_sep_id  = 102;
            special_pad_id  = 0;
            special_mask_id = 103;
        } else if (tokenizer_model == "gpt2") {
            type = LLAMA_VOCAB_TYPE_BPE;

            // read bpe merges and populate bpe ranks
            const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
            if (merges_keyidx == -1) {
                throw std::runtime_error("cannot find tokenizer merges in model file\n");
            }

            const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
            for (int i = 0; i < n_merges; i++) {
                const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
                //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);

                std::string first;
                std::string second;

                const size_t pos = word.find(' ', 1);

                if (pos != std::string::npos) {
                    first  = word.substr(0, pos);
                    second = word.substr(pos + 1);
                }

                bpe_ranks.emplace(std::make_pair(first, second), i);
            }

            // default special tokens
            special_bos_id  = 11;
            special_eos_id  = 11;
            special_unk_id  = LLAMA_TOKEN_NULL;
            special_sep_id  = LLAMA_TOKEN_NULL;
            special_pad_id  = LLAMA_TOKEN_NULL;
            special_mask_id = LLAMA_TOKEN_NULL;
        } else if (tokenizer_model == "t5") {
            type = LLAMA_VOCAB_TYPE_UGM;

            // default special tokens
            special_bos_id  = LLAMA_TOKEN_NULL;
            special_eos_id  = 1;
            special_unk_id  = 2;
            special_sep_id  = LLAMA_TOKEN_NULL;
            special_pad_id  = 0;
            special_mask_id = LLAMA_TOKEN_NULL;

            const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
            if (precompiled_charsmap_keyidx != -1) {
                size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
                const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
                precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN
                // correct endianness of data in precompiled_charsmap binary blob
                uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
                *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
                assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
                size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
                uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
                for (size_t i = 0; i < xcda_array_size; ++i) {
                    xcda_array[i] = __builtin_bswap32(xcda_array[i]);
                }
#endif
            }
        } else if (tokenizer_model == "rwkv") {
            type = LLAMA_VOCAB_TYPE_RWKV;

            // default special tokens
            special_bos_id = LLAMA_TOKEN_NULL;
            special_eos_id = LLAMA_TOKEN_NULL;
            special_unk_id = LLAMA_TOKEN_NULL;
            special_sep_id = LLAMA_TOKEN_NULL;
            special_pad_id = LLAMA_TOKEN_NULL;
        } else {
            throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
        }

        // for now, only BPE models have pre-tokenizers
        if (type == LLAMA_VOCAB_TYPE_BPE) {
            add_space_prefix = false;
            clean_spaces = true;
            if (tokenizer_pre.empty()) {
                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
                LLAMA_LOG_WARN("%s:                                             \n", __func__);
                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED!        \n", __func__);
                LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL             \n", __func__);
                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
                LLAMA_LOG_WARN("%s:                                             \n", __func__);
                pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            } else if (tokenizer_pre == "default") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            } else if (
                    tokenizer_pre == "llama3"    ||
                    tokenizer_pre == "llama-v3"  ||
                    tokenizer_pre == "llama-bpe" ||
                    tokenizer_pre == "falcon3") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
                ignore_merges = true;
                add_bos = true;
            } else if (
                    tokenizer_pre == "deepseek-llm") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
                clean_spaces = false;
            } else if (
                    tokenizer_pre == "deepseek-coder") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
                clean_spaces = false;
            } else if (
                    tokenizer_pre == "deepseek-v3") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
                clean_spaces = false;
            } else if (
                    tokenizer_pre == "falcon") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
            } else if (
                    tokenizer_pre == "mpt") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_MPT;
            } else if (
                    tokenizer_pre == "starcoder") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_STARCODER;
            } else if (
                    tokenizer_pre == "gpt-2"        ||
                    tokenizer_pre == "phi-2"        ||
                    tokenizer_pre == "jina-es"      ||
                    tokenizer_pre == "jina-de"      ||
                    tokenizer_pre == "gigachat"     ||
                    tokenizer_pre == "jina-v1-en"   ||
                    tokenizer_pre == "jina-v2-es"   ||
                    tokenizer_pre == "jina-v2-de"   ||
                    tokenizer_pre == "jina-v2-code" ||
                    tokenizer_pre == "roberta-bpe") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
            } else if (
                    tokenizer_pre == "refact") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
            } else if (
                    tokenizer_pre == "command-r") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
                clean_spaces = false;
            } else if (
                    tokenizer_pre == "qwen2") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
                clean_spaces = false;
            } else if (
                    tokenizer_pre == "stablelm2") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
            } else if (
                    tokenizer_pre == "olmo") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO;
            } else if (
                    tokenizer_pre == "dbrx") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX;
            } else if (
                    tokenizer_pre == "smaug-bpe") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_SMAUG;
            } else if (
                    tokenizer_pre == "poro-chat") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
                clean_spaces = false;
            } else if (
                    tokenizer_pre == "chatglm-bpe") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
                special_bos_id = LLAMA_TOKEN_NULL;
            } else if (
                    tokenizer_pre == "viking") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_VIKING;
                clean_spaces = false;
            } else if (
                    tokenizer_pre == "jais") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS;
            } else if (
                    tokenizer_pre == "tekken") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
                clean_spaces = false;
                ignore_merges = true;
                add_bos = true;
            } else if (
                    tokenizer_pre == "smollm") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
                clean_spaces = false;
            } else if (
                    tokenizer_pre == "codeshell") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
            } else if (
                    tokenizer_pre == "bloom") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_BLOOM;
            } else if (
                    tokenizer_pre == "gpt3-finnish") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
            } else if (
                    tokenizer_pre == "exaone") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
            } else if (
                    tokenizer_pre == "chameleon") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
                add_bos = true;
                clean_spaces = false;
            } else if (
                    tokenizer_pre == "minerva-7b") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_MINERVA;
            } else if (
                    tokenizer_pre == "megrez") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }
        } else if (type == LLAMA_VOCAB_TYPE_SPM) {
            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            add_space_prefix = true;
            clean_spaces = false;
            add_bos = true;
            add_eos = false;
        } else if (type == LLAMA_VOCAB_TYPE_WPM) {
            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            add_space_prefix = false;
            clean_spaces = true;
            add_bos = true;
            add_eos = false;
        } else if (type == LLAMA_VOCAB_TYPE_UGM) {
            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            add_bos = false;
            add_eos = true;
        } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            add_space_prefix = false;
            clean_spaces = false;
            add_bos = false;
            add_eos = false;
        } else {
            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        }

        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX,      add_space_prefix,         false);
        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
    }

    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
    if (token_idx == -1) {
        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
    }

    const float * scores = nullptr;
    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
    if (score_idx != -1) {
        scores = (const float *) gguf_get_arr_data(ctx, score_idx);
    }

    const int * toktypes = nullptr;
    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
    if (toktype_idx != -1) {
        toktypes = (const int *) gguf_get_arr_data(ctx, toktype_idx);
    }

    uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
    id_to_token.resize(n_tokens);

    for (uint32_t i = 0; i < n_tokens; i++) {
std : : string word = gguf_get_arr_str ( ctx , token_idx , i ) ;
if ( word . empty ( ) ) {
LLAMA_LOG_WARN ( " %s: empty token at index %u \n " , __func__ , i ) ;
word = " [EMPTY_ " + std : : to_string ( i ) + " ] " ;
}
token_to_id [ word ] = i ;
max_token_len = std : : max ( max_token_len , ( int ) word . size ( ) ) ;
auto & token_data = id_to_token [ i ] ;
token_data . text = std : : move ( word ) ;
token_data . score = scores ? scores [ i ] : 0.0f ;
token_data . attr = LLAMA_TOKEN_ATTR_NORMAL ;
if ( toktypes ) { //TODO: remove, required until per token attributes are available from GGUF file
switch ( toktypes [ i ] ) {
case LLAMA_TOKEN_TYPE_UNKNOWN : token_data . attr = LLAMA_TOKEN_ATTR_UNKNOWN ; break ;
case LLAMA_TOKEN_TYPE_UNUSED : token_data . attr = LLAMA_TOKEN_ATTR_UNUSED ; break ;
case LLAMA_TOKEN_TYPE_NORMAL : token_data . attr = LLAMA_TOKEN_ATTR_NORMAL ; break ;
case LLAMA_TOKEN_TYPE_CONTROL : token_data . attr = LLAMA_TOKEN_ATTR_CONTROL ; break ;
case LLAMA_TOKEN_TYPE_USER_DEFINED : token_data . attr = LLAMA_TOKEN_ATTR_USER_DEFINED ; break ;
case LLAMA_TOKEN_TYPE_BYTE : token_data . attr = LLAMA_TOKEN_ATTR_BYTE ; break ;
case LLAMA_TOKEN_TYPE_UNDEFINED : token_data . attr = LLAMA_TOKEN_ATTR_UNDEFINED ; break ;
default : token_data . attr = LLAMA_TOKEN_ATTR_UNDEFINED ; break ;
}
}
}
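// The vocabulary arrives as parallel GGUF arrays, one entry per token id; a
// hypothetical SPM example (values illustrative only):
//
//   tokenizer.ggml.tokens     = [ "<unk>", "<s>", "</s>", "<0x00>", ..., "▁the", ... ]
//   tokenizer.ggml.scores     = [ 0.0, 0.0, 0.0, 0.0, ..., -3.25, ... ]
//   tokenizer.ggml.token_type = [ 2, 3, 3, 6, ..., 1, ... ] // LLAMA_TOKEN_TYPE_* values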
GGML_ASSERT ( id_to_token . size ( ) = = token_to_id . size ( ) ) ;
init_tokenizer ( type ) ;
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
if ( type = = LLAMA_VOCAB_TYPE_SPM ) {
try {
linefeed_id = vocab . byte_to_token ( ' \n ' ) ;
} catch ( const std : : exception & e ) {
LLAMA_LOG_WARN ( " %s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead. " , __func__ , e . what ( ) ) ;
linefeed_id = special_pad_id ;
}
} else if ( type = = LLAMA_VOCAB_TYPE_WPM ) {
linefeed_id = special_pad_id ;
} else if ( type = = LLAMA_VOCAB_TYPE_RWKV ) {
const std : : vector < int > ids = tokenize ( " \n " , false ) ;
GGML_ASSERT ( ! ids . empty ( ) & & " model vocab missing newline token " ) ;
linefeed_id = ids [ 0 ] ;
} else {
const std : : vector < int > ids = tokenize ( " \xC4 \x8A " , false ) ; // U+010A 'Ċ' - the byte-level BPE encoding of '\n' (0x0A)
//GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
if ( ids . empty ( ) ) {
LLAMA_LOG_WARN ( " %s: model vocab missing newline token, using special_pad_id instead \n " , __func__ ) ;
linefeed_id = special_pad_id ;
} else {
linefeed_id = ids [ 0 ] ;
}
}
// special tokens
{
const std : : vector < std : : pair < enum llm_kv , int32_t & > > special_token_types = {
{ LLM_KV_TOKENIZER_BOS_ID , special_bos_id } ,
{ LLM_KV_TOKENIZER_EOS_ID , special_eos_id } ,
{ LLM_KV_TOKENIZER_EOT_ID , special_eot_id } ,
{ LLM_KV_TOKENIZER_EOM_ID , special_eom_id } ,
{ LLM_KV_TOKENIZER_UNK_ID , special_unk_id } ,
{ LLM_KV_TOKENIZER_SEP_ID , special_sep_id } ,
{ LLM_KV_TOKENIZER_PAD_ID , special_pad_id } ,
{ LLM_KV_TOKENIZER_MASK_ID , special_mask_id } ,
{ LLM_KV_TOKENIZER_FIM_PRE_ID , special_fim_pre_id } ,
{ LLM_KV_TOKENIZER_FIM_SUF_ID , special_fim_suf_id } ,
{ LLM_KV_TOKENIZER_FIM_MID_ID , special_fim_mid_id } ,
{ LLM_KV_TOKENIZER_FIM_PAD_ID , special_fim_pad_id } ,
{ LLM_KV_TOKENIZER_FIM_REP_ID , special_fim_rep_id } ,
{ LLM_KV_TOKENIZER_FIM_SEP_ID , special_fim_sep_id } ,
// deprecated
{ LLM_KV_TOKENIZER_PREFIX_ID , special_fim_pre_id } ,
{ LLM_KV_TOKENIZER_SUFFIX_ID , special_fim_suf_id } ,
{ LLM_KV_TOKENIZER_MIDDLE_ID , special_fim_mid_id } ,
} ;
for ( const auto & it : special_token_types ) {
const std : : string & key = kv ( std : : get < 0 > ( it ) ) ;
int32_t & id = std : : get < 1 > ( it ) ;
uint32_t new_id ;
if ( ! ml . get_key ( std : : get < 0 > ( it ) , new_id , false ) ) {
continue ;
}
if ( new_id > = id_to_token . size ( ) ) {
LLAMA_LOG_WARN ( " %s: bad special token: '%s' = %u, using default id %d \n " ,
__func__ , key . c_str ( ) , new_id , id ) ;
} else {
id = new_id ;
}
}
// Handle add_bos and add_eos
{
bool temp = true ;
if ( ml . get_key ( LLM_KV_TOKENIZER_ADD_BOS , temp , false ) ) {
add_bos = temp ;
}
if ( ml . get_key ( LLM_KV_TOKENIZER_ADD_EOS , temp , false ) ) {
add_eos = temp ;
}
}
// auto-detect special tokens by text
// TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
// for now, we apply this workaround to find the tokens based on their text
for ( const auto & t : token_to_id ) {
// find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
if ( special_eot_id = = LLAMA_TOKEN_NULL ) {
if ( false
| | t . first = = " <|eot_id|> "
| | t . first = = " <|im_end|> "
| | t . first = = " <|end|> "
| | t . first = = " <end_of_turn> "
| | t . first = = " <|endoftext|> "
| | t . first = = " <EOT> "
| | t . first = = " <| end▁of▁sentence| > " // DeepSeek
) {
special_eot_id = t . second ;
if ( ( id_to_token [ t . second ] . attr & LLAMA_TOKEN_ATTR_CONTROL ) = = 0 ) {
LLAMA_LOG_WARN ( " %s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden \n " ,
__func__ , t . second , t . first . c_str ( ) ) ;
id_to_token [ t . second ] . attr = LLAMA_TOKEN_ATTR_CONTROL ;
}
}
}
// find EOM token: "<|eom_id|>"
if ( special_eom_id = = LLAMA_TOKEN_NULL ) {
if ( false
| | t . first = = " <|eom_id|> "
) {
special_eom_id = t . second ;
if ( ( id_to_token [ t . second ] . attr & LLAMA_TOKEN_ATTR_CONTROL ) = = 0 ) {
LLAMA_LOG_WARN ( " %s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden \n " ,
__func__ , t . second , t . first . c_str ( ) ) ;
id_to_token [ t . second ] . attr = LLAMA_TOKEN_ATTR_CONTROL ;
}
}
}
// find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
if ( special_fim_pre_id = = LLAMA_TOKEN_NULL ) {
if ( false
| | t . first = = " <|fim_prefix|> " // Qwen
| | t . first = = " <fim-prefix> "
| | t . first = = " <| fim▁begin| > " // DeepSeek
| | t . first = = " <PRE> "
) {
special_fim_pre_id = t . second ;
if ( ( id_to_token [ t . second ] . attr & LLAMA_TOKEN_ATTR_CONTROL ) = = 0 ) {
LLAMA_LOG_WARN ( " %s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden \n " ,
__func__ , t . second , t . first . c_str ( ) ) ;
id_to_token [ t . second ] . attr = LLAMA_TOKEN_ATTR_CONTROL ;
}
}
}
// find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
if ( special_fim_suf_id = = LLAMA_TOKEN_NULL ) {
if ( false
| | t . first = = " <|fim_suffix|> " // Qwen
| | t . first = = " <fim-suffix> "
| | t . first = = " <| fim▁hole| > " // DeepSeek
| | t . first = = " <SUF> "
) {
special_fim_suf_id = t . second ;
if ( ( id_to_token [ t . second ] . attr & LLAMA_TOKEN_ATTR_CONTROL ) = = 0 ) {
LLAMA_LOG_WARN ( " %s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden \n " ,
__func__ , t . second , t . first . c_str ( ) ) ;
id_to_token [ t . second ] . attr = LLAMA_TOKEN_ATTR_CONTROL ;
}
}
}
// find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
if ( special_fim_mid_id = = LLAMA_TOKEN_NULL ) {
if ( false
| | t . first = = " <|fim_middle|> " // Qwen
| | t . first = = " <fim-middle> "
| | t . first = = " <| fim▁end| > " // DeepSeek
| | t . first = = " <MID> "
) {
special_fim_mid_id = t . second ;
if ( ( id_to_token [ t . second ] . attr & LLAMA_TOKEN_ATTR_CONTROL ) = = 0 ) {
LLAMA_LOG_WARN ( " %s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden \n " ,
__func__ , t . second , t . first . c_str ( ) ) ;
id_to_token [ t . second ] . attr = LLAMA_TOKEN_ATTR_CONTROL ;
}
}
}
// find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
if ( special_fim_pad_id = = LLAMA_TOKEN_NULL ) {
if ( false
| | t . first = = " <|fim_pad|> " // Qwen
| | t . first = = " <fim-pad> "
| | t . first = = " <PAD> "
) {
special_fim_pad_id = t . second ;
if ( ( id_to_token [ t . second ] . attr & LLAMA_TOKEN_ATTR_CONTROL ) = = 0 ) {
LLAMA_LOG_WARN ( " %s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden \n " ,
__func__ , t . second , t . first . c_str ( ) ) ;
id_to_token [ t . second ] . attr = LLAMA_TOKEN_ATTR_CONTROL ;
}
}
}
// find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
if ( special_fim_rep_id = = LLAMA_TOKEN_NULL ) {
if ( false
| | t . first = = " <|fim_repo|> " // Qwen
| | t . first = = " <|repo_name|> "
| | t . first = = " <fim-repo> "
| | t . first = = " <REPO> "
) {
special_fim_rep_id = t . second ;
if ( ( id_to_token [ t . second ] . attr & LLAMA_TOKEN_ATTR_CONTROL ) = = 0 ) {
LLAMA_LOG_WARN ( " %s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden \n " ,
__func__ , t . second , t . first . c_str ( ) ) ;
id_to_token [ t . second ] . attr = LLAMA_TOKEN_ATTR_CONTROL ;
}
}
}
// find FIM_SEP token: "<|file_sep|>"
if ( special_fim_sep_id = = LLAMA_TOKEN_NULL ) {
if ( false
| | t . first = = " <|file_sep|> " // Qwen
) {
special_fim_sep_id = t . second ;
if ( ( id_to_token [ t . second ] . attr & LLAMA_TOKEN_ATTR_CONTROL ) = = 0 ) {
LLAMA_LOG_WARN ( " %s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden \n " ,
__func__ , t . second , t . first . c_str ( ) ) ;
id_to_token [ t . second ] . attr = LLAMA_TOKEN_ATTR_CONTROL ;
}
}
}
}
// maintain a list of tokens that cause end-of-generation
// this is currently determined based on the token text, which is obviously not ideal
// ref: https://github.com/ggerganov/llama.cpp/issues/9606
special_eog_ids . clear ( ) ;
if ( special_fim_pad_id ! = LLAMA_TOKEN_NULL & & special_eog_ids . count ( special_fim_pad_id ) = = 0 ) {
special_eog_ids . insert ( special_fim_pad_id ) ;
}
if ( special_fim_rep_id ! = LLAMA_TOKEN_NULL & & special_eog_ids . count ( special_fim_rep_id ) = = 0 ) {
special_eog_ids . insert ( special_fim_rep_id ) ;
}
if ( special_fim_sep_id ! = LLAMA_TOKEN_NULL & & special_eog_ids . count ( special_fim_sep_id ) = = 0 ) {
special_eog_ids . insert ( special_fim_sep_id ) ;
}
for ( const auto & t : token_to_id ) {
if ( false
| | t . first = = " <|eot_id|> "
| | t . first = = " <|im_end|> "
| | t . first = = " <|end|> "
| | t . first = = " <end_of_turn> "
| | t . first = = " <|endoftext|> "
| | t . first = = " <|eom_id|> "
| | t . first = = " <EOT> "
) {
special_eog_ids . insert ( t . second ) ;
if ( ( id_to_token [ t . second ] . attr & LLAMA_TOKEN_ATTR_CONTROL ) = = 0 ) {
LLAMA_LOG_WARN ( " %s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden \n " ,
__func__ , t . second , t . first . c_str ( ) ) ;
id_to_token [ t . second ] . attr = LLAMA_TOKEN_ATTR_CONTROL ;
}
} else {
// token is control, but not marked as EOG -> print a debug log
if ( id_to_token [ t . second ] . attr & LLAMA_TOKEN_ATTR_CONTROL & & special_eog_ids . count ( t . second ) = = 0 ) {
LLAMA_LOG_DEBUG ( " %s: control token: %6d '%s' is not marked as EOG \n " ,
__func__ , t . second , t . first . c_str ( ) ) ;
}
}
}
// sanity checks
if ( special_eos_id ! = LLAMA_TOKEN_NULL & & special_eog_ids . count ( special_eos_id ) = = 0 ) {
special_eog_ids . insert ( special_eos_id ) ;
LLAMA_LOG_WARN ( " %s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect \n " , __func__ ) ;
}
if ( special_eot_id ! = LLAMA_TOKEN_NULL & & special_eog_ids . count ( special_eot_id ) = = 0 ) {
special_eog_ids . insert ( special_eot_id ) ;
LLAMA_LOG_WARN ( " %s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect \n " , __func__ ) ;
}
if ( special_eom_id ! = LLAMA_TOKEN_NULL & & special_eog_ids . count ( special_eom_id ) = = 0 ) {
special_eog_ids . insert ( special_eom_id ) ;
LLAMA_LOG_WARN ( " %s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect \n " , __func__ ) ;
}
}
// build special tokens cache
{
for ( llama_token id = 0 ; id < ( llama_token ) n_tokens ; + + id ) {
if ( id_to_token [ id ] . attr & ( LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN ) ) {
cache_special_tokens . push_back ( id ) ;
}
}
std : : sort ( cache_special_tokens . begin ( ) , cache_special_tokens . end ( ) ,
[ & ] ( const llama_token a , const llama_token b ) {
return id_to_token [ a ] . text . size ( ) > id_to_token [ b ] . text . size ( ) ;
}
) ;
LLAMA_LOG_INFO ( " %s: special tokens cache size = %u \n " , __func__ , ( uint32_t ) cache_special_tokens . size ( ) ) ;
}
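// the cache is sorted longest-first so that tokenizer_st_partition() below
// matches longer special tokens before shorter tokens that are substrings of
// them (e.g. "<|endoftext|>" must be carved out before "<|end|>" can match)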
// build token to piece cache
{
size_t size_cache = 0 ;
std : : vector < std : : string > cache ( n_tokens ) ;
for ( uint32_t id = 0 ; id < n_tokens ; + + id ) {
cache [ id ] = token_to_piece_for_cache ( id , true ) ;
size_cache + = cache [ id ] . size ( ) ;
}
std : : swap ( cache_token_to_piece , cache ) ;
LLAMA_LOG_INFO ( " %s: token to piece cache size = %.4f MB \n " , __func__ , size_cache / 1024.0 / 1024.0 ) ;
}
// Handle per token attributes
//NOTE: Each model customizes per token attributes.
//NOTE: Per token attributes are missing from the GGUF file.
//TODO: Extract attributes from GGUF file.
{
auto _contains_any = [ ] ( const std : : string & str , const std : : vector < std : : string > & substrs ) - > bool {
for ( const auto & substr : substrs ) {
if ( str . find ( substr ) < std : : string : : npos ) {
return true ;
}
}
return false ;
} ;
auto _set_tokenid_attr = [ & ] ( const llama_token id , llama_token_attr attr , bool value ) {
uint32_t current = id_to_token . at ( id ) . attr ;
current = value ? ( current | attr ) : ( current & ~ attr ) ;
id_to_token [ id ] . attr = ( llama_token_attr ) current ;
} ;
auto _set_token_attr = [ & ] ( const std : : string & token , llama_token_attr attr , bool value ) {
_set_tokenid_attr ( token_to_id . at ( token ) , attr , value ) ;
} ;
std : : string model_name ;
std : : string tokenizer_pre ;
ml . get_key ( LLM_KV_GENERAL_NAME , model_name , false ) ;
ml . get_key ( LLM_KV_TOKENIZER_PRE , tokenizer_pre , false ) ;
// model name to lowercase
std : : transform ( model_name . begin ( ) , model_name . end ( ) , model_name . begin ( ) ,
[ ] ( const std : : string : : value_type x ) {
return std : : tolower ( x ) ;
}
) ;
// set attributes by model/tokenizer name
if ( _contains_any ( tokenizer_pre , { " jina-v2-de " , " jina-v2-es " , " jina-v2-code " } ) ) {
_set_token_attr ( " <mask> " , LLAMA_TOKEN_ATTR_LSTRIP , true ) ;
} else if ( _contains_any ( model_name , { " phi-3 " , " phi3 " } ) ) {
for ( auto id : cache_special_tokens ) {
_set_tokenid_attr ( id , LLAMA_TOKEN_ATTR_RSTRIP , true ) ;
}
for ( const auto * token : { " </s> " } ) {
_set_token_attr ( token , LLAMA_TOKEN_ATTR_RSTRIP , true ) ;
}
for ( const auto * token : { " <unk> " , " <s> " , " <|endoftext|> " } ) {
_set_token_attr ( token , LLAMA_TOKEN_ATTR_RSTRIP , false ) ;
}
}
}
}
enum llama_vocab_type llama_vocab : : impl : : get_type ( ) const {
return type ;
}
std : : string llama_vocab : : impl : : type_name ( ) const {
switch ( type ) {
case LLAMA_VOCAB_TYPE_NONE : return " no vocab " ;
case LLAMA_VOCAB_TYPE_SPM : return " SPM " ;
case LLAMA_VOCAB_TYPE_BPE : return " BPE " ;
case LLAMA_VOCAB_TYPE_WPM : return " WPM " ;
case LLAMA_VOCAB_TYPE_UGM : return " UGM " ;
case LLAMA_VOCAB_TYPE_RWKV : return " RWKV " ;
default : return " unknown " ;
}
}
bool llama_vocab : : impl : : is_normal ( llama_token id ) const {
GGML_ASSERT ( type ! = LLAMA_VOCAB_TYPE_NONE ) ;
return id_to_token [ id ] . attr & LLAMA_TOKEN_ATTR_NORMAL ;
}
bool llama_vocab : : impl : : is_unknown ( llama_token id ) const {
GGML_ASSERT ( type ! = LLAMA_VOCAB_TYPE_NONE ) ;
return id_to_token [ id ] . attr & LLAMA_TOKEN_ATTR_UNKNOWN ;
}
bool llama_vocab : : impl : : is_control ( llama_token id ) const {
GGML_ASSERT ( type ! = LLAMA_VOCAB_TYPE_NONE ) ;
return id_to_token [ id ] . attr & LLAMA_TOKEN_ATTR_CONTROL ;
}
bool llama_vocab : : impl : : is_byte ( llama_token id ) const {
GGML_ASSERT ( type ! = LLAMA_VOCAB_TYPE_NONE ) ;
return id_to_token [ id ] . attr & LLAMA_TOKEN_ATTR_BYTE ;
}
bool llama_vocab : : impl : : is_user_defined ( llama_token id ) const {
GGML_ASSERT ( type ! = LLAMA_VOCAB_TYPE_NONE ) ;
return id_to_token [ id ] . attr & LLAMA_TOKEN_ATTR_USER_DEFINED ;
}
bool llama_vocab : : impl : : is_unused ( llama_token id ) const {
GGML_ASSERT ( type ! = LLAMA_VOCAB_TYPE_NONE ) ;
return id_to_token [ id ] . attr & LLAMA_TOKEN_ATTR_UNUSED ;
}
bool llama_vocab : : impl : : is_eog ( llama_token id ) const {
return id ! = LLAMA_TOKEN_NULL & & special_eog_ids . count ( id ) > 0 ;
}
uint8_t llama_vocab : : impl : : token_to_byte ( llama_token id ) const {
GGML_ASSERT ( get_type ( ) ! = LLAMA_VOCAB_TYPE_NONE ) ;
GGML_ASSERT ( is_byte ( id ) ) ;
const auto & token_data = id_to_token . at ( id ) ;
switch ( get_type ( ) ) {
case LLAMA_VOCAB_TYPE_SPM :
case LLAMA_VOCAB_TYPE_UGM : {
auto buf = token_data . text . substr ( 3 , 2 ) ;
return strtol ( buf . c_str ( ) , NULL , 16 ) ;
}
case LLAMA_VOCAB_TYPE_BPE : {
GGML_ABORT ( " fatal error " ) ;
}
case LLAMA_VOCAB_TYPE_WPM : {
GGML_ABORT ( " fatal error " ) ;
}
default :
GGML_ABORT ( " fatal error " ) ;
}
}
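// Example: an SPM byte token with text "<0x0A>" yields substr(3, 2) == "0A",
// which strtol() parses with base 16 into the byte value 10, i.e. '\n'.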
llama_token_attr llama_vocab : : impl : : token_get_attr ( llama_token id ) const {
GGML_ASSERT ( type ! = LLAMA_VOCAB_TYPE_NONE ) ;
return id_to_token . at ( id ) . attr ;
}
void llama_vocab : : impl : : init_tokenizer ( enum llama_vocab_type type ) {
LLAMA_LOG_DEBUG ( " %s: initializing tokenizer for type %d \n " , __func__ , type ) ;
switch ( type ) {
case LLAMA_VOCAB_TYPE_SPM :
tokenizer = std : : make_unique < llm_tokenizer_spm > ( vocab ) ;
break ;
case LLAMA_VOCAB_TYPE_BPE :
tokenizer = std : : make_unique < llm_tokenizer_bpe > ( vocab ) ;
break ;
case LLAMA_VOCAB_TYPE_WPM :
tokenizer = std : : make_unique < llm_tokenizer_wpm > ( vocab ) ;
break ;
case LLAMA_VOCAB_TYPE_UGM :
tokenizer = std : : make_unique < llm_tokenizer_ugm > ( vocab , precompiled_charsmap ) ;
break ;
case LLAMA_VOCAB_TYPE_RWKV :
tokenizer = std : : make_unique < llm_tokenizer_rwkv > ( vocab ) ;
break ;
default :
GGML_ABORT ( " unsupported vocab type " ) ;
}
}
//
// (de-) tokenize
//
// #define PRETOKENIZERDEBUG
void llama_vocab : : impl : : tokenizer_st_partition ( std : : forward_list < fragment_buffer_variant > & buffer , bool parse_special ) const {
// for each special token
for ( const llama_token special_id : cache_special_tokens ) {
const auto & data = vocab . get_token_data ( special_id ) ;
const auto & text = data . text ;
if ( ! parse_special & & ( data . attr & ( LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN ) ) ) {
// Ignore control and unknown tokens when parse_special == false
continue ;
// User-defined tokens are still pre-tokenized before everything else
// ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
// This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
}
// for each text fragment
std : : forward_list < fragment_buffer_variant > : : iterator it = buffer . begin ( ) ;
while ( it ! = buffer . end ( ) ) {
auto & fragment = ( * it ) ;
// if a fragment is text ( not yet processed )
if ( fragment . type = = FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT ) {
const auto & raw_text = fragment . raw_text ;
auto raw_text_base_offset = fragment . offset ;
auto raw_text_base_length = fragment . length ;
// loop over the text
while ( true ) {
// find the first occurrence of a given special token in this fragment
// passing the offset argument only limits the "search area", but match
// coordinates are still relative to the full source raw_text
auto match = raw_text . find ( text , raw_text_base_offset ) ;
// no occurrences found, stop processing this fragment for a given special token
if ( match = = std : : string : : npos ) break ;
// check if match is within bounds of offset <-> length
if ( match + text . length ( ) > raw_text_base_offset + raw_text_base_length ) break ;
# ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN ( " FF: (%ld %ld %ld) '%s' \n " , raw_text - > length ( ) , raw_text_base_offset , raw_text_base_length , raw_text - > substr ( raw_text_base_offset , raw_text_base_length ) . c_str ( ) ) ;
# endif
auto source = std : : distance ( buffer . begin ( ) , it ) ;
// if match is further than base offset
// then we have some text to the left of it
if ( match > raw_text_base_offset ) {
// left
const int64_t left_reminder_offset = raw_text_base_offset + 0 ;
int64_t left_reminder_length = match - raw_text_base_offset ;
if ( data . attr & LLAMA_TOKEN_ATTR_LSTRIP ) {
while ( left_reminder_length > 0 & & isspace ( raw_text [ left_reminder_offset + left_reminder_length - 1 ] ) ) {
left_reminder_length - - ;
}
}
if ( left_reminder_length > 0 ) {
buffer . emplace_after ( it , raw_text , left_reminder_offset , left_reminder_length ) ;
it + + ;
}
# ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN ( " FL: (%ld %ld) '%s' \n " , left_reminder_offset , left_reminder_length , raw_text - > substr ( left_reminder_offset , left_reminder_length ) . c_str ( ) ) ;
# endif
}
// special token
buffer . emplace_after ( it , special_id ) ;
it + + ;
// right
if ( match + text . length ( ) < raw_text_base_offset + raw_text_base_length ) {
int64_t right_reminder_offset = match + text . length ( ) ;
int64_t right_reminder_length = raw_text_base_length - ( ( match - raw_text_base_offset ) + text . length ( ) ) ;
if ( data . attr & LLAMA_TOKEN_ATTR_RSTRIP ) {
while ( right_reminder_length > 0 & & isspace ( raw_text [ right_reminder_offset ] ) ) {
right_reminder_offset + + ;
right_reminder_length - - ;
}
}
if ( right_reminder_length > 0 ) {
buffer . emplace_after ( it , raw_text , right_reminder_offset , right_reminder_length ) ;
it + + ;
}
# ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN ( " FR: (%ld %ld) '%s' \n " , right_reminder_offset , right_reminder_length , raw_text - > substr ( right_reminder_offset , right_reminder_length ) . c_str ( ) ) ;
# endif
if ( source = = 0 ) {
buffer . erase_after ( buffer . before_begin ( ) ) ;
} else {
buffer . erase_after ( std : : next ( buffer . begin ( ) , ( source - 1 ) ) ) ;
}
// repeat for the right side
raw_text_base_offset = right_reminder_offset ;
raw_text_base_length = right_reminder_length ;
# ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN ( " RR: (%ld %ld) '%s' \n " , raw_text_base_offset , raw_text_base_length , raw_text - > substr ( raw_text_base_offset , raw_text_base_length ) . c_str ( ) ) ;
# endif
} else {
if ( source = = 0 ) {
buffer . erase_after ( buffer . before_begin ( ) ) ;
} else {
buffer . erase_after ( std : : next ( buffer . begin ( ) , ( source - 1 ) ) ) ;
}
break ;
}
}
}
it + + ;
}
}
}
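// An illustrative run (hypothetical input): with parse_special == true and the
// special token "<|eot_id|>" in the cache, the fragment list
//
//   [ RAW_TEXT "Hello<|eot_id|>World" ]
//
// is partitioned into
//
//   [ RAW_TEXT "Hello" ] [ TOKEN <|eot_id|> ] [ RAW_TEXT "World" ]
//
// and only the remaining RAW_TEXT fragments reach the underlying tokenizer.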
// NOTE: avoid ever using this except for building the token_to_piece caches
std : : string llama_vocab : : impl : : token_to_piece_for_cache ( llama_token token , bool special ) const {
std : : string piece ;
piece . resize ( piece . capacity ( ) ) ; // using string internal cache
const int n_chars = vocab . token_to_piece ( token , & piece [ 0 ] , piece . size ( ) , 0 , special ) ;
if ( n_chars < 0 ) {
piece . resize ( - n_chars ) ;
int check = vocab . token_to_piece ( token , & piece [ 0 ] , piece . size ( ) , 0 , special ) ;
GGML_ASSERT ( check = = - n_chars ) ;
}
else {
piece . resize ( n_chars ) ;
}
return piece ;
}
static void llama_escape_whitespace ( std : : string & text ) {
replace_all ( text , " " , " \xe2 \x96 \x81 " ) ;
}
static void llama_unescape_whitespace ( std : : string & word ) {
replace_all ( word , " \xe2 \x96 \x81 " , " " ) ;
2024-08-08 14:16:50 +03:00
}
static std : : string llama_decode_text ( const std : : string & text ) {
std : : string decoded_text ;
const auto cpts = unicode_cpts_from_utf8 ( text ) ;
for ( const auto cpt : cpts ) {
const auto utf8 = unicode_cpt_to_utf8 ( cpt ) ;
try {
decoded_text + = unicode_utf8_to_byte ( utf8 ) ;
} catch ( const std : : out_of_range & /*e*/ ) {
decoded_text + = " [UNK_BYTE_0x " ;
for ( const auto c : utf8 ) {
decoded_text + = format ( " %02x " , ( uint8_t ) c ) ;
}
decoded_text + = text + " ] " ;
}
}
return decoded_text ;
}
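// Example: byte-level BPE stores raw bytes as remapped code points, so a BPE
// token text such as "Ġhello" (U+0120 being the remapping of the space byte
// 0x20) decodes back to " hello"; code points with no byte mapping fall into
// the catch above and are rendered as "[UNK_BYTE_0x...]" markers.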
std : : vector < llama_token > llama_vocab : : impl : : tokenize (
const std : : string & raw_text ,
bool add_special ,
bool parse_special ) const {
GGML_ASSERT ( tokenizer & & " Tokenizer not initialized. Call llama_vocab::init_tokenizer() first. " ) ;
std : : vector < llama_token > output ;
std : : forward_list < fragment_buffer_variant > fragment_buffer ;
if ( ! raw_text . empty ( ) ) {
fragment_buffer . emplace_front ( raw_text , 0 , raw_text . length ( ) ) ;
tokenizer_st_partition ( fragment_buffer , parse_special ) ;
}
switch ( get_type ( ) ) {
case LLAMA_VOCAB_TYPE_SPM :
{
// OG tokenizer behavior:
//
// tokenizer.encode('', add_special_tokens=True) returns [1]
// tokenizer.encode('', add_special_tokens=False) returns []
bool is_prev_special = true ; // prefix with space if first token
if ( add_special & & add_bos ) {
GGML_ASSERT ( special_bos_id ! = LLAMA_TOKEN_NULL ) ;
output . push_back ( special_bos_id ) ;
is_prev_special = true ;
}
for ( const auto & fragment : fragment_buffer ) {
if ( fragment . type = = FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT ) {
std : : string text ;
// prefix with space if previous is special
if ( add_space_prefix & & is_prev_special ) {
text = ' ' ;
}
text + = fragment . raw_text . substr ( fragment . offset , fragment . length ) ;
# ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN ( " TT: (%ld %ld %ld) '%s' \n " , text . length ( ) , fragment . offset , fragment . length , text . c_str ( ) ) ;
# endif
llama_escape_whitespace ( text ) ;
llm_tokenizer_spm_session session ( vocab ) ;
session . tokenize ( text , output ) ;
is_prev_special = false ;
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output . push_back ( fragment . token ) ;
is_prev_special = true ;
}
}
if ( add_special & & add_bos & & output . size ( ) > = 2 & & output [ 1 ] = = special_bos_id ) {
LLAMA_LOG_WARN (
" %s: Added a BOS token to the prompt as specified by the model but the prompt "
" also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
" Are you sure this is what you want? \n " , __FUNCTION__ ) ;
}
if ( add_special & & add_eos ) {
GGML_ASSERT ( special_eos_id ! = LLAMA_TOKEN_NULL ) ;
output . push_back ( special_eos_id ) ;
}
} break ;
case LLAMA_VOCAB_TYPE_BPE :
{
llm_tokenizer_bpe_session session ( vocab , * static_cast < const llm_tokenizer_bpe * > ( tokenizer . get ( ) ) ) ;
// the session calls methods that do not exist in the base llm_tokenizer,
// so the stored tokenizer is cast to the BPE tokenizer object here
if ( add_special ) {
session . append_bos ( output ) ;
}
for ( const auto & fragment : fragment_buffer ) {
if ( fragment . type = = FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT ) {
std : : string text = fragment . raw_text . substr ( fragment . offset , fragment . length ) ;
# ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN ( " TT: (%ld %ld %ld) '%s' \n " , text . length ( ) , fragment . offset , fragment . length , text . c_str ( ) ) ;
# endif
session . tokenize ( text , output ) ;
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
session . append ( fragment . token , output ) ;
}
}
if ( add_special ) {
session . append_eos ( output ) ;
session . check_double_bos_eos ( output ) ;
}
} break ;
case LLAMA_VOCAB_TYPE_WPM :
{
if ( add_special ) {
GGML_ASSERT ( special_bos_id ! = LLAMA_TOKEN_NULL ) ;
output . push_back ( special_bos_id ) ;
}
llm_tokenizer_wpm_session session ( vocab ) ;
for ( const auto & fragment : fragment_buffer ) {
if ( fragment . type = = FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT ) {
std : : string text = fragment . raw_text . substr ( fragment . offset , fragment . length ) ;
# ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN ( " TT: (%ld %ld %ld) '%s' \n " , text . length ( ) , fragment . offset , fragment . length , text . c_str ( ) ) ;
# endif
session . tokenize ( text , output ) ;
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output . push_back ( fragment . token ) ;
}
}
if ( add_special ) {
GGML_ASSERT ( special_sep_id ! = LLAMA_TOKEN_NULL ) ;
output . push_back ( special_sep_id ) ;
}
} break ;
case LLAMA_VOCAB_TYPE_UGM :
{
if ( add_special & & add_bos ) {
GGML_ASSERT ( special_bos_id ! = LLAMA_TOKEN_NULL ) ;
output . push_back ( special_bos_id ) ;
}
llm_tokenizer_ugm_session session ( vocab , * static_cast < const llm_tokenizer_ugm * > ( tokenizer . get ( ) ) ) ;
for ( const auto & fragment : fragment_buffer ) {
if ( fragment . type = = FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT ) {
std : : string text = fragment . raw_text . substr ( fragment . offset , fragment . length ) ;
# ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN ( " TT: (%ld %ld %ld) '%s' \n " , text . length ( ) , fragment . offset , fragment . length , text . c_str ( ) ) ;
# endif
session . tokenize ( text , output ) ;
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output . push_back ( fragment . token ) ;
}
}
if ( add_special & & add_bos & & output . size ( ) > = 2 & & output [ 1 ] = = special_bos_id ) {
LLAMA_LOG_WARN (
" %s: Added a BOS token to the prompt as specified by the model but the prompt "
" also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
" Are you sure this is what you want? \n " , __FUNCTION__ ) ;
}
if ( add_special & & add_eos ) {
GGML_ASSERT ( special_eos_id ! = LLAMA_TOKEN_NULL ) ;
output . push_back ( special_eos_id ) ;
}
} break ;
case LLAMA_VOCAB_TYPE_RWKV :
{
llm_tokenizer_rwkv_session session ( vocab , * static_cast < const llm_tokenizer_rwkv * > ( tokenizer . get ( ) ) ) ;
for ( const auto & fragment : fragment_buffer ) {
if ( fragment . type = = FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT ) {
std : : string text = fragment . raw_text . substr ( fragment . offset , fragment . length ) ;
# ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN ( " TT: (%ld %ld %ld) '%s' \n " , text . length ( ) , fragment . offset , fragment . length , text . c_str ( ) ) ;
# endif
session . tokenize ( text , output ) ;
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output . push_back ( fragment . token ) ;
}
}
} break ;
case LLAMA_VOCAB_TYPE_NONE :
GGML_ABORT ( " fatal error " ) ;
}
return output ;
}
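// Usage sketch (hypothetical caller with a fully loaded vocab; the exact ids
// and pieces depend on the model):
//
//   std::vector<llama_token> toks = tokenize("Hello world", /*add_special=*/true, /*parse_special=*/false);
//   // an SPM vocab typically yields [ BOS, "▁Hello", "▁world" ]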
int32_t llama_vocab : : impl : : token_to_piece ( llama_token token , char * buf , int32_t length , int32_t lstrip , bool special ) const {
// ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL ;
const llama_token_attr attr = token_get_attr ( token ) ;
if ( ! special & & ( attr & attr_special ) ) {
return 0 ;
}
// copy piece chars to output text buffer
// skip up to 'lstrip' leading spaces before copying
auto _try_copy = [ = ] ( const char * token , size_t size ) - > int32_t {
for ( int32_t i = 0 ; i < lstrip & & size & & * token = = ' ' ; + + i ) {
token + + ;
size - - ;
}
if ( length < ( int32_t ) size ) {
return - ( int32_t ) size ;
}
memcpy ( buf , token , size ) ;
return ( int32_t ) size ;
} ;
// if we have a cache - use it
{
const auto & cache = cache_token_to_piece ;
if ( ! cache . empty ( ) ) {
const auto & result = cache . at ( token ) ;
return _try_copy ( result . data ( ) , result . size ( ) ) ;
}
}
if ( 0 < = token & & token < ( int32_t ) id_to_token . size ( ) ) {
const std : : string & token_text = id_to_token [ token ] . text ;
switch ( get_type ( ) ) {
case LLAMA_VOCAB_TYPE_WPM :
case LLAMA_VOCAB_TYPE_SPM :
case LLAMA_VOCAB_TYPE_UGM : {
// NOTE: we accept all unsupported token types,
// suppressing them like CONTROL tokens.
if ( attr & ( attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED ) ) {
return _try_copy ( token_text . data ( ) , token_text . size ( ) ) ;
}
if ( attr & LLAMA_TOKEN_ATTR_NORMAL ) {
std : : string result = token_text ;
llama_unescape_whitespace ( result ) ;
return _try_copy ( result . data ( ) , result . size ( ) ) ;
}
if ( attr & LLAMA_TOKEN_ATTR_BYTE ) {
char byte = ( char ) token_to_byte ( token ) ;
return _try_copy ( ( char * ) & byte , 1 ) ;
}
break ;
}
case LLAMA_VOCAB_TYPE_BPE : {
// NOTE: we accept all unsupported token types,
// suppressing them like CONTROL tokens.
if ( attr & ( attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED ) ) {
return _try_copy ( token_text . data ( ) , token_text . size ( ) ) ;
}
if ( attr & LLAMA_TOKEN_ATTR_NORMAL ) {
std : : string result = llama_decode_text ( token_text ) ;
return _try_copy ( result . data ( ) , result . size ( ) ) ;
}
break ;
}
case LLAMA_VOCAB_TYPE_RWKV : {
std : : vector < uint8_t > result = llama_unescape_rwkv_token ( token_text ) ;
// If we don't have enough space, return an error
if ( result . size ( ) > ( size_t ) length ) {
return - ( int ) result . size ( ) ;
}
memcpy ( buf , result . data ( ) , result . size ( ) ) ;
return ( int ) result . size ( ) ;
}
default :
GGML_ABORT ( " fatal error " ) ;
}
}
return 0 ;
}
const std : : string & llama_vocab : : impl : : token_to_piece ( llama_token token ) const {
return cache_token_to_piece . at ( token ) ;
}
int32_t llama_vocab : : impl : : detokenize (
const llama_token * tokens ,
int32_t n_tokens ,
char * text ,
int32_t text_len_max ,
bool remove_special ,
bool unparse_special ) const {
if ( type = = LLAMA_VOCAB_TYPE_NONE ) {
return 0 ;
}
GGML_ASSERT ( tokenizer & & " Tokenizer not initialized. Call llama_vocab::init_tokenizer() first. " ) ;
int32_t avail = text_len_max ;
int32_t total = 0 ;
// remove the leading space
bool remove_space = add_space_prefix ;
if ( remove_special & & add_bos ) {
if ( n_tokens > 0 & & tokens [ 0 ] = = special_bos_id ) {
remove_space = false ;
n_tokens - - ;
tokens + + ;
}
}
if ( remove_special & & add_eos ) {
if ( n_tokens > 0 & & tokens [ n_tokens - 1 ] = = special_eos_id ) {
n_tokens - - ;
}
}
for ( int32_t i = 0 ; i < n_tokens ; + + i ) {
GGML_ASSERT ( avail > = 0 ) ;
int32_t n_chars = token_to_piece ( tokens [ i ] , text , avail , remove_space , unparse_special ) ;
remove_space = false ;
if ( n_chars < 0 ) {
avail = 0 ;
total - = n_chars ;
} else if ( n_chars > 0 ) {
avail - = n_chars ;
text + = n_chars ;
total + = n_chars ;
}
}
if ( total > text_len_max ) {
return - total ;
}
if ( clean_spaces ) {
text - = total ; // restart text
// first pass: characters ?!., //TODO: where do these characters come from?
const int32_t total1 = total ;
total = total ? 1 : 0 ;
for ( int32_t i = 1 ; i < total1 ; + + i ) {
const char x = text [ i ] ;
if ( text [ i - 1 ] = = ' ' ) {
if ( x = = ' ? ' | | x = = ' ! ' | | x = = ' . ' | | x = = ' , ' ) { // " ?", " !", " .", " ,"
total - - ; // remove space
}
}
text [ total + + ] = x ;
}
// second pass: strip single apostrophe between spaces
const int32_t total2 = total ;
total = total ? 1 : 0 ;
for ( int32_t i = 1 ; i < total2 ; + + i ) {
const char x = text [ i ] ;
if ( x = = ' \' ' & & i + 1 < total2 & & text [ i - 1 ] = = ' ' & & text [ i + 1 ] = = ' ' ) { // " ' "
total - - ; // remove prev space
text [ + + i ] = ' \0 ' ; // remove next space
}
text [ total + + ] = x ;
}
// third pass: apostrophe contractions //NOTE: does this make sense?
const int32_t total3 = total ;
total = total ? 1 : 0 ;
for ( int32_t i = 1 ; i < total3 ; + + i ) {
const char x = text [ i ] ;
if ( text [ i - 1 ] = = ' ' ) {
if ( x = = ' \' ' & & i + 1 < total3 ) {
const char x1 = text [ i + 1 ] ;
if ( x1 = = ' t ' | | x1 = = ' d ' ) { // " 't", " 'd"
//total--; // remove space
} else if ( x1 = = ' s ' | | x1 = = ' m ' ) { // " 's", " 'm"
total - - ; // remove space
} else if ( i + 2 < total3 ) {
const char x2 = text [ i + 2 ] ;
if ( ( x1 = = ' l ' & & x2 = = ' l ' ) ) { // " 'll"
//total--; // remove space
} else if ( ( x1 = = ' r ' & & x2 = = ' e ' ) | | ( x1 = = ' v ' & & x2 = = ' e ' ) ) { // " 're", " 've"
total - - ; // remove space
} else {
//total--; // remove space
}
} else {
//total--; // remove space
}
}
}
text [ total + + ] = x ;
}
}
return total < = text_len_max ? total : - total ;
}
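// Illustrative effect of the three clean_spaces passes above on a hypothetical
// detokenized buffer:
//
//   "Hello , it 's fine ."  ->  "Hello, it's fine."
//
// (pass 1 drops the spaces before ',' and '.'; pass 3 drops the space before
// the 's / 'm / 're / 've contractions)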
void llama_vocab : : impl : : print_info ( ) const {
LLAMA_LOG_INFO ( " %s: vocab type = %s \n " , __func__ , type_name ( ) . c_str ( ) ) ;
LLAMA_LOG_INFO ( " %s: n_vocab = %u \n " , __func__ , vocab . n_tokens ( ) ) ;
LLAMA_LOG_INFO ( " %s: n_merges = %u \n " , __func__ , ( uint32_t ) bpe_ranks . size ( ) ) ;
// special tokens
if ( special_bos_id ! = LLAMA_TOKEN_NULL ) { LLAMA_LOG_INFO ( " %s: BOS token = %d '%s' \n " , __func__ , special_bos_id , id_to_token [ special_bos_id ] . text . c_str ( ) ) ; }
if ( special_eos_id ! = LLAMA_TOKEN_NULL ) { LLAMA_LOG_INFO ( " %s: EOS token = %d '%s' \n " , __func__ , special_eos_id , id_to_token [ special_eos_id ] . text . c_str ( ) ) ; }
if ( special_eot_id ! = LLAMA_TOKEN_NULL ) { LLAMA_LOG_INFO ( " %s: EOT token = %d '%s' \n " , __func__ , special_eot_id , id_to_token [ special_eot_id ] . text . c_str ( ) ) ; }
if ( special_eom_id ! = LLAMA_TOKEN_NULL ) { LLAMA_LOG_INFO ( " %s: EOM token = %d '%s' \n " , __func__ , special_eom_id , id_to_token [ special_eom_id ] . text . c_str ( ) ) ; }
if ( special_unk_id ! = LLAMA_TOKEN_NULL ) { LLAMA_LOG_INFO ( " %s: UNK token = %d '%s' \n " , __func__ , special_unk_id , id_to_token [ special_unk_id ] . text . c_str ( ) ) ; }
if ( special_sep_id ! = LLAMA_TOKEN_NULL ) { LLAMA_LOG_INFO ( " %s: SEP token = %d '%s' \n " , __func__ , special_sep_id , id_to_token [ special_sep_id ] . text . c_str ( ) ) ; }
if ( special_pad_id ! = LLAMA_TOKEN_NULL ) { LLAMA_LOG_INFO ( " %s: PAD token = %d '%s' \n " , __func__ , special_pad_id , id_to_token [ special_pad_id ] . text . c_str ( ) ) ; }
if ( special_mask_id ! = LLAMA_TOKEN_NULL ) { LLAMA_LOG_INFO ( " %s: MASK token = %d '%s' \n " , __func__ , special_mask_id , id_to_token [ special_mask_id ] . text . c_str ( ) ) ; }
if ( linefeed_id ! = LLAMA_TOKEN_NULL ) { LLAMA_LOG_INFO ( " %s: LF token = %d '%s' \n " , __func__ , linefeed_id , id_to_token [ linefeed_id ] . text . c_str ( ) ) ; }
if ( special_fim_pre_id ! = LLAMA_TOKEN_NULL ) { LLAMA_LOG_INFO ( " %s: FIM PRE token = %d '%s' \n " , __func__ , special_fim_pre_id , id_to_token [ special_fim_pre_id ] . text . c_str ( ) ) ; }
if ( special_fim_suf_id ! = LLAMA_TOKEN_NULL ) { LLAMA_LOG_INFO ( " %s: FIM SUF token = %d '%s' \n " , __func__ , special_fim_suf_id , id_to_token [ special_fim_suf_id ] . text . c_str ( ) ) ; }
if ( special_fim_mid_id ! = LLAMA_TOKEN_NULL ) { LLAMA_LOG_INFO ( " %s: FIM MID token = %d '%s' \n " , __func__ , special_fim_mid_id , id_to_token [ special_fim_mid_id ] . text . c_str ( ) ) ; }
if ( special_fim_pad_id ! = LLAMA_TOKEN_NULL ) { LLAMA_LOG_INFO ( " %s: FIM PAD token = %d '%s' \n " , __func__ , special_fim_pad_id , id_to_token [ special_fim_pad_id ] . text . c_str ( ) ) ; }
if ( special_fim_rep_id ! = LLAMA_TOKEN_NULL ) { LLAMA_LOG_INFO ( " %s: FIM REP token = %d '%s' \n " , __func__ , special_fim_rep_id , id_to_token [ special_fim_rep_id ] . text . c_str ( ) ) ; }
if ( special_fim_sep_id ! = LLAMA_TOKEN_NULL ) { LLAMA_LOG_INFO ( " %s: FIM SEP token = %d '%s' \n " , __func__ , special_fim_sep_id , id_to_token [ special_fim_sep_id ] . text . c_str ( ) ) ; }
for ( const auto & id : special_eog_ids ) {
LLAMA_LOG_INFO ( " %s: EOG token = %d '%s' \n " , __func__ , id , id_to_token [ id ] . text . c_str ( ) ) ;
}
LLAMA_LOG_INFO ( " %s: max token length = %d \n " , __func__ , max_token_len ) ;
}
llama_vocab : : llama_vocab ( ) : pimpl ( new impl ( * this ) ) {
}
llama_vocab : : ~ llama_vocab ( ) {
}
void llama_vocab : : load ( llama_model_loader & ml , const LLM_KV & kv ) {
pimpl - > load ( ml , kv ) ;
}
enum llama_vocab_type llama_vocab : : get_type ( ) const {
return pimpl - > type ;
}
enum llama_vocab_pre_type llama_vocab : : get_pre_type ( ) const {
return pimpl - > pre_type ;
}
uint32_t llama_vocab : : n_tokens ( ) const {
return ( uint32_t ) pimpl - > id_to_token . size ( ) ;
}
uint32_t llama_vocab : : n_token_types ( ) const {
return ( uint32_t ) pimpl - > n_token_types ;
}
std : : string llama_vocab : : type_name ( ) const {
return pimpl - > type_name ( ) ;
}
bool llama_vocab : : is_normal ( llama_token id ) const {
return pimpl - > is_normal ( id ) ;
}
bool llama_vocab : : is_unknown ( llama_token id ) const {
return pimpl - > is_unknown ( id ) ;
}
bool llama_vocab : : is_control ( llama_token id ) const {
return pimpl - > is_control ( id ) ;
}
bool llama_vocab : : is_byte ( llama_token id ) const {
return pimpl - > is_byte ( id ) ;
}
bool llama_vocab : : is_user_defined ( llama_token id ) const {
return pimpl - > is_user_defined ( id ) ;
}
bool llama_vocab : : is_unused ( llama_token id ) const {
return pimpl - > is_unused ( id ) ;
}
bool llama_vocab : : is_eog ( llama_token id ) const {
return pimpl - > is_eog ( id ) ;
}
uint8_t llama_vocab : : token_to_byte ( llama_token id ) const {
return pimpl - > token_to_byte ( id ) ;
}
llama_token llama_vocab : : byte_to_token ( uint8_t ch ) const {
GGML_ASSERT ( get_type ( ) ! = LLAMA_VOCAB_TYPE_NONE ) ;
static const char * hex = " 0123456789ABCDEF " ;
switch ( get_type ( ) ) {
case LLAMA_VOCAB_TYPE_SPM :
case LLAMA_VOCAB_TYPE_UGM : {
const char buf [ 7 ] = { ' < ' , ' 0 ' , ' x ' , hex [ ch > > 4 ] , hex [ ch & 15 ] , ' > ' , 0 } ;
auto token = pimpl - > token_to_id . find ( buf ) ;
if ( token ! = pimpl - > token_to_id . end ( ) ) {
return ( * token ) . second ;
}
// Try to fall back to just the byte as a string
const char buf2 [ 2 ] = { ( char ) ch , 0 } ;
return pimpl - > token_to_id . at ( buf2 ) ;
}
case LLAMA_VOCAB_TYPE_WPM :
case LLAMA_VOCAB_TYPE_BPE : {
return pimpl - > token_to_id . at ( unicode_byte_to_utf8 ( ch ) ) ;
}
default :
GGML_ABORT ( " fatal error " ) ;
}
}
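// Example: for an SPM/UGM vocab, byte_to_token(0x41) first looks up the byte
// token "<0x41>" and, if the vocab lacks byte tokens, falls back to the
// literal single-character string "A".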
llama_token llama_vocab : : text_to_token ( const std : : string & text ) const {
GGML_ASSERT ( pimpl - > type ! = LLAMA_VOCAB_TYPE_NONE ) ;
auto it = pimpl - > token_to_id . find ( text ) ;
if ( it ! = pimpl - > token_to_id . end ( ) ) {
return ( * it ) . second ;
}
return LLAMA_TOKEN_NULL ;
}
const llama_vocab : : token_data & llama_vocab : : get_token_data ( llama_token id ) const {
GGML_ASSERT ( pimpl - > type ! = LLAMA_VOCAB_TYPE_NONE ) ;
return pimpl - > id_to_token . at ( id ) ;
}
const char * llama_vocab : : token_get_text ( llama_token id ) const {
GGML_ASSERT ( pimpl - > type ! = LLAMA_VOCAB_TYPE_NONE ) ;
return pimpl - > id_to_token . at ( id ) . text . c_str ( ) ;
}
float llama_vocab : : token_get_score ( llama_token id ) const {
GGML_ASSERT ( pimpl - > type ! = LLAMA_VOCAB_TYPE_NONE ) ;
return pimpl - > id_to_token . at ( id ) . score ;
}
llama_token_attr llama_vocab : : token_get_attr ( llama_token id ) const {
return pimpl - > token_get_attr ( id ) ;
}
llama_token llama_vocab : : token_bos ( ) const {
return pimpl - > special_bos_id ;
}
llama_token llama_vocab : : token_eos ( ) const {
return pimpl - > special_eos_id ;
}
llama_token llama_vocab : : token_eot ( ) const {
return pimpl - > special_eot_id ;
}
llama_token llama_vocab : : token_eom ( ) const {
return pimpl - > special_eom_id ;
}
llama_token llama_vocab : : token_unk ( ) const {
return pimpl - > special_unk_id ;
}
llama_token llama_vocab : : token_sep ( ) const {
return pimpl - > special_sep_id ;
}
llama_token llama_vocab : : token_nl ( ) const {
return pimpl - > linefeed_id ;
}
llama_token llama_vocab : : token_pad ( ) const {
return pimpl - > special_pad_id ;
}
llama_token llama_vocab : : token_prefix ( ) const {
return pimpl - > special_fim_pre_id ;
}
llama_token llama_vocab : : token_middle ( ) const {
return pimpl - > special_fim_mid_id ;
}
llama_token llama_vocab : : token_suffix ( ) const {
return pimpl - > special_fim_suf_id ;
}
llama_token llama_vocab : : token_fim_pre ( ) const {
return pimpl - > special_fim_pre_id ;
}
llama_token llama_vocab : : token_fim_suf ( ) const {
return pimpl - > special_fim_suf_id ;
}
llama_token llama_vocab : : token_fim_mid ( ) const {
return pimpl - > special_fim_mid_id ;
}
llama_token llama_vocab : : token_fim_pad ( ) const {
return pimpl - > special_fim_pad_id ;
}
llama_token llama_vocab : : token_fim_rep ( ) const {
return pimpl - > special_fim_rep_id ;
}
llama_token llama_vocab : : token_fim_sep ( ) const {
return pimpl - > special_fim_sep_id ;
}
bool llama_vocab : : get_add_space_prefix ( ) const {
return pimpl - > add_space_prefix ;
}
bool llama_vocab : : get_add_bos ( ) const {
return pimpl - > add_bos ;
}
bool llama_vocab : : get_add_eos ( ) const {
return pimpl - > add_eos ;
}
bool llama_vocab : : get_ignore_merges ( ) const {
return pimpl - > ignore_merges ;
}
bool llama_vocab : : get_clean_spaces ( ) const {
return pimpl - > clean_spaces ;
}
bool llama_vocab : : get_remove_extra_whitespaces ( ) const {
return pimpl - > remove_extra_whitespaces ;
}
bool llama_vocab : : get_escape_whitespaces ( ) const {
return pimpl - > escape_whitespaces ;
}
bool llama_vocab : : get_treat_whitespace_as_suffix ( ) const {
return pimpl - > treat_whitespace_as_suffix ;
}
int llama_vocab : : max_token_len ( ) const {
return pimpl - > max_token_len ;
}
int llama_vocab : : find_bpe_rank ( const std : : string & token_left , const std : : string & token_right ) const {
GGML_ASSERT ( token_left . find ( ' ' ) = = std : : string : : npos ) ;
GGML_ASSERT ( token_left . find ( ' \n ' ) = = std : : string : : npos ) ;
GGML_ASSERT ( token_right . find ( ' ' ) = = std : : string : : npos ) ;
GGML_ASSERT ( token_right . find ( ' \n ' ) = = std : : string : : npos ) ;
auto it = pimpl - > bpe_ranks . find ( std : : make_pair ( token_left , token_right ) ) ;
if ( it = = pimpl - > bpe_ranks . end ( ) ) {
return - 1 ;
}
return it - > second ;
}
int32_t llama_vocab : : tokenize (
const char * text ,
int32_t text_len ,
llama_token * tokens ,
int32_t n_tokens_max ,
bool add_special ,
bool parse_special ) const {
auto res = tokenize ( std : : string ( text , text_len ) , add_special , parse_special ) ;
if ( n_tokens_max < ( int ) res . size ( ) ) {
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
return - ( ( int ) res . size ( ) ) ;
}
for ( size_t i = 0 ; i < res . size ( ) ; i + + ) {
tokens [ i ] = res [ i ] ;
}
return res . size ( ) ;
}
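// A negative return value is the negated number of tokens required, so a
// caller that does not know the output size can make two passes (hypothetical
// sketch; note that the text is tokenized twice):
//
//   int32_t n = vocab.tokenize(text, text_len, nullptr, 0, true, false);
//   std::vector<llama_token> toks(n < 0 ? -n : n);
//   n = vocab.tokenize(text, text_len, toks.data(), (int32_t) toks.size(), true, false);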
std : : vector < llama_token > llama_vocab : : tokenize (
const std : : string & raw_text ,
bool add_special ,
bool parse_special ) const {
return pimpl - > tokenize ( raw_text , add_special , parse_special ) ;
}
const std : : string & llama_vocab : : token_to_piece ( llama_token token ) const {
return pimpl - > token_to_piece ( token ) ;
}
int32_t llama_vocab : : token_to_piece ( llama_token token , char * buf , int32_t length , int32_t lstrip , bool special ) const {
return pimpl - > token_to_piece ( token , buf , length , lstrip , special ) ;
}
int32_t llama_vocab : : detokenize (
const llama_token * tokens ,
int32_t n_tokens ,
char * text ,
int32_t text_len_max ,
bool remove_special ,
bool unparse_special ) const {
return pimpl - > detokenize ( tokens , n_tokens , text , text_len_max , remove_special , unparse_special ) ;
}
std : : string llama_vocab : : detokenize ( const std : : vector < llama_token > & tokens , bool special ) const {
std : : string text ;
text . resize ( std : : max ( text . capacity ( ) , tokens . size ( ) ) ) ;
int32_t n_chars = detokenize ( tokens . data ( ) , ( int32_t ) tokens . size ( ) , & text [ 0 ] , ( int32_t ) text . size ( ) , false , special ) ;
if ( n_chars < 0 ) {
text . resize ( - n_chars ) ;
n_chars = detokenize ( tokens . data ( ) , ( int32_t ) tokens . size ( ) , & text [ 0 ] , ( int32_t ) text . size ( ) , false , special ) ;
GGML_ASSERT ( n_chars < = ( int32_t ) text . size ( ) ) ; // whitespace trimming is performed after per-token detokenization
}
text . resize ( n_chars ) ;
// NOTE: the original tokenizer decodes bytes after collecting the pieces.
return text ;
}
void llama_vocab : : print_info ( ) const {
pimpl - > print_info ( ) ;
}
//
// interface implementation
//
int32_t llama_vocab_n_tokens ( const struct llama_vocab * vocab ) {
return vocab - > n_tokens ( ) ;
}
// deprecated
int32_t llama_n_vocab ( const struct llama_vocab * vocab ) {
return llama_vocab_n_tokens ( vocab ) ;
}
enum llama_vocab_type llama_vocab_type ( const struct llama_vocab * vocab ) {
return vocab - > get_type ( ) ;
}
const char * llama_vocab_get_text ( const struct llama_vocab * vocab , llama_token token ) {
return vocab - > token_get_text ( token ) ;
}
float llama_vocab_get_score ( const struct llama_vocab * vocab , llama_token token ) {
return vocab - > token_get_score ( token ) ;
}
enum llama_token_attr llama_vocab_get_attr ( const struct llama_vocab * vocab , llama_token token ) {
return vocab - > token_get_attr ( token ) ;
}
bool llama_vocab_is_eog ( const struct llama_vocab * vocab , llama_token token ) {
return vocab - > is_eog ( token ) ;
}
bool llama_vocab_is_control ( const struct llama_vocab * vocab , llama_token token ) {
return vocab - > is_control ( token ) ;
}
llama_token llama_vocab_bos ( const struct llama_vocab * vocab ) {
return vocab - > token_bos ( ) ;
}
llama_token llama_vocab_eos ( const struct llama_vocab * vocab ) {
return vocab - > token_eos ( ) ;
}
llama_token llama_vocab_eot ( const struct llama_vocab * vocab ) {
return vocab - > token_eot ( ) ;
}
// deprecated
llama_token llama_vocab_cls ( const struct llama_vocab * vocab ) {
return vocab - > token_bos ( ) ;
}
llama_token llama_vocab_sep ( const struct llama_vocab * vocab ) {
return vocab - > token_sep ( ) ;
}
llama_token llama_vocab_nl ( const struct llama_vocab * vocab ) {
return vocab - > token_nl ( ) ;
}
llama_token llama_vocab_pad ( const struct llama_vocab * vocab ) {
return vocab - > token_pad ( ) ;
}
bool llama_vocab_get_add_bos ( const struct llama_vocab * vocab ) {
return vocab - > get_add_bos ( ) ;
}
bool llama_vocab_get_add_eos ( const struct llama_vocab * vocab ) {
return vocab - > get_add_eos ( ) ;
}
llama_token llama_vocab_fim_pre ( const struct llama_vocab * vocab ) {
return vocab - > token_fim_pre ( ) ;
}
llama_token llama_vocab_fim_suf ( const struct llama_vocab * vocab ) {
return vocab - > token_fim_suf ( ) ;
}
llama_token llama_vocab_fim_mid ( const struct llama_vocab * vocab ) {
return vocab - > token_fim_mid ( ) ;
}
llama_token llama_vocab_fim_pad ( const struct llama_vocab * vocab ) {
return vocab - > token_fim_pad ( ) ;
}
llama_token llama_vocab_fim_rep ( const struct llama_vocab * vocab ) {
return vocab - > token_fim_rep ( ) ;
}
llama_token llama_vocab_fim_sep ( const struct llama_vocab * vocab ) {
return vocab - > token_fim_sep ( ) ;
}
// deprecated
const char * llama_token_get_text ( const struct llama_vocab * vocab , llama_token token ) {
return llama_vocab_get_text ( vocab , token ) ;
}
// deprecated
float llama_token_get_score ( const struct llama_vocab * vocab , llama_token token ) {
return llama_vocab_get_score ( vocab , token ) ;
}
// deprecated
enum llama_token_attr llama_token_get_attr ( const struct llama_vocab * vocab , llama_token token ) {
return llama_vocab_get_attr ( vocab , token ) ;
}
// deprecated
bool llama_token_is_eog ( const struct llama_vocab * vocab , llama_token token ) {
return llama_vocab_is_eog ( vocab , token ) ;
}
// deprecated
bool llama_token_is_control ( const struct llama_vocab * vocab , llama_token token ) {
return llama_vocab_is_control ( vocab , token ) ;
}
// deprecated
llama_token llama_token_bos ( const struct llama_vocab * vocab ) {
return llama_vocab_bos ( vocab ) ;
}
// deprecated
llama_token llama_token_eos ( const struct llama_vocab * vocab ) {
return llama_vocab_eos ( vocab ) ;
}
// deprecated
llama_token llama_token_eot ( const struct llama_vocab * vocab ) {
return llama_vocab_eot ( vocab ) ;
}
// deprecated
llama_token llama_token_cls ( const struct llama_vocab * vocab ) {
//return llama_vocab_cls(vocab);
return llama_vocab_bos ( vocab ) ; // avoid deprecation warning
}
// deprecated
llama_token llama_token_sep ( const struct llama_vocab * vocab ) {
return llama_vocab_sep ( vocab ) ;
}
// deprecated
llama_token llama_token_nl ( const struct llama_vocab * vocab ) {
return llama_vocab_nl ( vocab ) ;
}
// deprecated
llama_token llama_token_pad ( const struct llama_vocab * vocab ) {
return llama_vocab_pad ( vocab ) ;
}
// deprecated
bool llama_add_bos_token ( const struct llama_vocab * vocab ) {
return llama_vocab_get_add_bos ( vocab ) ;
}
// deprecated
bool llama_add_eos_token ( const struct llama_vocab * vocab ) {
return llama_vocab_get_add_eos ( vocab ) ;
}
// deprecated
llama_token llama_token_fim_pre ( const struct llama_vocab * vocab ) {
return llama_vocab_fim_pre ( vocab ) ;
}
// deprecated
llama_token llama_token_fim_suf ( const struct llama_vocab * vocab ) {
return llama_vocab_fim_suf ( vocab ) ;
}
// deprecated
llama_token llama_token_fim_mid ( const struct llama_vocab * vocab ) {
return llama_vocab_fim_mid ( vocab ) ;
}
// deprecated
llama_token llama_token_fim_pad ( const struct llama_vocab * vocab ) {
return llama_vocab_fim_pad ( vocab ) ;
}
// deprecated
llama_token llama_token_fim_rep ( const struct llama_vocab * vocab ) {
return llama_vocab_fim_rep ( vocab ) ;
}
// deprecated
llama_token llama_token_fim_sep ( const struct llama_vocab * vocab ) {
return llama_vocab_fim_sep ( vocab ) ;
}
//
// tokenization
//
int32_t llama_tokenize (
const struct llama_vocab * vocab ,
const char * text ,
int32_t text_len ,
llama_token * tokens ,
int32_t n_tokens_max ,
bool add_special ,
bool parse_special ) {
return vocab - > tokenize ( text , text_len , tokens , n_tokens_max , add_special , parse_special ) ;
}
int32_t llama_token_to_piece (
const struct llama_vocab * vocab ,
llama_token token ,
char * buf ,
int32_t length ,
int32_t lstrip ,
bool special ) {
return vocab - > token_to_piece ( token , buf , length , lstrip , special ) ;
}
int32_t llama_detokenize (
const struct llama_vocab * vocab ,
const llama_token * tokens ,
int32_t n_tokens ,
char * text ,
int32_t text_len_max ,
bool remove_special ,
bool unparse_special ) {
return vocab - > detokenize ( tokens , n_tokens , text , text_len_max , remove_special , unparse_special ) ;
}
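// Example use of the C tokenization API above (hypothetical, assuming an
// initialized struct llama_vocab * vocab):
//
//   llama_token toks[64];
//   const char * text = "Hello";
//   int32_t n = llama_tokenize(vocab, text, 5, toks, 64, /*add_special=*/true, /*parse_special=*/false);
//   char buf[256];
//   int32_t m = llama_detokenize(vocab, toks, n, buf, sizeof(buf), /*remove_special=*/false, /*unparse_special=*/false);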