whisper : fix UB when reading buffer of length 0 bytes (#265)

This commit is contained in:
Georgi Gerganov 2022-12-13 23:13:55 +02:00
parent f66ac6dc4f
commit 124c718c73
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735

View File

@@ -549,13 +549,20 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
//} //}
std::string word; std::string word;
std::vector<char> tmp;
for (int i = 0; i < n_vocab; i++) { for (int i = 0; i < n_vocab; i++) {
uint32_t len; uint32_t len;
read_safe(fin, len); read_safe(fin, len);
std::vector<char> tmp(len); // create a buffer if (len > 0) {
fin.read( &tmp[0], tmp.size() ); // read to buffer tmp.resize(len);
word.assign(&tmp[0], tmp.size()); fin.read(&tmp[0], tmp.size()); // read to buffer
word.assign(&tmp[0], tmp.size());
} else {
// seems like we have an empty-string token in multi-language models (i = 50256)
//fprintf(stderr, "%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
word = "";
}
vocab.token_to_id[word] = i; vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word; vocab.id_to_token[i] = word;