From 124c718c73f915f3e4235ae2af8841356e76177d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Dec 2022 23:13:55 +0200 Subject: [PATCH] whisper : fix UB when reading buffer of length 0 bytes (#265) --- whisper.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index 33f258bc..1bc79967 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -549,13 +549,20 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx //} std::string word; + std::vector<char> tmp; for (int i = 0; i < n_vocab; i++) { uint32_t len; read_safe(fin, len); - std::vector<char> tmp(len); // create a buffer - fin.read( &tmp[0], tmp.size() ); // read to buffer - word.assign(&tmp[0], tmp.size()); + if (len > 0) { + tmp.resize(len); + fin.read(&tmp[0], tmp.size()); // read to buffer + word.assign(&tmp[0], tmp.size()); + } else { + // seems like we have an empty-string token in multi-language models (i = 50256) + //fprintf(stderr, "%s: warning: empty-string token in vocab, i = %d\n", __func__, i); + word = ""; + } vocab.token_to_id[word] = i; vocab.id_to_token[i] = word;