whisper : split_on_word no longer trims (#1046)

This commit is contained in:
Georgi Gerganov 2023-06-25 23:51:01 +03:00 committed by GitHub
parent 3f7a03ebe3
commit 72deb41eb2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -3401,26 +3401,6 @@ static void whisper_exp_compute_token_level_timestamps(
float thold_pt,
float thold_ptsum);
// Strip leading whitespace from `s`, modifying the string in place.
// Characters are classified with std::isspace after conversion to
// unsigned char, so bytes with the high bit set are handled safely.
static inline void ltrim(std::string &s) {
    auto first_kept = s.begin();
    while (first_kept != s.end() &&
           std::isspace(static_cast<unsigned char>(*first_kept))) {
        ++first_kept;
    }
    // Everything before the first non-space character is discarded.
    s.erase(s.begin(), first_kept);
}
// Strip trailing whitespace from `s`, modifying the string in place.
// Characters are classified with std::isspace after conversion to
// unsigned char, so bytes with the high bit set are handled safely.
static inline void rtrim(std::string &s) {
    std::string::size_type keep = s.size();
    while (keep > 0 &&
           std::isspace(static_cast<unsigned char>(s[keep - 1]))) {
        --keep;
    }
    // `keep` is the length of the prefix that ends in a non-space character.
    s.erase(keep);
}
// Remove whitespace from both the front and the back of `s` (in place).
static inline void trim(std::string &s) {
    rtrim(s); // drop trailing whitespace first
    ltrim(s); // then drop leading whitespace
}
static inline bool should_split_on_word(const char * txt, bool split_on_word) {
if (!split_on_word) return true;
@ -3447,11 +3427,6 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta
const int cur = strlen(txt);
if (acc + cur > max_len && i > 0 && should_split_on_word(txt, split_on_word)) {
// split here
if (split_on_word) {
trim(text);
}
state.result_all.back().text = std::move(text);
state.result_all.back().t1 = token.t0;
state.result_all.back().tokens.resize(i);
@ -3479,9 +3454,6 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta
}
}
if (split_on_word) {
trim(text);
}
state.result_all.back().text = std::move(text);
return res;