main : add some comments for the word-level timestamp algorithm

This commit is contained in:
Georgi Gerganov 2022-11-01 22:35:21 +02:00
parent 0729da9a3b
commit 6fb98370ba
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735

View File

@ -321,12 +321,11 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
}
// word-level timestamps (experimental)
// TODO: probably still has bugs, needs refactoring, etc.
// TODO: auto threshold
// TODO: make ffmpeg output optional
// TODO: extra pass to detect unused speech and assign to tokens
// TODO: font parameter adjustments
// TODO: move to whisper.h/whisper.cpp and add parameter to select max line-length of subtitles
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, const std::vector<float> & pcmf32) {
if (params.output_wts) {
std::vector<float> pcm_avg(pcmf32.size(), 0);
// average the fabs of the signal
@ -421,7 +420,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
tokens[j].ptsum = token.ptsum;
tokens[j].text = whisper_token_to_str(ctx, token.id);
//tokens[j].vlen = tokens[j].pt;
tokens[j].vlen = voice_length(tokens[j].text);
if (token.pt > params.word_thold && token.ptsum > 0.01 && token.tid > tid_last && tt <= t1) {
@ -439,6 +437,9 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
t_last = t1;
// find intervals of tokens with unknown timestamps
// fill the timestamps by proportionally splitting the interval based on the token voice lengths
{
int p0 = 0;
int p1 = 0;
while (true) {
@ -460,10 +461,9 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
const double dt = tokens[p1].t1 - tokens[p0].t0;
// split the time proportionally to the voice length
for (int j = p0 + 1; j <= p1; j++) {
const double ct = tokens[j - 1].t0 + dt*tokens[j - 1].vlen/psum;
//const double ct = tokens[j - 1].t0 + (dt*(j - p0))/(p1 - p0 + 1);
//const double ct = tokens[p0].t0 + (dt*(j - p0))/(p1 - p0 + 1);
tokens[j - 1].t1 = ct;
tokens[j ].t0 = ct;
@ -476,7 +476,9 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
break;
}
}
}
// fix up (just in case)
for (int j = 0; j < n - 1; j++) {
if (tokens[j].t1 < 0) {
tokens[j + 1].t0 = tokens[j].t1;
@ -494,6 +496,7 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
}
// VAD
// expand or contract tokens based on voice activity
{
const int hw = WHISPER_SAMPLE_RATE/8;
@ -565,6 +568,8 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
}
}
// fixed token expand (optional)
{
const int t_expand = 0;
for (int j = 0; j < n; j++) {
@ -575,7 +580,10 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
tokens[j].t1 = tokens[j].t1 + t_expand;
}
}
}
// debug info
// TODO: toggle via parameter
for (int j = 0; j < n; ++j) {
const auto & token = tokens[j];
const auto tt = token.pt > params.word_thold && token.ptsum > 0.01 ? whisper_token_to_str(ctx, token.tid) : "[?]";
@ -591,6 +599,7 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
//fout << "# " << to_timestamp(token.t0) << " --> " << to_timestamp(token.t1) << " " << whisper_token_to_str(ctx, token.id) << "\n";
}
// TODO: make these parameters
static const int line_wrap = 60;
static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
@ -686,7 +695,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
fout.close();
fprintf(stderr, "%s: run 'source %s' to generate karaoke video\n", __func__, fname);
}
return true;
}