mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2024-12-19 04:37:51 +00:00
main : add some comments for the word-level timestamp algorithm
This commit is contained in:
parent
0729da9a3b
commit
6fb98370ba
@ -321,12 +321,11 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
|
||||
}
|
||||
|
||||
// word-level timestamps (experimental)
|
||||
// TODO: probably still has bugs, needs refactoring, etc.
|
||||
// TODO: auto threshold
|
||||
// TODO: make ffmpeg output optional
|
||||
// TODO: extra pass to detect unused speech and assign to tokens
|
||||
// TODO: font parameter adjustments
|
||||
// TODO: move to whisper.h/whisper.cpp and add parameter to select max line-length of subtitles
|
||||
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, const std::vector<float> & pcmf32) {
|
||||
if (params.output_wts) {
|
||||
std::vector<float> pcm_avg(pcmf32.size(), 0);
|
||||
|
||||
// average the fabs of the signal
|
||||
@ -421,7 +420,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
||||
tokens[j].ptsum = token.ptsum;
|
||||
|
||||
tokens[j].text = whisper_token_to_str(ctx, token.id);
|
||||
//tokens[j].vlen = tokens[j].pt;
|
||||
tokens[j].vlen = voice_length(tokens[j].text);
|
||||
|
||||
if (token.pt > params.word_thold && token.ptsum > 0.01 && token.tid > tid_last && tt <= t1) {
|
||||
@ -439,6 +437,9 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
||||
|
||||
t_last = t1;
|
||||
|
||||
// find intervals of tokens with unknown timestamps
|
||||
// fill the timestamps by proportionally splitting the interval based on the token voice lengths
|
||||
{
|
||||
int p0 = 0;
|
||||
int p1 = 0;
|
||||
while (true) {
|
||||
@ -460,10 +461,9 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
||||
|
||||
const double dt = tokens[p1].t1 - tokens[p0].t0;
|
||||
|
||||
// split the time proportionally to the voice length
|
||||
for (int j = p0 + 1; j <= p1; j++) {
|
||||
const double ct = tokens[j - 1].t0 + dt*tokens[j - 1].vlen/psum;
|
||||
//const double ct = tokens[j - 1].t0 + (dt*(j - p0))/(p1 - p0 + 1);
|
||||
//const double ct = tokens[p0].t0 + (dt*(j - p0))/(p1 - p0 + 1);
|
||||
|
||||
tokens[j - 1].t1 = ct;
|
||||
tokens[j ].t0 = ct;
|
||||
@ -476,7 +476,9 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fix up (just in case)
|
||||
for (int j = 0; j < n - 1; j++) {
|
||||
if (tokens[j].t1 < 0) {
|
||||
tokens[j + 1].t0 = tokens[j].t1;
|
||||
@ -494,6 +496,7 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
||||
}
|
||||
|
||||
// VAD
|
||||
// expand or contract tokens based on voice activity
|
||||
{
|
||||
const int hw = WHISPER_SAMPLE_RATE/8;
|
||||
|
||||
@ -565,6 +568,8 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
||||
}
|
||||
}
|
||||
|
||||
// fixed token expand (optional)
|
||||
{
|
||||
const int t_expand = 0;
|
||||
|
||||
for (int j = 0; j < n; j++) {
|
||||
@ -575,7 +580,10 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
||||
tokens[j].t1 = tokens[j].t1 + t_expand;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// debug info
|
||||
// TODO: toggle via parameter
|
||||
for (int j = 0; j < n; ++j) {
|
||||
const auto & token = tokens[j];
|
||||
const auto tt = token.pt > params.word_thold && token.ptsum > 0.01 ? whisper_token_to_str(ctx, token.tid) : "[?]";
|
||||
@ -591,6 +599,7 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
||||
//fout << "# " << to_timestamp(token.t0) << " --> " << to_timestamp(token.t1) << " " << whisper_token_to_str(ctx, token.id) << "\n";
|
||||
}
|
||||
|
||||
// TODO: make these parameters
|
||||
static const int line_wrap = 60;
|
||||
static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
|
||||
|
||||
@ -686,7 +695,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
||||
fout.close();
|
||||
|
||||
fprintf(stderr, "%s: run 'source %s' to generate karaoke video\n", __func__, fname);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user