main : add some comments for the word-level timestamp algorithm

2025-06-13 20:48:08 +00:00 · 2022-11-01 22:35:21 +02:00
parent 0729da9a3b
commit 6fb98370ba
1 changed file with 274 additions and 266 deletions
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -321,12 +321,11 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
 }

 // word-level timestamps (experimental)
-// TODO: probably still has bugs, needs refactoring, etc..
-// TODO: auto threshold
+// TODO: make ffmpeg output optional
 // TODO: extra pass to detect unused speech and assign to tokens
 // TODO: font parameter adjustments
+// TODO: move to whisper.h/whisper.cpp and add parameter to select max line-length of subtitles
 bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, const std::vector<float> & pcmf32) {
-    if (params.output_wts) {
    std::vector<float> pcm_avg(pcmf32.size(), 0);

    // average the fabs of the signal
@@ -421,7 +420,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
            tokens[j].ptsum = token.ptsum;

            tokens[j].text = whisper_token_to_str(ctx, token.id);
-                //tokens[j].vlen = tokens[j].pt;
            tokens[j].vlen = voice_length(tokens[j].text);

            if (token.pt > params.word_thold && token.ptsum > 0.01 && token.tid > tid_last && tt <= t1) {
@@ -439,6 +437,9 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f

        t_last = t1;

+        // find intervals of tokens with unknown timestamps
+        // fill the timestamps by proportionally splitting the interval based on the token voice lengths
+        {
            int p0 = 0;
            int p1 = 0;
            while (true) {
@@ -460,10 +461,9 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f

                    const double dt = tokens[p1].t1 - tokens[p0].t0;

+                    // split the time proportionally to the voice length
                    for (int j = p0 + 1; j <= p1; j++) {
                        const double ct = tokens[j - 1].t0 + dt*tokens[j - 1].vlen/psum;
-                        //const double ct = tokens[j - 1].t0 + (dt*(j - p0))/(p1 - p0 + 1);
-                        //const double ct = tokens[p0].t0 + (dt*(j - p0))/(p1 - p0 + 1);

                        tokens[j - 1].t1 = ct;
                        tokens[j    ].t0 = ct;
@@ -476,7 +476,9 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
                    break;
                }
            }
+        }

+        // fix up (just in case)
        for (int j = 0; j < n - 1; j++) {
            if (tokens[j].t1 < 0) {
                tokens[j + 1].t0 = tokens[j].t1;
@@ -494,6 +496,7 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
        }

        // VAD
+        // expand or contract tokens based on voice activity
        {
            const int hw = WHISPER_SAMPLE_RATE/8;

@@ -565,6 +568,8 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
            }
        }

+        // fixed token expand (optional)
+        {
            const int t_expand = 0;

            for (int j = 0; j < n; j++) {
@@ -575,7 +580,10 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
                    tokens[j].t1 = tokens[j].t1 + t_expand;
                }
            }
+        }

+        // debug info
+        // TODO: toggle via parameter
        for (int j = 0; j < n; ++j) {
            const auto & token = tokens[j];
            const auto tt = token.pt > params.word_thold && token.ptsum > 0.01 ? whisper_token_to_str(ctx, token.tid) : "[?]";
@@ -591,6 +599,7 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
            //fout << "# " << to_timestamp(token.t0) << " --> " << to_timestamp(token.t1) << " " << whisper_token_to_str(ctx, token.id) << "\n";
        }

+        // TODO: become parameters
        static const int line_wrap = 60;
        static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";

@@ -686,7 +695,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
    fout.close();

    fprintf(stderr, "%s: run 'source %s' to generate karaoke video\n", __func__, fname);
-    }

    return true;
 }