whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty well overall. The algorithm has been moved from main.cpp to whisper.cpp and can be reused for all subtitle types. This means that you can now specify the maximum length of the generated lines: simply provide the "-ml" argument with the maximum length in characters.
2025-06-12 20:18:08 +00:00 · 2022-11-02 21:18:20 +02:00
parent 4b1c32e8ea
commit d5afebd37c
5 changed files with 518 additions and 370 deletions
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -36,6 +36,7 @@ std::string to_timestamp(int64_t t, bool comma = false) {
    return std::string(buf);
 }

+// helper function to replace substrings
 void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    for (size_t pos = 0; ; pos += replace.length()) {
        pos = s.find(search, pos);
@ -45,31 +46,6 @@ void replace_all(std::string & s, const std::string & search, const std::string
    }
 }

-// a cost-function that is high for text that takes longer to pronounce
-float voice_length(const std::string & text) {
-    float res = 0.0f;
-
-    for (size_t i = 0; i < text.size(); ++i) {
-        if (text[i] == ' ') {
-            res += 0.01f;
-        } else if (text[i] == ',') {
-            res += 2.00f;
-        } else if (text[i] == '.') {
-            res += 3.00f;
-        } else if (text[i] == '!') {
-            res += 3.00f;
-        } else if (text[i] == '?') {
-            res += 3.00f;
-        } else if (text[i] >= '0' && text[i] <= '9') {
-            res += 3.00f;
-        } else {
-            res += 1.00f;
-        }
-    }
-
-    return res;
-}
-
 // command-line parameters
 struct whisper_params {
    int32_t seed         = -1; // RNG seed, not used currently
@ -78,6 +54,7 @@ struct whisper_params {
    int32_t offset_t_ms  = 0;
    int32_t offset_n     = 0;
    int32_t max_context  = -1;
+    int32_t max_len      = 0;

    float word_thold = 0.01f;

@ -120,6 +97,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
            params.offset_n = std::stoi(argv[++i]);
        } else if (arg == "-mc" || arg == "--max-context") {
            params.max_context = std::stoi(argv[++i]);
+        } else if (arg == "-ml" || arg == "--max-len") {
+            params.max_len = std::stoi(argv[++i]);
        } else if (arg == "-wt" || arg == "--word-thold") {
            params.word_thold = std::stof(argv[++i]);
        } else if (arg == "-v" || arg == "--verbose") {
@ -176,13 +155,14 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
    fprintf(stderr, "  -ot N,    --offset-t N     time offset in milliseconds (default: %d)\n", params.offset_t_ms);
    fprintf(stderr, "  -on N,    --offset-n N     segment index offset (default: %d)\n", params.offset_n);
    fprintf(stderr, "  -mc N,    --max-context N  maximum number of text context tokens to store (default: max)\n");
+    fprintf(stderr, "  -ml N,    --max-len N      maximum segment length in characters (default: %d)\n", params.max_len);
    fprintf(stderr, "  -wt N,    --word-thold N   word timestamp probability threshold (default: %f)\n", params.word_thold);
    fprintf(stderr, "  -v,       --verbose        verbose output\n");
    fprintf(stderr, "            --translate      translate from source language to english\n");
    fprintf(stderr, "  -otxt,    --output-txt     output result in a text file\n");
    fprintf(stderr, "  -ovtt,    --output-vtt     output result in a vtt file\n");
    fprintf(stderr, "  -osrt,    --output-srt     output result in a srt file\n");
-    fprintf(stderr, "  -owts,    --output-words   output word-level timestamps to a text file\n");
+    fprintf(stderr, "  -owts,    --output-words   output script for generating karaoke video\n");
    fprintf(stderr, "  -ps,      --print_special  print special tokens\n");
    fprintf(stderr, "  -pc,      --print_colors   print colors\n");
    fprintf(stderr, "  -nt,      --no_timestamps  do not print timestamps\n");
@ -192,65 +172,67 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
    fprintf(stderr, "\n");
 }

-void whisper_print_segment_callback(struct whisper_context * ctx, void * user_data) {
+void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
    const whisper_params & params = *(whisper_params *) user_data;

    const int n_segments = whisper_full_n_segments(ctx);

-    // print the last segment
-    const int i = n_segments - 1;
-    if (i == 0) {
+    // print the last n_new segments
+    const int s0 = n_segments - n_new;
+    if (s0 == 0) {
        printf("\n");
    }

-    if (params.no_timestamps) {
-        if (params.print_colors) {
-            for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
-                if (params.print_special_tokens == false) {
-                    const whisper_token id = whisper_full_get_token_id(ctx, i, j);
-                    if (id >= whisper_token_eot(ctx)) {
-                        continue;
+    for (int i = s0; i < n_segments; i++) {
+        if (params.no_timestamps) {
+            if (params.print_colors) {
+                for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
+                    if (params.print_special_tokens == false) {
+                        const whisper_token id = whisper_full_get_token_id(ctx, i, j);
+                        if (id >= whisper_token_eot(ctx)) {
+                            continue;
+                        }
                    }
+
+                    const char * text = whisper_full_get_token_text(ctx, i, j);
+                    const float  p    = whisper_full_get_token_p   (ctx, i, j);
+
+                    const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
+
+                    printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
                }
-
-                const char * text = whisper_full_get_token_text(ctx, i, j);
-                const float  p    = whisper_full_get_token_p   (ctx, i, j);
-
-                const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
-
-                printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
+            } else {
+                const char * text = whisper_full_get_segment_text(ctx, i);
+                printf("%s", text);
            }
+            fflush(stdout);
        } else {
-            const char * text = whisper_full_get_segment_text(ctx, i);
-            printf("%s", text);
-        }
-        fflush(stdout);
-    } else {
-        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

-        if (params.print_colors) {
-            printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
-            for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
-                if (params.print_special_tokens == false) {
-                    const whisper_token id = whisper_full_get_token_id(ctx, i, j);
-                    if (id >= whisper_token_eot(ctx)) {
-                        continue;
+            if (params.print_colors) {
+                printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
+                for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
+                    if (params.print_special_tokens == false) {
+                        const whisper_token id = whisper_full_get_token_id(ctx, i, j);
+                        if (id >= whisper_token_eot(ctx)) {
+                            continue;
+                        }
                    }
+
+                    const char * text = whisper_full_get_token_text(ctx, i, j);
+                    const float  p    = whisper_full_get_token_p   (ctx, i, j);
+
+                    const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
+
+                    printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
                }
+                printf("\n");
+            } else {
+                const char * text = whisper_full_get_segment_text(ctx, i);

-                const char * text = whisper_full_get_token_text(ctx, i, j);
-                const float  p    = whisper_full_get_token_p   (ctx, i, j);
-
-                const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
-
-                printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
+                printf("[%s --> %s]  %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
            }
-            printf("\n");
-        } else {
-            const char * text = whisper_full_get_segment_text(ctx, i);
-
-            printf("[%s --> %s]  %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
        }
    }
 }
@ -320,297 +302,41 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
    return true;
 }

-// word-level timestamps (experimental)
-// TODO: make ffmpeg output optional
-// TODO: extra pass to detect unused speech and assign to tokens
+// karaoke video generation
+// outputs a bash script that uses ffmpeg to generate a video with the subtitles
 // TODO: font parameter adjustments
-// TODO: move to whisper.h/whisper.cpp and add parameter to select max line-length of subtitles
-bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, const std::vector<float> & pcmf32) {
-    std::vector<float> pcm_avg(pcmf32.size(), 0);
-
-    // average the fabs of the signal
-    {
-        const int hw = 32;
-
-        for (int i = 0; i < pcmf32.size(); i++) {
-            float sum = 0;
-            for (int j = -hw; j <= hw; j++) {
-                if (i + j >= 0 && i + j < pcmf32.size()) {
-                    sum += fabs(pcmf32[i + j]);
-                }
-            }
-            pcm_avg[i] = sum/(2*hw + 1);
-        }
-    }
-
-    struct token_info {
-        int64_t t0 = -1;
-        int64_t t1 = -1;
-
-        int64_t tt0 = -1;
-        int64_t tt1 = -1;
-
-        whisper_token id;
-        whisper_token tid;
-
-        float p     = 0.0f;
-        float pt    = 0.0f;
-        float ptsum = 0.0f;
-
-        std::string text;
-        float vlen = 0.0f; // voice length of this token
-    };
-
-    int64_t t_beg  = 0;
-    int64_t t_last = 0;
-
-    whisper_token tid_last = 0;
-
+bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
    std::ofstream fout(fname);

    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

+    // TODO: become parameter
+    static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
+
    fout << "!/bin/bash" << "\n";
    fout << "\n";

-    fout << "ffmpeg -i " << fname_inp << " -f lavfi -i color=size=1200x120:duration=" << float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE << ":rate=25:color=black -vf \"";
-
-    bool is_first = true;
+    fout << "ffmpeg -i " << fname_inp << " -f lavfi -i color=size=1200x120:duration=" << t_sec << ":rate=25:color=black -vf \"";

    for (int i = 0; i < whisper_full_n_segments(ctx); i++) {
        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

-        const char *text = whisper_full_get_segment_text(ctx, i);
-
-        const int s0 = std::max(0,                   (int) (t0*WHISPER_SAMPLE_RATE/100));
-        const int s1 = std::min((int) pcmf32.size(), (int) (t1*WHISPER_SAMPLE_RATE/100));
-
        const int n = whisper_full_n_tokens(ctx, i);

-        std::vector<token_info> tokens(n);
-
-        if (n <= 1) {
-            continue;
-        }
-
+        std::vector<whisper_token_data> tokens(n);
        for (int j = 0; j < n; ++j) {
-            struct whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
-
-            if (j == 0) {
-                if (token.id == whisper_token_beg(ctx)) {
-                    tokens[j    ].t0 = t0;
-                    tokens[j    ].t1 = t0;
-                    tokens[j + 1].t0 = t0;
-
-                    t_beg  = t0;
-                    t_last = t0;
-                    tid_last = whisper_token_beg(ctx);
-                } else {
-                    tokens[j    ].t0 = t_last;
-                }
-            }
-
-            const int64_t tt = t_beg + 2*(token.tid - whisper_token_beg(ctx));
-
-            tokens[j].id    = token.id;
-            tokens[j].tid   = token.tid;
-            tokens[j].p     = token.p;
-            tokens[j].pt    = token.pt;
-            tokens[j].ptsum = token.ptsum;
-
-            tokens[j].text = whisper_token_to_str(ctx, token.id);
-            tokens[j].vlen = voice_length(tokens[j].text);
-
-            if (token.pt > params.word_thold && token.ptsum > 0.01 && token.tid > tid_last && tt <= t1) {
-                if (j > 0) {
-                    tokens[j - 1].t1 = tt;
-                }
-                tokens[j].t0 = tt;
-                tid_last = token.tid;
-            }
+            tokens[j] = whisper_full_get_token_data(ctx, i, j);
        }

-        tokens[n - 2].t1 = t1;
-        tokens[n - 1].t0 = t1;
-        tokens[n - 1].t1 = t1;
-
-        t_last = t1;
-
-        // find intervals of tokens with unknown timestamps
-        // fill the timestamps by proportionally splitting the interval based on the token voice lengths
-        {
-            int p0 = 0;
-            int p1 = 0;
-            while (true) {
-                while (p1 < n && tokens[p1].t1 < 0) {
-                    p1++;
-                }
-
-                if (p1 >= n) {
-                    p1--;
-                }
-
-                if (p1 > p0) {
-                    double psum = 0.0;
-                    for (int j = p0; j <= p1; j++) {
-                        psum += tokens[j].vlen;
-                    }
-
-                    //printf("analyzing %d - %d, psum = %f\n", p0, p1, psum);
-
-                    const double dt = tokens[p1].t1 - tokens[p0].t0;
-
-                    // split the time proportionally to the voice length
-                    for (int j = p0 + 1; j <= p1; j++) {
-                        const double ct = tokens[j - 1].t0 + dt*tokens[j - 1].vlen/psum;
-
-                        tokens[j - 1].t1 = ct;
-                        tokens[j    ].t0 = ct;
-                    }
-                }
-
-                p1++;
-                p0 = p1;
-                if (p1 >= n) {
-                    break;
-                }
-            }
-        }
-
-        // fix up (just in case)
-        for (int j = 0; j < n - 1; j++) {
-            if (tokens[j].t1 < 0) {
-                tokens[j + 1].t0 = tokens[j].t1;
-            }
-
-            if (j > 0) {
-                if (tokens[j - 1].t1 > tokens[j].t0) {
-                    tokens[j].t0 = tokens[j - 1].t1;
-                    tokens[j].t1 = std::max(tokens[j].t0, tokens[j].t1);
-                }
-            }
-
-            tokens[j].tt0 = tokens[j].t0;
-            tokens[j].tt1 = tokens[j].t1;
-        }
-
-        // VAD
-        // expand or contract tokens based on voice activity
-        {
-            const int hw = WHISPER_SAMPLE_RATE/8;
-
-            for (int j = 0; j < n; j++) {
-                if (tokens[j].id >= whisper_token_eot(ctx)) {
-                    continue;
-                }
-
-                const int64_t t0 = tokens[j].t0;
-                const int64_t t1 = tokens[j].t1;
-
-                int s0 = std::max(0,                        (int) (t0*WHISPER_SAMPLE_RATE/100));
-                int s1 = std::min((int) pcmf32.size() - 1,  (int) (t1*WHISPER_SAMPLE_RATE/100));
-
-                const int ss0 = std::max(0,                       (int) (t0*WHISPER_SAMPLE_RATE/100) - hw);
-                const int ss1 = std::min((int) pcmf32.size() - 1, (int) (t1*WHISPER_SAMPLE_RATE/100) + hw);
-
-                const int n = ss1 - ss0;
-
-                float sum = 0.0f;
-
-                for (int k = ss0; k < ss1; k++) {
-                    sum += pcm_avg[k];
-                }
-
-                const float thold = 0.5*sum/n;
-
-                {
-                    int k = s0;
-                    if (pcm_avg[k] > thold && j > 0) {
-                        while (k > 0 && pcm_avg[k] > thold) {
-                            k--;
-                        }
-                        tokens[j].t0 = (int64_t) (100*k/WHISPER_SAMPLE_RATE);
-                        if (tokens[j].t0 < tokens[j - 1].t1) {
-                            tokens[j].t0 = tokens[j - 1].t1;
-                        } else {
-                            s0 = k;
-                        }
-                    } else {
-                        while (pcm_avg[k] < thold && k < s1) {
-                            k++;
-                        }
-                        s0 = k;
-                        tokens[j].t0 = 100*k/WHISPER_SAMPLE_RATE;
-                    }
-                }
-
-                {
-                    int k = s1;
-                    if (pcm_avg[k] > thold) {
-                        while (k < (int) pcmf32.size() - 1 && pcm_avg[k] > thold) {
-                            k++;
-                        }
-                        tokens[j].t1 = 100*k/WHISPER_SAMPLE_RATE;
-                        if (j < n - 1 && tokens[j].t1 > tokens[j + 1].t0) {
-                            tokens[j].t1 = tokens[j + 1].t0;
-                        } else {
-                            s1 = k;
-                        }
-                    } else {
-                        while (pcm_avg[k] < thold && k > s0) {
-                            k--;
-                        }
-                        s1 = k;
-                        tokens[j].t1 = 100*k/WHISPER_SAMPLE_RATE;
-                    }
-                }
-            }
-        }
-
-        // fixed token expand (optional)
-        {
-            const int t_expand = 0;
-
-            for (int j = 0; j < n; j++) {
-                if (j > 0) {
-                    tokens[j].t0 = std::max(0, (int) (tokens[j].t0 - t_expand));
-                }
-                if (j < n - 1) {
-                    tokens[j].t1 = tokens[j].t1 + t_expand;
-                }
-            }
-        }
-
-        // debug info
-        // TODO: toggle via parameter
-        for (int j = 0; j < n; ++j) {
-            const auto & token = tokens[j];
-            const auto tt = token.pt > params.word_thold && token.ptsum > 0.01 ? whisper_token_to_str(ctx, token.tid) : "[?]";
-            printf("%s: %10s %6.3f %6.3f %6.3f %6.3f %5d %5d '%s'\n", __func__,
-                    tt, token.p, token.pt, token.ptsum, token.vlen, (int) token.t0, (int) token.t1, token.text.c_str());
-
-            if (tokens[j].id >= whisper_token_eot(ctx)) {
-                continue;
-            }
-
-            //printf("[%s --> %s] %s\n", to_timestamp(token.t0).c_str(), to_timestamp(token.t1).c_str(), whisper_token_to_str(ctx, token.id));
-
-            //fout << "# " << to_timestamp(token.t0) << " --> " << to_timestamp(token.t1) << " " << whisper_token_to_str(ctx, token.id) << "\n";
-        }
-
-        // TODO: become parameters
-        static const int line_wrap = 60;
-        static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
-
-        if (!is_first) {
+        if (i > 0) {
            fout << ",";
        }

        // background text
        fout << "drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='':enable='between(t," << t0/100.0 << "," << t0/100.0 << ")'";

-        is_first = false;
+        bool is_first = true;

        for (int j = 0; j < n; ++j) {
            const auto & token = tokens[j];
@ -654,17 +380,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
                    }

                    ncnt += txt.size();
-
-                    if (ncnt > line_wrap) {
-                        if (k < j) {
-                            txt_bg = "> ";
-                            txt_fg = "> ";
-                            txt_ul = "\\ \\ ";
-                            ncnt = 0;
-                        } else {
-                            break;
-                        }
-                    }
                }

                ::replace_all(txt_bg, "'", "’");
@ -673,8 +388,11 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
                ::replace_all(txt_fg, "\"", "\\\"");
            }

-            // background text
-            fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='" << txt_bg << "':enable='between(t," << token.tt0/100.0 << "," << token.tt1/100.0 << ")'";
+            if (is_first) {
+                // background text
+                fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='" << txt_bg << "':enable='between(t," << t0/100.0 << "," << t1/100.0 << ")'";
+                is_first = false;
+            }

            // foreground text
            fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2:text='" << txt_fg << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";
@ -815,6 +533,10 @@ int main(int argc, char ** argv) {
            wparams.n_max_text_ctx       = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
            wparams.offset_ms            = params.offset_t_ms;

+            wparams.token_timestamps     = params.output_wts || params.max_len > 0;
+            wparams.thold_pt             = params.word_thold;
+            wparams.max_len              = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
+
            // this callback is called on each new segment
            if (!wparams.print_realtime) {
                wparams.new_segment_callback           = whisper_print_segment_callback;
@ -852,7 +574,7 @@ int main(int argc, char ** argv) {
            // output to WTS file
            if (params.output_wts) {
                const auto fname_wts = fname_inp + ".wts";
-                output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, pcmf32);
+                output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
            }
        }
    }