whisper.cpp/examples/main/main.cpp

#include "whisper.h"

// third-party utilities
// use your favorite implementations
#define DR_WAV_IMPLEMENTATION
#include "dr_wav.h"

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <fstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>

// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
// Lowest is red, middle is yellow, highest is green.
// Each entry is a "\033[38;5;<N>m" escape sequence that is printed in front of a token;
// whisper_print_segment_callback() picks the entry from the token probability and
// prints "\033[0m" after the token text.
const std::vector<std::string> k_colors = {
    "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
    "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
};

// Formats a time expressed in units of 10 ms as "HH:MM:SS.mmm".
// When `comma` is true the millisecond separator is ',' instead of '.'.
//  500 -> 00:00:05.000
// 6000 -> 00:01:00.000
std::string to_timestamp(int64_t t, bool comma = false) {
    const int64_t ms_per_sec  = 1000;
    const int64_t ms_per_min  = 60 * ms_per_sec;
    const int64_t ms_per_hour = 60 * ms_per_min;

    const int64_t total_ms = t * 10;

    const int64_t hours   =  total_ms / ms_per_hour;
    const int64_t minutes = (total_ms % ms_per_hour) / ms_per_min;
    const int64_t seconds = (total_ms % ms_per_min)  / ms_per_sec;
    const int64_t millis  =  total_ms % ms_per_sec;

    const char separator = comma ? ',' : '.';

    char text[32];
    snprintf(text, sizeof(text), "%02d:%02d:%02d%c%03d", (int) hours, (int) minutes, (int) seconds, separator, (int) millis);

    return std::string(text);
}

// helper function to replace substrings
// Replaces every occurrence of `search` in `s` with `replace`, in place.
// The scan resumes right after each inserted replacement, so text introduced by
// `replace` is never matched again (e.g. replacing "a" with "aa" terminates).
// An empty `search` is a no-op: it would otherwise match at every position and the
// loop would never end.
void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }

    for (size_t pos = s.find(search); pos != std::string::npos; pos = s.find(search, pos + replace.length())) {
        s.replace(pos, search.length(), replace);
    }
}

// command-line parameters
// Filled in by whisper_params_parse(); the initializers below are the defaults
// reported by whisper_print_usage().
struct whisper_params {
    int32_t seed         = -1; // RNG seed, not used currently
    // std::thread::hardware_concurrency() may return 0 when the value cannot be determined,
    // so make sure the default is never less than 1 thread
    int32_t n_threads    = std::max(1, std::min(4, (int32_t) std::thread::hardware_concurrency()));
    int32_t n_processors = 1;  // number of processors to use during computation
    int32_t offset_t_ms  = 0;  // time offset in milliseconds
    int32_t offset_n     = 0;  // segment index offset (added to the SRT cue numbers)
    int32_t duration_ms  = 0;  // duration of audio to process in milliseconds
    int32_t max_context  = -1; // maximum number of text context tokens to store, negative keeps the library default
    int32_t max_len      = 0;  // maximum segment length in characters

    float word_thold = 0.01f;  // word timestamp probability threshold

    bool verbose              = false;
    bool translate            = false; // translate from source language to english
    bool output_txt           = false; // write <input>.txt
    bool output_vtt           = false; // write <input>.vtt
    bool output_srt           = false; // write <input>.srt
    bool output_wts           = false; // write <input>.wts (karaoke video script)
    bool print_special_tokens = false;
    bool print_colors         = false;
    bool no_timestamps        = false;

    std::string language  = "en";
    std::string model     = "models/ggml-base.en.bin";

    std::vector<std::string> fname_inp = {}; // input WAV files, in command-line order
};

// prints the command-line help to stderr
// declared here because whisper_params_parse() calls it before its definition
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);

// Parses the command line into `params`.
//
// Arguments that do not start with '-' are collected as input file names.
// "-h" / "--help" prints the usage text and exits the process with status 0.
//
// Returns true when all arguments were consumed. Returns false - after printing an
// error message and the usage text to stderr - when:
//   - an argument is unknown
//   - an option that takes a value is the last argument
//   - a numeric value cannot be converted by std::stoi / std::stof
//   - the language is rejected by whisper_lang_id()
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        const std::string arg = argv[i];

        if (arg[0] != '-') {
            params.fname_inp.push_back(arg);
            continue;
        }

        // fetches the value that has to follow the current option and advances `i`
        // argv[argc] is a null pointer, so it must never reach std::stoi / std::string
        const auto next_value = [&]() -> std::string {
            if (i + 1 >= argc) {
                throw std::invalid_argument("missing value");
            }
            return argv[++i];
        };

        try {
            if (arg == "-s" || arg == "--seed") {
                params.seed = std::stoi(next_value());
            } else if (arg == "-t" || arg == "--threads") {
                params.n_threads = std::stoi(next_value());
            } else if (arg == "-p" || arg == "--processors") {
                params.n_processors = std::stoi(next_value());
            } else if (arg == "-ot" || arg == "--offset-t") {
                params.offset_t_ms = std::stoi(next_value());
            } else if (arg == "-on" || arg == "--offset-n") {
                params.offset_n = std::stoi(next_value());
            } else if (arg == "-d" || arg == "--duration") {
                params.duration_ms = std::stoi(next_value());
            } else if (arg == "-mc" || arg == "--max-context") {
                params.max_context = std::stoi(next_value());
            } else if (arg == "-ml" || arg == "--max-len") {
                params.max_len = std::stoi(next_value());
            } else if (arg == "-wt" || arg == "--word-thold") {
                params.word_thold = std::stof(next_value());
            } else if (arg == "-v" || arg == "--verbose") {
                params.verbose = true;
            } else if (arg == "--translate") {
                params.translate = true;
            } else if (arg == "-l" || arg == "--language") {
                params.language = next_value();
                if (whisper_lang_id(params.language.c_str()) == -1) {
                    fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
                    whisper_print_usage(argc, argv, params);
                    return false;
                }
            } else if (arg == "-otxt" || arg == "--output-txt") {
                params.output_txt = true;
            } else if (arg == "-ovtt" || arg == "--output-vtt") {
                params.output_vtt = true;
            } else if (arg == "-osrt" || arg == "--output-srt") {
                params.output_srt = true;
            } else if (arg == "-owts" || arg == "--output-words") {
                params.output_wts = true;
            } else if (arg == "-ps" || arg == "--print_special") {
                params.print_special_tokens = true;
            } else if (arg == "-pc" || arg == "--print_colors") {
                params.print_colors = true;
            } else if (arg == "-nt" || arg == "--no_timestamps") {
                params.no_timestamps = true;
            } else if (arg == "-m" || arg == "--model") {
                params.model = next_value();
            } else if (arg == "-f" || arg == "--file") {
                params.fname_inp.push_back(next_value());
            } else if (arg == "-h" || arg == "--help") {
                // asking for help is not an error
                whisper_print_usage(argc, argv, params);
                exit(0);
            } else {
                fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
                whisper_print_usage(argc, argv, params);
                return false;
            }
        } catch (const std::exception &) {
            // thrown by next_value() (no value left) or by std::stoi / std::stof
            // (value is not a number or does not fit the target type)
            fprintf(stderr, "error: missing or invalid value for argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
            return false;
        }
    }

    return true;
}

// Writes the command-line help to stderr.
// argv[0] is shown as the program name and the values currently stored in `params`
// are shown as the defaults.
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
    (void) argc; // not needed to print the help

    const char * prog = argv[0];

    // header
    fprintf(stderr,
            "\n"
            "usage: %s [options] file0.wav file1.wav ...\n"
            "\n"
            "options:\n"
            "  -h,       --help           show this help message and exit\n"
            "  -s SEED,  --seed SEED      RNG seed (default: -1)\n",
            prog);

    // options with a numeric value
    fprintf(stderr,
            "  -t N,     --threads N      number of threads to use during computation (default: %d)\n"
            "  -p N,     --processors N   number of processors to use during computation (default: %d)\n"
            "  -ot N,    --offset-t N     time offset in milliseconds (default: %d)\n"
            "  -on N,    --offset-n N     segment index offset (default: %d)\n"
            "  -d  N,    --duration N     duration of audio to process in milliseconds (default: %d)\n"
            "  -mc N,    --max-context N  maximum number of text context tokens to store (default: max)\n"
            "  -ml N,    --max-len N      maximum segment length in characters (default: %d)\n"
            "  -wt N,    --word-thold N   word timestamp probability threshold (default: %f)\n",
            params.n_threads,
            params.n_processors,
            params.offset_t_ms,
            params.offset_n,
            params.duration_ms,
            params.max_len,
            params.word_thold);

    // switches
    fputs(
            "  -v,       --verbose        verbose output\n"
            "            --translate      translate from source language to english\n"
            "  -otxt,    --output-txt     output result in a text file\n"
            "  -ovtt,    --output-vtt     output result in a vtt file\n"
            "  -osrt,    --output-srt     output result in a srt file\n"
            "  -owts,    --output-words   output script for generating karaoke video\n"
            "  -ps,      --print_special  print special tokens\n"
            "  -pc,      --print_colors   print colors\n"
            "  -nt,      --no_timestamps  do not print timestamps\n",
            stderr);

    // options with a string value
    fprintf(stderr,
            "  -l LANG,  --language LANG  spoken language (default: %s)\n"
            "  -m FNAME, --model FNAME    model path (default: %s)\n"
            "  -f FNAME, --file FNAME     input WAV file path\n"
            "\n",
            params.language.c_str(),
            params.model.c_str());
}

void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
    const whisper_params & params = *(whisper_params *) user_data;

    const int n_segments = whisper_full_n_segments(ctx);

    // print the last n_new segments
    const int s0 = n_segments - n_new;
    if (s0 == 0) {
        printf("\n");
    }

    for (int i = s0; i < n_segments; i++) {
        if (params.no_timestamps) {
            if (params.print_colors) {
                for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
                    if (params.print_special_tokens == false) {
                        const whisper_token id = whisper_full_get_token_id(ctx, i, j);
                        if (id >= whisper_token_eot(ctx)) {
                            continue;
                        }
                    }

                    const char * text = whisper_full_get_token_text(ctx, i, j);
                    const float  p    = whisper_full_get_token_p   (ctx, i, j);

                    const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));

                    printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
                }
            } else {
                const char * text = whisper_full_get_segment_text(ctx, i);
                printf("%s", text);
            }
            fflush(stdout);
        } else {
            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

            if (params.print_colors) {
                printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
                for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
                    if (params.print_special_tokens == false) {
                        const whisper_token id = whisper_full_get_token_id(ctx, i, j);
                        if (id >= whisper_token_eot(ctx)) {
                            continue;
                        }
                    }

                    const char * text = whisper_full_get_token_text(ctx, i, j);
                    const float  p    = whisper_full_get_token_p   (ctx, i, j);

                    const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));

                    printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
                }
                printf("\n");
            } else {
                const char * text = whisper_full_get_segment_text(ctx, i);

                printf("[%s --> %s]  %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
            }
        }
    }
}

// Writes the text of every segment of `ctx` to the file `fname`, back to back with
// nothing added between segments.
// Returns false (after logging to stderr) when the file cannot be opened, true otherwise.
bool output_txt(struct whisper_context * ctx, const char * fname) {
    std::ofstream out(fname);
    if (out.is_open() == false) {
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
        return false;
    }

    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

    const int total = whisper_full_n_segments(ctx);
    int seg = 0;
    while (seg < total) {
        out << whisper_full_get_segment_text(ctx, seg);
        ++seg;
    }

    return true;
}

// Writes all segments of `ctx` to the file `fname` in WebVTT format: the "WEBVTT" header,
// then for each segment the time range (with '.' as millisecond separator), the text
// and an empty line.
// Returns false (after logging to stderr) when the file cannot be opened, true otherwise.
bool output_vtt(struct whisper_context * ctx, const char * fname) {
    std::ofstream fout(fname);
    if (!fout.is_open()) {
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
        // any non-zero integer converts to true in a bool function, so failure must be spelled `false`
        return false;
    }

    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

    fout << "WEBVTT\n\n";

    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);
        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

        fout << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
        fout << text << "\n\n";
    }

    return true;
}

// Writes all segments of `ctx` to the file `fname` in SRT format. Each cue consists of
// its number (segment index + 1 + params.offset_n), the time range with ',' as the
// millisecond separator, the segment text and an empty line.
// Returns false (after logging to stderr) when the file cannot be opened, true otherwise.
bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
    std::ofstream out(fname);
    if (out.is_open() == false) {
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
        return false;
    }

    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

    const int total = whisper_full_n_segments(ctx);
    for (int seg = 0; seg < total; ++seg) {
        const char * body = whisper_full_get_segment_text(ctx, seg);

        const std::string from = to_timestamp(whisper_full_get_segment_t0(ctx, seg), true);
        const std::string to   = to_timestamp(whisper_full_get_segment_t1(ctx, seg), true);

        const int cue = seg + 1 + params.offset_n;

        out << cue << "\n"
            << from << " --> " << to << "\n"
            << body << "\n\n";
    }

    return true;
}

// karaoke video generation
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
//
// The script is written to `fname`. It runs ffmpeg on `fname_inp` with one chain of
// drawtext filters: per segment the full text in gray, and per token the highlighted
// token plus an underline, each enabled between the token's t0 and t1.
// `t_sec` is used as the duration of the generated video background.
// Tokens with id >= whisper_token_eot() are left out of the text.
//
// Returns false (after logging to stderr) when `fname` cannot be opened, true otherwise.
//
// NOTE: `fname_inp` is written into the script unquoted - a path that contains spaces or
//       shell metacharacters will not survive in the generated command line
// TODO: font parameter adjustments
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
    std::ofstream fout(fname);
    if (!fout.is_open()) {
        // same handling as output_txt / output_vtt / output_srt
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
        return false;
    }

    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

    // TODO: become parameter
    static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";

    fout << "#!/bin/bash" << "\n";
    fout << "\n";

    fout << "ffmpeg -i " << fname_inp << " -f lavfi -i color=size=1200x120:duration=" << t_sec << ":rate=25:color=black -vf \"";

    for (int i = 0; i < whisper_full_n_segments(ctx); i++) {
        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

        const int n = whisper_full_n_tokens(ctx, i);

        std::vector<whisper_token_data> tokens(n);
        for (int j = 0; j < n; ++j) {
            tokens[j] = whisper_full_get_token_data(ctx, i, j);
        }

        // the drawtext filters of all segments form a single comma-separated chain
        if (i > 0) {
            fout << ",";
        }

        // background text
        fout << "drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='':enable='between(t," << t0/100.0 << "," << t0/100.0 << ")'";

        bool is_first = true;

        for (int j = 0; j < n; ++j) {
            const auto & token = tokens[j];

            if (token.id >= whisper_token_eot(ctx)) {
                continue;
            }

            std::string txt_bg;
            std::string txt_fg; // highlight token
            std::string txt_ul; // underline

            txt_bg = "> ";
            txt_fg = "> ";
            txt_ul = "\\ \\ ";

            {
                for (int k = 0; k < n; ++k) {
                    const auto & token2 = tokens[k];

                    if (token2.id >= whisper_token_eot(ctx)) {
                        continue;
                    }

                    const std::string txt = whisper_token_to_str(ctx, token2.id);

                    txt_bg += txt;

                    if (k == j) {
                        // the current token: show its text and underline every character
                        txt_fg += txt;
                        txt_ul.append(txt.size(), '_');
                        txt_fg += "|";
                    } else {
                        // any other token: pad with escaped spaces so that the current token
                        // lines up with the background text
                        for (int l = 0; l < (int) txt.size(); ++l) {
                            txt_fg += "\\ ";
                            txt_ul += "\\ ";
                        }
                    }
                }

                // quotes would terminate the text='...' value or the surrounding "..." of -vf
                ::replace_all(txt_bg, "'", "’");
                ::replace_all(txt_bg, "\"", "\\\"");
                ::replace_all(txt_fg, "'", "’");
                ::replace_all(txt_fg, "\"", "\\\"");
            }

            if (is_first) {
                // background text
                fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='" << txt_bg << "':enable='between(t," << t0/100.0 << "," << t1/100.0 << ")'";
                is_first = false;
            }

            // foreground text
            fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2:text='" << txt_fg << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";

            // underline
            fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2+16:text='" << txt_ul << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";
        }
    }

    fout << "\" -c:v libx264 -pix_fmt yuv420p -y " << fname_inp << ".mp4" << "\n";

    fout << "\n\n";
    fout << "echo \"Your video has been saved to " << fname_inp << ".mp4\"" << "\n";
    fout << "\n";
    fout << "echo \"  ffplay " << fname_inp << ".mp4\"\n";
    fout << "\n";

    fout.close();

    fprintf(stderr, "%s: run 'source %s' to generate karaoke video\n", __func__, fname);

    return true;
}

// Entry point: loads the model, then transcribes every input WAV file in command-line
// order and writes the requested output files next to each input.
//
// Exit codes:
//   0 - all files were processed
//   1 - whisper_params_parse() reported a failure
//   2 - no input files were given
//   3 - the whisper context could not be initialized
//   4 - a WAV file could not be opened
//   5 - a WAV file is neither mono nor stereo
//   6 - a WAV file is not sampled at WHISPER_SAMPLE_RATE
//   7 - a WAV file is not 16-bit
//   8 - whisper_full_parallel() failed
int main(int argc, char ** argv) {
    whisper_params params;

    if (whisper_params_parse(argc, argv, params) == false) {
        return 1;
    }

    if (params.seed < 0) {
        params.seed = time(NULL);
    }

    if (params.fname_inp.empty()) {
        fprintf(stderr, "error: no input files specified\n");
        whisper_print_usage(argc, argv, params);
        return 2;
    }

    // whisper init

    struct whisper_context * ctx = whisper_init(params.model.c_str());

    if (ctx == nullptr) {
        fprintf(stderr, "error: failed to initialize whisper context\n");
        return 3;
    }

    // from here on every early return has to release ctx

    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
        // params.fname_inp is not modified inside the loop, so a reference is sufficient
        const auto & fname_inp = params.fname_inp[f];

        // WAV input
        std::vector<float> pcmf32;
        {
            drwav wav;
            if (!drwav_init_file(&wav, fname_inp.c_str(), NULL)) {
                fprintf(stderr, "%s: failed to open WAV file '%s' - check your input\n", argv[0], fname_inp.c_str());
                whisper_print_usage(argc, argv, {});
                whisper_free(ctx);
                return 4;
            }

            // the file is open now - close it on the format errors below as well

            if (wav.channels != 1 && wav.channels != 2) {
                fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
                drwav_uninit(&wav);
                whisper_free(ctx);
                return 5;
            }

            if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
                fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], fname_inp.c_str());
                drwav_uninit(&wav);
                whisper_free(ctx);
                return 6;
            }

            if (wav.bitsPerSample != 16) {
                fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
                drwav_uninit(&wav);
                whisper_free(ctx);
                return 7;
            }

            // copy what is needed after drwav_uninit() instead of reading the released handle
            // the frame count is kept in size_t - an int would overflow for very long recordings
            const size_t n          = wav.totalPCMFrameCount;
            const size_t n_channels = wav.channels;

            std::vector<int16_t> pcm16;
            pcm16.resize(n*n_channels);
            drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
            drwav_uninit(&wav);

            // convert to mono, float
            pcmf32.resize(n);
            if (n_channels == 1) {
                for (size_t i = 0; i < n; i++) {
                    pcmf32[i] = float(pcm16[i])/32768.0f;
                }
            } else {
                // average of the two channels: (l + r)/2 / 32768
                for (size_t i = 0; i < n; i++) {
                    pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
                }
            }
        }

        // print system information
        {
            fprintf(stderr, "\n");
            fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
                    params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
        }

        // print some info about the processing
        {
            fprintf(stderr, "\n");
            if (!whisper_is_multilingual(ctx)) {
                if (params.language != "en" || params.translate) {
                    params.language = "en";
                    params.translate = false;
                    fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
                }
            }
            fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d ...\n",
                    __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
                    params.n_threads, params.n_processors,
                    params.language.c_str(),
                    params.translate ? "translate" : "transcribe",
                    params.no_timestamps ? 0 : 1);

            fprintf(stderr, "\n");
        }


        // run the inference
        {
            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

            wparams.print_realtime       = false;
            wparams.print_progress       = false;
            wparams.print_timestamps     = !params.no_timestamps;
            wparams.print_special_tokens = params.print_special_tokens;
            wparams.translate            = params.translate;
            wparams.language             = params.language.c_str();
            wparams.n_threads            = params.n_threads;
            // a negative max_context keeps the default of the library
            wparams.n_max_text_ctx       = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
            wparams.offset_ms            = params.offset_t_ms;
            wparams.duration_ms          = params.duration_ms;

            // token timestamps are enabled for the karaoke script and for length-limited segments
            wparams.token_timestamps     = params.output_wts || params.max_len > 0;
            wparams.thold_pt             = params.word_thold;
            // the karaoke script uses a segment length of 60 unless -ml was given
            wparams.max_len              = params.output_wts && params.max_len == 0 ? 60 : params.max_len;

            // this callback is called on each new segment
            if (!wparams.print_realtime) {
                wparams.new_segment_callback           = whisper_print_segment_callback;
                wparams.new_segment_callback_user_data = &params;
            }

            if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
                fprintf(stderr, "%s: failed to process audio\n", argv[0]);
                whisper_free(ctx);
                return 8;
            }
        }

        // output stuff
        {
            printf("\n");

            // output to text file
            if (params.output_txt) {
                const auto fname_txt = fname_inp + ".txt";
                output_txt(ctx, fname_txt.c_str());
            }

            // output to VTT file
            if (params.output_vtt) {
                const auto fname_vtt = fname_inp + ".vtt";
                output_vtt(ctx, fname_vtt.c_str());
            }

            // output to SRT file
            if (params.output_srt) {
                const auto fname_srt = fname_inp + ".srt";
                output_srt(ctx, fname_srt.c_str(), params);
            }

            // output to WTS file
            if (params.output_wts) {
                const auto fname_wts = fname_inp + ".wts";
                output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
            }
        }
    }

    whisper_print_timings(ctx);
    whisper_free(ctx);

    return 0;
}
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								#include "whisper.h"
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								// third-party utilities
 								// use your favorite implementations
 								#define DR_WAV_IMPLEMENTATION
 								#include "dr_wav.h"
-												Fix bug in FFT

The FFT routine does not work for odd N
Solution is to add DFT and use it when N is odd

											
										
										
											2022-10-02 14:46:21 +00:00
-												wip : experimental color coding of tokens based on probabilities

											
										
										
											2022-10-21 14:33:59 +00:00
+								#include <cmath>
-												ref #17 : add options to output result to file

Support for:

- plain text
- VTT
- SRT

											
										
										
											2022-10-08 14:22:22 +00:00
+								#include <fstream>
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								#include <cstdio>
 								#include <string>
 								#include <thread>
 								#include <vector>
-												Fix bug in FFT

The FFT routine does not work for odd N
Solution is to add DFT and use it when N is odd

											
										
										
											2022-10-02 14:46:21 +00:00
-												wip : experimental color coding of tokens based on probabilities

											
										
										
											2022-10-21 14:33:59 +00:00
+								// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
 								// Lowest is red, middle is yellow, highest is green.
 								const std::vector<std::string> k_colors = {
 								    "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
 								    "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
 								};
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								//  500 -> 00:05.000
 								// 6000 -> 01:00.000
-												main : fix SRT timestamp to use comma "," instead of dot "."

											
										
										
											2022-10-24 15:28:23 +00:00
+								std::string to_timestamp(int64_t t, bool comma = false) {
-												Update main.cpp
											
										
										
											2022-10-10 04:35:10 +00:00
+								    int64_t msec = t * 10;
 								    int64_t hr = msec / (1000 * 60 * 60);
 								    msec = msec - hr * (1000 * 60 * 60);
 								    int64_t min = msec / (1000 * 60);
 								    msec = msec - min * (1000 * 60);
 								    int64_t sec = msec / 1000;
 								    msec = msec - sec * 1000;
-												Use Accelerate framework on Apple silicon

Huge performance improvement in the Encode (almost x2 on MacBook M1 Pro)

Also various extra optimizations:

- Multi-threaded NORM operator
- Faster GELU via F16 cast

											
										
										
											2022-10-17 18:44:16 +00:00
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								    char buf[32];
-												main : fix SRT timestamp to use comma "," instead of dot "."

											
										
										
											2022-10-24 15:28:23 +00:00
+								    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								    return std::string(buf);
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
+								}
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								// helper function to replace substrings
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
+								void replace_all(std::string & s, const std::string & search, const std::string & replace) {
 								    for (size_t pos = 0; ; pos += replace.length()) {
 								        pos = s.find(search, pos);
 								        if (pos == std::string::npos) break;
 								        s.erase(pos, search.length());
 								        s.insert(pos, replace);
 								    }
 								}
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								// command-line parameters
 								struct whisper_params {
-												main : merge parallel example in main

											
										
										
											2022-10-29 09:26:03 +00:00
+								    int32_t seed         = -1; // RNG seed, not used currently
 								    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
 								    int32_t n_processors = 1;
 								    int32_t offset_t_ms  = 0;
 								    int32_t offset_n     = 0;
-												ref #22 : add "duration" option

Can be used to partially process a recording

											
										
										
											2022-11-07 18:14:52 +00:00
+								    int32_t duration_ms  = 0;
-												main : merge parallel example in main

											
										
										
											2022-10-29 09:26:03 +00:00
+								    int32_t max_context  = -1;
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								    int32_t max_len      = 0;
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
+								    float word_thold = 0.01f;
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								    bool verbose              = false;
 								    bool translate            = false;
-												ref #17 : add options to output result to file

Support for:

- plain text
- VTT
- SRT

											
										
										
											2022-10-08 14:22:22 +00:00
+								    bool output_txt           = false;
 								    bool output_vtt           = false;
 								    bool output_srt           = false;
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
+								    bool output_wts           = false;
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								    bool print_special_tokens = false;
-												wip : experimental color coding of tokens based on probabilities

											
										
										
											2022-10-21 14:33:59 +00:00
+								    bool print_colors         = false;
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								    bool no_timestamps        = false;
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								    std::string language  = "en";
 								    std::string model     = "models/ggml-base.en.bin";
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
 								    std::vector<std::string> fname_inp = {};
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								};
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
 								    for (int i = 1; i < argc; i++) {
 								        std::string arg = argv[i];
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								        if (arg[0] != '-') {
 								            params.fname_inp.push_back(arg);
 								            continue;
 								        }
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								        if (arg == "-s" || arg == "--seed") {
 								            params.seed = std::stoi(argv[++i]);
 								        } else if (arg == "-t" || arg == "--threads") {
 								            params.n_threads = std::stoi(argv[++i]);
-												main : merge parallel example in main

											
										
										
											2022-10-29 09:26:03 +00:00
+								        } else if (arg == "-p" || arg == "--processors") {
 								            params.n_processors = std::stoi(argv[++i]);
-												ref #68 : add option "-on" to specify segment index offset for SRT

Also, change option "-o" to "-ot"

											
										
										
											2022-10-21 15:14:53 +00:00
+								        } else if (arg == "-ot" || arg == "--offset-t") {
 								            params.offset_t_ms = std::stoi(argv[++i]);
 								        } else if (arg == "-on" || arg == "--offset-n") {
 								            params.offset_n = std::stoi(argv[++i]);
-												ref #22 : add "duration" option

Can be used to partially process a recording

											
										
										
											2022-11-07 18:14:52 +00:00
+								        } else if (arg == "-d" || arg == "--duration") {
 								            params.duration_ms = std::stoi(argv[++i]);
-												main : fix sampling time + add max_context parameter

											
										
										
											2022-10-29 06:42:14 +00:00
+								        } else if (arg == "-mc" || arg == "--max-context") {
 								            params.max_context = std::stoi(argv[++i]);
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								        } else if (arg == "-ml" || arg == "--max-len") {
 								            params.max_len = std::stoi(argv[++i]);
-												main : add option for word-leve timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
+								        } else if (arg == "-wt" || arg == "--word-thold") {
 								            params.word_thold = std::stof(argv[++i]);
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								        } else if (arg == "-v" || arg == "--verbose") {
 								            params.verbose = true;
 								        } else if (arg == "--translate") {
 								            params.translate = true;
 								        } else if (arg == "-l" || arg == "--language") {
 								            params.language = argv[++i];
 								            if (whisper_lang_id(params.language.c_str()) == -1) {
 								                fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
 								                whisper_print_usage(argc, argv, params);
 								                exit(0);
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
+								            }
-												ref #17 : add options to output result to file

Support for:

- plain text
- VTT
- SRT

											
										
										
											2022-10-08 14:22:22 +00:00
+								        } else if (arg == "-otxt" || arg == "--output-txt") {
 								            params.output_txt = true;
 								        } else if (arg == "-ovtt" || arg == "--output-vtt") {
 								            params.output_vtt = true;
 								        } else if (arg == "-osrt" || arg == "--output-srt") {
 								            params.output_srt = true;
-												main : add option for word-leve timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
+								        } else if (arg == "-owts" || arg == "--output-words") {
 								            params.output_wts = true;
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								        } else if (arg == "-ps" || arg == "--print_special") {
 								            params.print_special_tokens = true;
-												wip : experimental color coding of tokens based on probabilities

											
										
										
											2022-10-21 14:33:59 +00:00
+								        } else if (arg == "-pc" || arg == "--print_colors") {
 								            params.print_colors = true;
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								        } else if (arg == "-nt" || arg == "--no_timestamps") {
 								            params.no_timestamps = true;
 								        } else if (arg == "-m" || arg == "--model") {
 								            params.model = argv[++i];
 								        } else if (arg == "-f" || arg == "--file") {
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								            params.fname_inp.push_back(argv[++i]);
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								        } else if (arg == "-h" || arg == "--help") {
 								            whisper_print_usage(argc, argv, params);
 								            exit(0);
 								        } else {
 								            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
 								            whisper_print_usage(argc, argv, params);
 								            exit(0);
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
+								        }
 								    }
 								    return true;
 								}
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
 								    fprintf(stderr, "\n");
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								    fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								    fprintf(stderr, "\n");
 								    fprintf(stderr, "options:\n");
 								    fprintf(stderr, "  -h,       --help           show this help message and exit\n");
 								    fprintf(stderr, "  -s SEED,  --seed SEED      RNG seed (default: -1)\n");
 								    fprintf(stderr, "  -t N,     --threads N      number of threads to use during computation (default: %d)\n", params.n_threads);
-												main : merge parallel example in main

											
										
										
											2022-10-29 09:26:03 +00:00
+								    fprintf(stderr, "  -p N,     --processors N   number of processors to use during computation (default: %d)\n", params.n_processors);
-												ref #68 : add option "-on" to specify segment index offset for SRT

Also, change option "-o" to "-ot"

											
										
										
											2022-10-21 15:14:53 +00:00
+								    fprintf(stderr, "  -ot N,    --offset-t N     time offset in milliseconds (default: %d)\n", params.offset_t_ms);
 								    fprintf(stderr, "  -on N,    --offset-n N     segment index offset (default: %d)\n", params.offset_n);
-												ref #22 : add "duration" option

Can be used to partially process a recording

											
										
										
											2022-11-07 18:14:52 +00:00
+								    fprintf(stderr, "  -d  N,    --duration N     duration of audio to process in milliseconds (default: %d)\n", params.duration_ms);
-												main : fix sampling time + add max_context parameter

											
										
										
											2022-10-29 06:42:14 +00:00
+								    fprintf(stderr, "  -mc N,    --max-context N  maximum number of text context tokens to store (default: max)\n");
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								    fprintf(stderr, "  -ml N,    --max-len N      maximum segment length in characters (default: %d)\n", params.max_len);
-												main : add option for word-leve timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
+								    fprintf(stderr, "  -wt N,    --word-thold N   word timestamp probability threshold (default: %f)\n", params.word_thold);
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								    fprintf(stderr, "  -v,       --verbose        verbose output\n");
 								    fprintf(stderr, "            --translate      translate from source language to english\n");
-												ref #17 : add options to output result to file

Support for:

- plain text
- VTT
- SRT

											
										
										
											2022-10-08 14:22:22 +00:00
+								    fprintf(stderr, "  -otxt,    --output-txt     output result in a text file\n");
 								    fprintf(stderr, "  -ovtt,    --output-vtt     output result in a vtt file\n");
-												Minor

											
										
										
											2022-10-08 15:13:26 +00:00
+								    fprintf(stderr, "  -osrt,    --output-srt     output result in a srt file\n");
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								    fprintf(stderr, "  -owts,    --output-words   output script for generating karaoke video\n");
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								    fprintf(stderr, "  -ps,      --print_special  print special tokens\n");
-												wip : experimental color coding of tokens based on probabilities

											
										
										
											2022-10-21 14:33:59 +00:00
+								    fprintf(stderr, "  -pc,      --print_colors   print colors\n");
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								    fprintf(stderr, "  -nt,      --no_timestamps  do not print timestamps\n");
 								    fprintf(stderr, "  -l LANG,  --language LANG  spoken language (default: %s)\n", params.language.c_str());
 								    fprintf(stderr, "  -m FNAME, --model FNAME    model path (default: %s)\n", params.model.c_str());
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								    fprintf(stderr, "  -f FNAME, --file FNAME     input WAV file path\n");
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								    fprintf(stderr, "\n");
-												ref #4 : added transcription timestamps

Can be turned off with "-nt" argument.
Performance has also improved.

											
										
										
											2022-09-29 20:09:04 +00:00
+								}
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
-												whisper : add new-segment callback

Can be used to process new segments as they are being generated.
Sample usage in main, for printing the resulting segments during the
inference.

											
										
										
											2022-10-22 18:06:50 +00:00
+								    const whisper_params & params = *(whisper_params *) user_data;
 								    const int n_segments = whisper_full_n_segments(ctx);
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								    // print the last n_new segments
 								    const int s0 = n_segments - n_new;
 								    if (s0 == 0) {
-												whisper : add new-segment callback

Can be used to process new segments as they are being generated.
Sample usage in main, for printing the resulting segments during the
inference.

											
										
										
											2022-10-22 18:06:50 +00:00
+								        printf("\n");
 								    }
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								    for (int i = s0; i < n_segments; i++) {
 								        if (params.no_timestamps) {
 								            if (params.print_colors) {
 								                for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
 								                    if (params.print_special_tokens == false) {
 								                        const whisper_token id = whisper_full_get_token_id(ctx, i, j);
 								                        if (id >= whisper_token_eot(ctx)) {
 								                            continue;
 								                        }
-												main : print colors + no timestamps

											
										
										
											2022-10-22 18:09:30 +00:00
+								                    }
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								                    const char * text = whisper_full_get_token_text(ctx, i, j);
 								                    const float  p    = whisper_full_get_token_p   (ctx, i, j);
-												main : print colors + no timestamps

											
										
										
											2022-10-22 18:09:30 +00:00
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								                    const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
-												main : print colors + no timestamps

											
										
										
											2022-10-22 18:09:30 +00:00
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								                    printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
 								                }
 								            } else {
 								                const char * text = whisper_full_get_segment_text(ctx, i);
 								                printf("%s", text);
-												main : print colors + no timestamps

											
										
										
											2022-10-22 18:09:30 +00:00
+								            }
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								            fflush(stdout);
-												whisper : add new-segment callback

Can be used to process new segments as they are being generated.
Sample usage in main, for printing the resulting segments during the
inference.

											
										
										
											2022-10-22 18:06:50 +00:00
+								        } else {
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
 								            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
 								            if (params.print_colors) {
 								                printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
 								                for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
 								                    if (params.print_special_tokens == false) {
 								                        const whisper_token id = whisper_full_get_token_id(ctx, i, j);
 								                        if (id >= whisper_token_eot(ctx)) {
 								                            continue;
 								                        }
-												whisper : add new-segment callback

Can be used to process new segments as they are being generated.
Sample usage in main, for printing the resulting segments during the
inference.

											
										
										
											2022-10-22 18:06:50 +00:00
+								                    }
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								                    const char * text = whisper_full_get_token_text(ctx, i, j);
 								                    const float  p    = whisper_full_get_token_p   (ctx, i, j);
-												whisper : add new-segment callback

Can be used to process new segments as they are being generated.
Sample usage in main, for printing the resulting segments during the
inference.

											
										
										
											2022-10-22 18:06:50 +00:00
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								                    const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
-												whisper : add new-segment callback

Can be used to process new segments as they are being generated.
Sample usage in main, for printing the resulting segments during the
inference.

											
										
										
											2022-10-22 18:06:50 +00:00
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								                    printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
 								                }
 								                printf("\n");
 								            } else {
 								                const char * text = whisper_full_get_segment_text(ctx, i);
-												whisper : add new-segment callback

Can be used to process new segments as they are being generated.
Sample usage in main, for printing the resulting segments during the
inference.

											
										
										
											2022-10-22 18:06:50 +00:00
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								                printf("[%s --> %s]  %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
 								            }
-												whisper : add new-segment callback

Can be used to process new segments as they are being generated.
Sample usage in main, for printing the resulting segments during the
inference.

											
										
										
											2022-10-22 18:06:50 +00:00
+								        }
 								    }
 								}
-												main : refactor subtitle output

											
										
										
											2022-10-22 17:42:11 +00:00
+								bool output_txt(struct whisper_context * ctx, const char * fname) {
 								    std::ofstream fout(fname);
 								    if (!fout.is_open()) {
 								        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
 								        return false;
 								    }
 								    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
 								    const int n_segments = whisper_full_n_segments(ctx);
 								    for (int i = 0; i < n_segments; ++i) {
 								        const char * text = whisper_full_get_segment_text(ctx, i);
 								        fout << text;
 								    }
 								    return true;
 								}
 								// Writes every segment held by `ctx` to the file `fname` in WebVTT form:
 								// a "WEBVTT" header followed, per segment, by a "t0 --> t1" line, the
 								// segment text and an empty line.
 								// Returns false when the file cannot be opened for writing, true otherwise.
 								bool output_vtt(struct whisper_context * ctx, const char * fname) {
 								    std::ofstream fout(fname);
 								    if (!fout.is_open()) {
 								        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
 								        // this used to be `return 9`, which converts to true and therefore
 								        // reported success to the caller although nothing was written
 								        return false;
 								    }
 								    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
 								    fout << "WEBVTT\n\n";
 								    const int n_segments = whisper_full_n_segments(ctx);
 								    for (int i = 0; i < n_segments; ++i) {
 								        const char * text = whisper_full_get_segment_text(ctx, i);
 								        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
 								        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
 								        fout << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
 								        fout << text << "\n\n";
 								    }
 								    return true;
 								}
-												ref #68, #79 : fix segment time output

											
										
										
											2022-10-23 10:29:36 +00:00
+								bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
-												main : refactor subtitle output

											
										
										
											2022-10-22 17:42:11 +00:00
+								    std::ofstream fout(fname);
 								    if (!fout.is_open()) {
 								        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
 								        return false;
 								    }
 								    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
 								    const int n_segments = whisper_full_n_segments(ctx);
 								    for (int i = 0; i < n_segments; ++i) {
 								        const char * text = whisper_full_get_segment_text(ctx, i);
-												ref #68, #79 : fix segment time output

											
										
										
											2022-10-23 10:29:36 +00:00
+								        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
 								        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
 								        fout << i + 1 + params.offset_n << "\n";
-												main : fix SRT timestamp to use comma "," instead of dot "."

											
										
										
											2022-10-24 15:28:23 +00:00
+								        fout << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n";
-												ref #68, #79 : fix segment time output

											
										
										
											2022-10-23 10:29:36 +00:00
+								        fout << text << "\n\n";
-												main : refactor subtitle output

											
										
										
											2022-10-22 17:42:11 +00:00
+								    }
 								    return true;
 								}
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								// karaoke video generation
 								// outputs a bash script that uses ffmpeg to generate a video with the subtitles
-												main : add option for word-leve timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
+								// TODO: font parameter adjustments
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								    std::ofstream fout(fname);
-												main : add option for word-leve timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-												main : add option for word-leve timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								    // TODO: become parameter
 								    static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
-												main : fix generated bash script

											
										
										
											2022-11-04 16:30:38 +00:00
+								    fout << "#!/bin/bash" << "\n";
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								    fout << "\n";
-												main : add option for word-leve timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								    fout << "ffmpeg -i " << fname_inp << " -f lavfi -i color=size=1200x120:duration=" << t_sec << ":rate=25:color=black -vf \"";
-												main : add option for word-leve timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								    for (int i = 0; i < whisper_full_n_segments(ctx); i++) {
 								        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
 								        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-												main : add option for word-leve timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								        const int n = whisper_full_n_tokens(ctx, i);
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitle types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters.

											
										
										
											2022-11-02 19:18:20 +00:00
+								        std::vector<whisper_token_data> tokens(n);
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								        for (int j = 0; j < n; ++j) {
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitle types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters.

											
										
										
											2022-11-02 19:18:20 +00:00
+								            tokens[j] = whisper_full_get_token_data(ctx, i, j);
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								        }
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitle types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters.

											
										
										
											2022-11-02 19:18:20 +00:00
+								        if (i > 0) {
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								            fout << ",";
 								        }
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								        // background text
 								        fout << "drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='':enable='between(t," << t0/100.0 << "," << t0/100.0 << ")'";
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitle types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters.

											
										
										
											2022-11-02 19:18:20 +00:00
+								        bool is_first = true;
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								        for (int j = 0; j < n; ++j) {
 								            const auto & token = tokens[j];
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								            if (tokens[j].id >= whisper_token_eot(ctx)) {
 								                continue;
 								            }
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								            std::string txt_bg;
 								            std::string txt_fg; // highlight token
 								            std::string txt_ul; // underline
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								            txt_bg = "> ";
 								            txt_fg = "> ";
 								            txt_ul = "\\ \\ ";
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								            {
 								                int ncnt = 0;
 								                for (int k = 0; k < n; ++k) {
 								                    const auto & token2 = tokens[k];
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								                    if (tokens[k].id >= whisper_token_eot(ctx)) {
 								                        continue;
 								                    }
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								                    const std::string txt = whisper_token_to_str(ctx, token2.id);
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								                    txt_bg += txt;
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								                    if (k == j) {
 								                        for (int l = 0; l < (int) txt.size(); ++l) {
 								                            txt_fg += txt[l];
 								                            txt_ul += "_";
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
+								                        }
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								                        txt_fg += "|";
 								                    } else {
 								                        for (int l = 0; l < (int) txt.size(); ++l) {
 								                            txt_fg += "\\ ";
 								                            txt_ul += "\\ ";
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
+								                        }
 								                    }
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								                    ncnt += txt.size();
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
+								                }
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								                ::replace_all(txt_bg, "'", "’");
 								                ::replace_all(txt_bg, "\"", "\\\"");
 								                ::replace_all(txt_fg, "'", "’");
 								                ::replace_all(txt_fg, "\"", "\\\"");
 								            }
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitle types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters.

											
										
										
											2022-11-02 19:18:20 +00:00
+								            if (is_first) {
 								                // background text
 								                fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='" << txt_bg << "':enable='between(t," << t0/100.0 << "," << t1/100.0 << ")'";
 								                is_first = false;
 								            }
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								            // foreground text
 								            fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2:text='" << txt_fg << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								            // underline
 								            fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2+16:text='" << txt_ul << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
+								        }
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								    }
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								    fout << "\" -c:v libx264 -pix_fmt yuv420p -y " << fname_inp << ".mp4" << "\n";
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								    fout << "\n\n";
 								    fout << "echo \"Your video has been saved to " << fname_inp << ".mp4\"" << "\n";
 								    fout << "\n";
 								    fout << "echo \"  ffplay " << fname_inp << ".mp4\"\n";
 								    fout << "\n";
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								    fout.close();
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
-												main : add some comments for the word-level timestamp algorithm

											
										
										
											2022-11-01 20:35:21 +00:00
+								    fprintf(stderr, "%s: run 'source %s' to generate karaoke video\n", __func__, fname);
-												main : add option for word-level timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
 								    return true;
 								}
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
+								int main(int argc, char ** argv) {
 								    whisper_params params;
 								    if (whisper_params_parse(argc, argv, params) == false) {
 								        return 1;
 								    }
 								    if (params.seed < 0) {
 								        params.seed = time(NULL);
 								    }
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								    if (params.fname_inp.empty()) {
 								        fprintf(stderr, "error: no input files specified\n");
 								        whisper_print_usage(argc, argv, params);
-												ref #17 : add options to output result to file

Support for:

- plain text
- VTT
- SRT

											
										
										
											2022-10-08 14:22:22 +00:00
+								        return 2;
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								    }
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								    // whisper init
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								    struct whisper_context * ctx = whisper_init(params.model.c_str());
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
-												refactoring : move main + stream in examples + other stuff

											
										
										
											2022-10-25 16:13:08 +00:00
+								    if (ctx == nullptr) {
 								        fprintf(stderr, "error: failed to initialize whisper context\n");
 								        return 3;
 								    }
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
 								        const auto fname_inp = params.fname_inp[f];
 								        // WAV input
 								        std::vector<float> pcmf32;
 								        {
 								            drwav wav;
 								            if (!drwav_init_file(&wav, fname_inp.c_str(), NULL)) {
 								                fprintf(stderr, "%s: failed to open WAV file '%s' - check your input\n", argv[0], fname_inp.c_str());
 								                whisper_print_usage(argc, argv, {});
-												refactoring : move main + stream in examples + other stuff

											
										
										
											2022-10-25 16:13:08 +00:00
+								                return 4;
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								            }
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								            if (wav.channels != 1 && wav.channels != 2) {
 								                fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
-												refactoring : move main + stream in examples + other stuff

											
										
										
											2022-10-25 16:13:08 +00:00
+								                return 5;
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								            }
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								            if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
 								                fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], fname_inp.c_str());
-												refactoring : move main + stream in examples + other stuff

											
										
										
											2022-10-25 16:13:08 +00:00
+								                return 6;
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								            }
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								            if (wav.bitsPerSample != 16) {
 								                fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
-												refactoring : move main + stream in examples + other stuff

											
										
										
											2022-10-25 16:13:08 +00:00
+								                return 7;
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								            }
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								            int n = wav.totalPCMFrameCount;
-												Fix reading of stereo WAV files

											
										
										
											2022-10-01 05:41:57 +00:00
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								            std::vector<int16_t> pcm16;
 								            pcm16.resize(n*wav.channels);
 								            drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
 								            drwav_uninit(&wav);
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								            // convert to mono, float
 								            pcmf32.resize(n);
 								            if (wav.channels == 1) {
 								                for (int i = 0; i < n; i++) {
 								                    pcmf32[i] = float(pcm16[i])/32768.0f;
 								                }
 								            } else {
 								                for (int i = 0; i < n; i++) {
 								                    pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
 								                }
-												Reduce memory usage even more + better sampling

- The encode/decode memory buffers are now reused
- If the 30-sec segment goes for too long without a timestamp token, we
  force one. Improves transcription for large model
- Stereo support
- Add "micro-machines.wav" sample

											
										
										
											2022-09-30 16:33:09 +00:00
+								            }
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
+								        }
-												Print system info at start of program

											
										
										
											2022-10-27 14:22:10 +00:00
+								        // print system information
 								        {
 								            fprintf(stderr, "\n");
-												main : merge parallel example in main

											
										
										
											2022-10-29 09:26:03 +00:00
+								            fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
 								                    params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
-												Print system info at start of program

											
										
										
											2022-10-27 14:22:10 +00:00
+								        }
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								        // print some info about the processing
 								        {
-												ref #17 : print whisper logs to stderr

Only the transcribed/translted text is printed to stdout.
This way, one can redirect the result to a file.

											
										
										
											2022-10-08 14:28:06 +00:00
+								            fprintf(stderr, "\n");
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								            if (!whisper_is_multilingual(ctx)) {
 								                if (params.language != "en" || params.translate) {
 								                    params.language = "en";
 								                    params.translate = false;
-												ref #17 : print whisper logs to stderr

Only the transcribed/translted text is printed to stdout.
This way, one can redirect the result to a file.

											
										
										
											2022-10-08 14:28:06 +00:00
+								                    fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								                }
-												Flash + language support (ref #2)

- Achieved big performance improvement + memory usage reduction
- Can now translate / transcribe different languages

											
										
										
											2022-09-28 17:46:05 +00:00
+								            }
-												main : merge parallel example in main

											
										
										
											2022-10-29 09:26:03 +00:00
+								            fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d ...\n",
 								                    __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
 								                    params.n_threads, params.n_processors,
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								                    params.language.c_str(),
 								                    params.translate ? "translate" : "transcribe",
 								                    params.no_timestamps ? 0 : 1);
-												ref #17 : add options to output result to file

Support for:

- plain text
- VTT
- SRT

											
										
										
											2022-10-08 14:22:22 +00:00
-												ref #17 : print whisper logs to stderr

Only the transcribed/translted text is printed to stdout.
This way, one can redirect the result to a file.

											
										
										
											2022-10-08 14:28:06 +00:00
+								            fprintf(stderr, "\n");
-												Flash + language support (ref #2)

- Achieved big performance improvement + memory usage reduction
- Can now translate / transcribe different languages

											
										
										
											2022-09-28 17:46:05 +00:00
+								        }
-												ref #17 : add options to output result to file

Support for:

- plain text
- VTT
- SRT

											
										
										
											2022-10-08 14:22:22 +00:00
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								        // run the inference
 								        {
-												ref #57, #62, #63 : remove unions in C-api + remove designated initializers

We are not ready for designated initializers - many compilers do not
support this C++ feature yet, so removing it's non-trivial usages.

											
										
										
											2022-10-18 15:17:24 +00:00
+								            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
-												whisper : add new-segment callback

Can be used to process new segments as they are being generated.
Sample usage in main, for printing the resulting segments during the
inference.

											
										
										
											2022-10-22 18:06:50 +00:00
+								            wparams.print_realtime       = false;
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								            wparams.print_progress       = false;
 								            wparams.print_timestamps     = !params.no_timestamps;
 								            wparams.print_special_tokens = params.print_special_tokens;
 								            wparams.translate            = params.translate;
 								            wparams.language             = params.language.c_str();
 								            wparams.n_threads            = params.n_threads;
-												main : fix sampling time + add max_context parameter

											
										
										
											2022-10-29 06:42:14 +00:00
+								            wparams.n_max_text_ctx       = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
-												ref #68 : add option "-on" to specify segment index offset for SRT

Also, change option "-o" to "-ot"

											
										
										
											2022-10-21 15:14:53 +00:00
+								            wparams.offset_ms            = params.offset_t_ms;
-												ref #22 : add "duration" option

Can be used to partially process a recording

											
										
										
											2022-11-07 18:14:52 +00:00
+								            wparams.duration_ms          = params.duration_ms;
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								            wparams.token_timestamps     = params.output_wts || params.max_len > 0;
 								            wparams.thold_pt             = params.word_thold;
 								            wparams.max_len              = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
-												whisper : add new-segment callback

Can be used to process new segments as they are being generated.
Sample usage in main, for printing the resulting segments during the
inference.

											
										
										
											2022-10-22 18:06:50 +00:00
+								            // this callback is called on each new segment
 								            if (!wparams.print_realtime) {
 								                wparams.new_segment_callback           = whisper_print_segment_callback;
 								                wparams.new_segment_callback_user_data = &params;
 								            }
-												main : merge parallel example in main

											
										
										
											2022-10-29 09:26:03 +00:00
+								            if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								                fprintf(stderr, "%s: failed to process audio\n", argv[0]);
-												refactoring : move main + stream in examples + other stuff

											
										
										
											2022-10-25 16:13:08 +00:00
+								                return 8;
-												ref #22 : add option to provide multiple input .wav files

											
										
										
											2022-10-05 20:44:10 +00:00
+								            }
-												main : add option for word-leve timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
+								        }
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
-												main : add option for word-leve timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
+								        // output stuff
 								        {
-												ref #9 : add API documentation in whisper.h

											
										
										
											2022-10-08 15:09:56 +00:00
+								            printf("\n");
-												ref #17 : add options to output result to file

Support for:

- plain text
- VTT
- SRT

											
										
										
											2022-10-08 14:22:22 +00:00
 								            // output to text file
 								            if (params.output_txt) {
 								                const auto fname_txt = fname_inp + ".txt";
-												main : refactor subtitle output

											
										
										
											2022-10-22 17:42:11 +00:00
+								                output_txt(ctx, fname_txt.c_str());
-												ref #17 : add options to output result to file

Support for:

- plain text
- VTT
- SRT

											
										
										
											2022-10-08 14:22:22 +00:00
+								            }
 								            // output to VTT file
 								            if (params.output_vtt) {
 								                const auto fname_vtt = fname_inp + ".vtt";
-												main : refactor subtitle output

											
										
										
											2022-10-22 17:42:11 +00:00
+								                output_vtt(ctx, fname_vtt.c_str());
-												ref #17 : add options to output result to file

Support for:

- plain text
- VTT
- SRT

											
										
										
											2022-10-08 14:22:22 +00:00
+								            }
 								            // output to SRT file
 								            if (params.output_srt) {
 								                const auto fname_srt = fname_inp + ".srt";
-												ref #68, #79 : fix segment time output

											
										
										
											2022-10-23 10:29:36 +00:00
+								                output_srt(ctx, fname_srt.c_str(), params);
-												ref #17 : add options to output result to file

Support for:

- plain text
- VTT
- SRT

											
										
										
											2022-10-08 14:22:22 +00:00
+								            }
-												main : add option for word-leve timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
 								            // output to WTS file
 								            if (params.output_wts) {
 								                const auto fname_wts = fname_inp + ".wts";
-												whisper : token-level timestamp refactoring (#49, #120)

This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters

											
										
										
											2022-11-02 19:18:20 +00:00
+								                output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
-												main : add option for word-leve timestamps (very experimental)

											
										
										
											2022-10-30 08:05:58 +00:00
+								            }
-												ref #4 : added transcription timestamps

Can be turned off with "-nt" argument.
Performance has also improved.

											
										
										
											2022-09-29 20:09:04 +00:00
+								        }
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
+								    }
-												Initial C-style interface for whisper.cpp

											
										
										
											2022-10-04 17:35:01 +00:00
+								    whisper_print_timings(ctx);
 								    whisper_free(ctx);
-												Initial release

											
										
										
											2022-09-25 18:23:15 +00:00
 								    return 0;
 								}