mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-06-12 20:18:08 +00:00
This turned out pretty well overall. The algorithm has been moved from main.cpp to whisper.cpp and can be reused for all subtitle types. This means that you can now specify the maximum length of the generated lines: simply provide the "-ml" argument with the maximum length in number of characters.
This commit is contained in:
@ -36,6 +36,7 @@ std::string to_timestamp(int64_t t, bool comma = false) {
|
||||
return std::string(buf);
|
||||
}
|
||||
|
||||
// helper function to replace substrings
|
||||
void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
for (size_t pos = 0; ; pos += replace.length()) {
|
||||
pos = s.find(search, pos);
|
||||
@ -45,31 +46,6 @@ void replace_all(std::string & s, const std::string & search, const std::string
|
||||
}
|
||||
}
|
||||
|
||||
// a cost-function that is high for text that takes longer to pronounce
//
// Walks `text` one char (byte) at a time and returns the sum of per-character
// weights; an empty string yields 0.0f. Weights used below:
//   ' '            -> 0.01
//   ','            -> 2.00
//   '.', '!', '?'  -> 3.00
//   '0'..'9'       -> 3.00
//   anything else  -> 1.00
// NOTE(review): indexing is per byte, so each byte of a multi-byte character
// falls into the final `else` branch and contributes 1.00 on its own.
|
||||
float voice_length(const std::string & text) {
|
||||
// running total of the per-character weights
float res = 0.0f;
|
||||
|
||||
for (size_t i = 0; i < text.size(); ++i) {
|
||||
// a space adds only a very small weight
if (text[i] == ' ') {
|
||||
res += 0.01f;
|
||||
// a comma is weighted at twice an ordinary character
} else if (text[i] == ',') {
|
||||
res += 2.00f;
|
||||
// sentence-ending punctuation is weighted at three times an ordinary character
} else if (text[i] == '.') {
|
||||
res += 3.00f;
|
||||
} else if (text[i] == '!') {
|
||||
res += 3.00f;
|
||||
} else if (text[i] == '?') {
|
||||
res += 3.00f;
|
||||
// ASCII digits get the same weight as sentence-ending punctuation
} else if (text[i] >= '0' && text[i] <= '9') {
|
||||
res += 3.00f;
|
||||
// every other byte counts as one unit
} else {
|
||||
res += 1.00f;
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// command-line parameters
|
||||
struct whisper_params {
|
||||
int32_t seed = -1; // RNG seed, not used currently
|
||||
@ -78,6 +54,7 @@ struct whisper_params {
|
||||
int32_t offset_t_ms = 0;
|
||||
int32_t offset_n = 0;
|
||||
int32_t max_context = -1;
|
||||
int32_t max_len = 0;
|
||||
|
||||
float word_thold = 0.01f;
|
||||
|
||||
@ -120,6 +97,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
params.offset_n = std::stoi(argv[++i]);
|
||||
} else if (arg == "-mc" || arg == "--max-context") {
|
||||
params.max_context = std::stoi(argv[++i]);
|
||||
} else if (arg == "-ml" || arg == "--max-len") {
|
||||
params.max_len = std::stoi(argv[++i]);
|
||||
} else if (arg == "-wt" || arg == "--word-thold") {
|
||||
params.word_thold = std::stof(argv[++i]);
|
||||
} else if (arg == "-v" || arg == "--verbose") {
|
||||
@ -176,13 +155,14 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
||||
fprintf(stderr, " -ot N, --offset-t N time offset in milliseconds (default: %d)\n", params.offset_t_ms);
|
||||
fprintf(stderr, " -on N, --offset-n N segment index offset (default: %d)\n", params.offset_n);
|
||||
fprintf(stderr, " -mc N, --max-context N maximum number of text context tokens to store (default: max)\n");
|
||||
fprintf(stderr, " -ml N, --max-len N maximum segment length in characters (default: %d)\n", params.max_len);
|
||||
fprintf(stderr, " -wt N, --word-thold N word timestamp probability threshold (default: %f)\n", params.word_thold);
|
||||
fprintf(stderr, " -v, --verbose verbose output\n");
|
||||
fprintf(stderr, " --translate translate from source language to english\n");
|
||||
fprintf(stderr, " -otxt, --output-txt output result in a text file\n");
|
||||
fprintf(stderr, " -ovtt, --output-vtt output result in a vtt file\n");
|
||||
fprintf(stderr, " -osrt, --output-srt output result in a srt file\n");
|
||||
fprintf(stderr, " -owts, --output-words output word-level timestamps to a text file\n");
|
||||
fprintf(stderr, " -owts, --output-words output script for generating karaoke video\n");
|
||||
fprintf(stderr, " -ps, --print_special print special tokens\n");
|
||||
fprintf(stderr, " -pc, --print_colors print colors\n");
|
||||
fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n");
|
||||
@ -192,65 +172,67 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
void whisper_print_segment_callback(struct whisper_context * ctx, void * user_data) {
|
||||
void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
|
||||
const whisper_params & params = *(whisper_params *) user_data;
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
|
||||
// print the last segment
|
||||
const int i = n_segments - 1;
|
||||
if (i == 0) {
|
||||
// print the last n_new segments
|
||||
const int s0 = n_segments - n_new;
|
||||
if (s0 == 0) {
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
if (params.no_timestamps) {
|
||||
if (params.print_colors) {
|
||||
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
|
||||
if (params.print_special_tokens == false) {
|
||||
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
|
||||
if (id >= whisper_token_eot(ctx)) {
|
||||
continue;
|
||||
for (int i = s0; i < n_segments; i++) {
|
||||
if (params.no_timestamps) {
|
||||
if (params.print_colors) {
|
||||
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
|
||||
if (params.print_special_tokens == false) {
|
||||
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
|
||||
if (id >= whisper_token_eot(ctx)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
const char * text = whisper_full_get_token_text(ctx, i, j);
|
||||
const float p = whisper_full_get_token_p (ctx, i, j);
|
||||
|
||||
const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
|
||||
|
||||
printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
|
||||
}
|
||||
|
||||
const char * text = whisper_full_get_token_text(ctx, i, j);
|
||||
const float p = whisper_full_get_token_p (ctx, i, j);
|
||||
|
||||
const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
|
||||
|
||||
printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
|
||||
} else {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
printf("%s", text);
|
||||
}
|
||||
fflush(stdout);
|
||||
} else {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
printf("%s", text);
|
||||
}
|
||||
fflush(stdout);
|
||||
} else {
|
||||
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
||||
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
||||
|
||||
if (params.print_colors) {
|
||||
printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
|
||||
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
|
||||
if (params.print_special_tokens == false) {
|
||||
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
|
||||
if (id >= whisper_token_eot(ctx)) {
|
||||
continue;
|
||||
if (params.print_colors) {
|
||||
printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
|
||||
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
|
||||
if (params.print_special_tokens == false) {
|
||||
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
|
||||
if (id >= whisper_token_eot(ctx)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
const char * text = whisper_full_get_token_text(ctx, i, j);
|
||||
const float p = whisper_full_get_token_p (ctx, i, j);
|
||||
|
||||
const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
|
||||
|
||||
printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
|
||||
}
|
||||
printf("\n");
|
||||
} else {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
|
||||
const char * text = whisper_full_get_token_text(ctx, i, j);
|
||||
const float p = whisper_full_get_token_p (ctx, i, j);
|
||||
|
||||
const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
|
||||
|
||||
printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
|
||||
printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
|
||||
}
|
||||
printf("\n");
|
||||
} else {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
|
||||
printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -320,297 +302,41 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
|
||||
return true;
|
||||
}
|
||||
|
||||
// word-level timestamps (experimental)
|
||||
// TODO: make ffmpeg output optional
|
||||
// TODO: extra pass to detect unused speech and assign to tokens
|
||||
// karaoke video generation
|
||||
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
|
||||
// TODO: font parameter adjustments
|
||||
// TODO: move to whisper.h/whisper.cpp and add parameter to select max line-length of subtitles
|
||||
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, const std::vector<float> & pcmf32) {
|
||||
std::vector<float> pcm_avg(pcmf32.size(), 0);
|
||||
|
||||
// average the fabs of the signal
|
||||
{
|
||||
const int hw = 32;
|
||||
|
||||
for (int i = 0; i < pcmf32.size(); i++) {
|
||||
float sum = 0;
|
||||
for (int j = -hw; j <= hw; j++) {
|
||||
if (i + j >= 0 && i + j < pcmf32.size()) {
|
||||
sum += fabs(pcmf32[i + j]);
|
||||
}
|
||||
}
|
||||
pcm_avg[i] = sum/(2*hw + 1);
|
||||
}
|
||||
}
|
||||
|
||||
struct token_info {
|
||||
int64_t t0 = -1;
|
||||
int64_t t1 = -1;
|
||||
|
||||
int64_t tt0 = -1;
|
||||
int64_t tt1 = -1;
|
||||
|
||||
whisper_token id;
|
||||
whisper_token tid;
|
||||
|
||||
float p = 0.0f;
|
||||
float pt = 0.0f;
|
||||
float ptsum = 0.0f;
|
||||
|
||||
std::string text;
|
||||
float vlen = 0.0f; // voice length of this token
|
||||
};
|
||||
|
||||
int64_t t_beg = 0;
|
||||
int64_t t_last = 0;
|
||||
|
||||
whisper_token tid_last = 0;
|
||||
|
||||
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
|
||||
std::ofstream fout(fname);
|
||||
|
||||
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
||||
|
||||
// TODO: become parameter
|
||||
static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
|
||||
|
||||
fout << "!/bin/bash" << "\n";
|
||||
fout << "\n";
|
||||
|
||||
fout << "ffmpeg -i " << fname_inp << " -f lavfi -i color=size=1200x120:duration=" << float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE << ":rate=25:color=black -vf \"";
|
||||
|
||||
bool is_first = true;
|
||||
fout << "ffmpeg -i " << fname_inp << " -f lavfi -i color=size=1200x120:duration=" << t_sec << ":rate=25:color=black -vf \"";
|
||||
|
||||
for (int i = 0; i < whisper_full_n_segments(ctx); i++) {
|
||||
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
||||
|
||||
const char *text = whisper_full_get_segment_text(ctx, i);
|
||||
|
||||
const int s0 = std::max(0, (int) (t0*WHISPER_SAMPLE_RATE/100));
|
||||
const int s1 = std::min((int) pcmf32.size(), (int) (t1*WHISPER_SAMPLE_RATE/100));
|
||||
|
||||
const int n = whisper_full_n_tokens(ctx, i);
|
||||
|
||||
std::vector<token_info> tokens(n);
|
||||
|
||||
if (n <= 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::vector<whisper_token_data> tokens(n);
|
||||
for (int j = 0; j < n; ++j) {
|
||||
struct whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
|
||||
|
||||
if (j == 0) {
|
||||
if (token.id == whisper_token_beg(ctx)) {
|
||||
tokens[j ].t0 = t0;
|
||||
tokens[j ].t1 = t0;
|
||||
tokens[j + 1].t0 = t0;
|
||||
|
||||
t_beg = t0;
|
||||
t_last = t0;
|
||||
tid_last = whisper_token_beg(ctx);
|
||||
} else {
|
||||
tokens[j ].t0 = t_last;
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t tt = t_beg + 2*(token.tid - whisper_token_beg(ctx));
|
||||
|
||||
tokens[j].id = token.id;
|
||||
tokens[j].tid = token.tid;
|
||||
tokens[j].p = token.p;
|
||||
tokens[j].pt = token.pt;
|
||||
tokens[j].ptsum = token.ptsum;
|
||||
|
||||
tokens[j].text = whisper_token_to_str(ctx, token.id);
|
||||
tokens[j].vlen = voice_length(tokens[j].text);
|
||||
|
||||
if (token.pt > params.word_thold && token.ptsum > 0.01 && token.tid > tid_last && tt <= t1) {
|
||||
if (j > 0) {
|
||||
tokens[j - 1].t1 = tt;
|
||||
}
|
||||
tokens[j].t0 = tt;
|
||||
tid_last = token.tid;
|
||||
}
|
||||
tokens[j] = whisper_full_get_token_data(ctx, i, j);
|
||||
}
|
||||
|
||||
tokens[n - 2].t1 = t1;
|
||||
tokens[n - 1].t0 = t1;
|
||||
tokens[n - 1].t1 = t1;
|
||||
|
||||
t_last = t1;
|
||||
|
||||
// find intervals of tokens with unknown timestamps
|
||||
// fill the timestamps by proportionally splitting the interval based on the token voice lengths
|
||||
{
|
||||
int p0 = 0;
|
||||
int p1 = 0;
|
||||
while (true) {
|
||||
while (p1 < n && tokens[p1].t1 < 0) {
|
||||
p1++;
|
||||
}
|
||||
|
||||
if (p1 >= n) {
|
||||
p1--;
|
||||
}
|
||||
|
||||
if (p1 > p0) {
|
||||
double psum = 0.0;
|
||||
for (int j = p0; j <= p1; j++) {
|
||||
psum += tokens[j].vlen;
|
||||
}
|
||||
|
||||
//printf("analyzing %d - %d, psum = %f\n", p0, p1, psum);
|
||||
|
||||
const double dt = tokens[p1].t1 - tokens[p0].t0;
|
||||
|
||||
// split the time proportionally to the voice length
|
||||
for (int j = p0 + 1; j <= p1; j++) {
|
||||
const double ct = tokens[j - 1].t0 + dt*tokens[j - 1].vlen/psum;
|
||||
|
||||
tokens[j - 1].t1 = ct;
|
||||
tokens[j ].t0 = ct;
|
||||
}
|
||||
}
|
||||
|
||||
p1++;
|
||||
p0 = p1;
|
||||
if (p1 >= n) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fix up (just in case)
|
||||
for (int j = 0; j < n - 1; j++) {
|
||||
if (tokens[j].t1 < 0) {
|
||||
tokens[j + 1].t0 = tokens[j].t1;
|
||||
}
|
||||
|
||||
if (j > 0) {
|
||||
if (tokens[j - 1].t1 > tokens[j].t0) {
|
||||
tokens[j].t0 = tokens[j - 1].t1;
|
||||
tokens[j].t1 = std::max(tokens[j].t0, tokens[j].t1);
|
||||
}
|
||||
}
|
||||
|
||||
tokens[j].tt0 = tokens[j].t0;
|
||||
tokens[j].tt1 = tokens[j].t1;
|
||||
}
|
||||
|
||||
// VAD
|
||||
// expand or contract tokens based on voice activity
|
||||
{
|
||||
const int hw = WHISPER_SAMPLE_RATE/8;
|
||||
|
||||
for (int j = 0; j < n; j++) {
|
||||
if (tokens[j].id >= whisper_token_eot(ctx)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const int64_t t0 = tokens[j].t0;
|
||||
const int64_t t1 = tokens[j].t1;
|
||||
|
||||
int s0 = std::max(0, (int) (t0*WHISPER_SAMPLE_RATE/100));
|
||||
int s1 = std::min((int) pcmf32.size() - 1, (int) (t1*WHISPER_SAMPLE_RATE/100));
|
||||
|
||||
const int ss0 = std::max(0, (int) (t0*WHISPER_SAMPLE_RATE/100) - hw);
|
||||
const int ss1 = std::min((int) pcmf32.size() - 1, (int) (t1*WHISPER_SAMPLE_RATE/100) + hw);
|
||||
|
||||
const int n = ss1 - ss0;
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
for (int k = ss0; k < ss1; k++) {
|
||||
sum += pcm_avg[k];
|
||||
}
|
||||
|
||||
const float thold = 0.5*sum/n;
|
||||
|
||||
{
|
||||
int k = s0;
|
||||
if (pcm_avg[k] > thold && j > 0) {
|
||||
while (k > 0 && pcm_avg[k] > thold) {
|
||||
k--;
|
||||
}
|
||||
tokens[j].t0 = (int64_t) (100*k/WHISPER_SAMPLE_RATE);
|
||||
if (tokens[j].t0 < tokens[j - 1].t1) {
|
||||
tokens[j].t0 = tokens[j - 1].t1;
|
||||
} else {
|
||||
s0 = k;
|
||||
}
|
||||
} else {
|
||||
while (pcm_avg[k] < thold && k < s1) {
|
||||
k++;
|
||||
}
|
||||
s0 = k;
|
||||
tokens[j].t0 = 100*k/WHISPER_SAMPLE_RATE;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
int k = s1;
|
||||
if (pcm_avg[k] > thold) {
|
||||
while (k < (int) pcmf32.size() - 1 && pcm_avg[k] > thold) {
|
||||
k++;
|
||||
}
|
||||
tokens[j].t1 = 100*k/WHISPER_SAMPLE_RATE;
|
||||
if (j < n - 1 && tokens[j].t1 > tokens[j + 1].t0) {
|
||||
tokens[j].t1 = tokens[j + 1].t0;
|
||||
} else {
|
||||
s1 = k;
|
||||
}
|
||||
} else {
|
||||
while (pcm_avg[k] < thold && k > s0) {
|
||||
k--;
|
||||
}
|
||||
s1 = k;
|
||||
tokens[j].t1 = 100*k/WHISPER_SAMPLE_RATE;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fixed token expand (optional)
|
||||
{
|
||||
const int t_expand = 0;
|
||||
|
||||
for (int j = 0; j < n; j++) {
|
||||
if (j > 0) {
|
||||
tokens[j].t0 = std::max(0, (int) (tokens[j].t0 - t_expand));
|
||||
}
|
||||
if (j < n - 1) {
|
||||
tokens[j].t1 = tokens[j].t1 + t_expand;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// debug info
|
||||
// TODO: toggle via parameter
|
||||
for (int j = 0; j < n; ++j) {
|
||||
const auto & token = tokens[j];
|
||||
const auto tt = token.pt > params.word_thold && token.ptsum > 0.01 ? whisper_token_to_str(ctx, token.tid) : "[?]";
|
||||
printf("%s: %10s %6.3f %6.3f %6.3f %6.3f %5d %5d '%s'\n", __func__,
|
||||
tt, token.p, token.pt, token.ptsum, token.vlen, (int) token.t0, (int) token.t1, token.text.c_str());
|
||||
|
||||
if (tokens[j].id >= whisper_token_eot(ctx)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
//printf("[%s --> %s] %s\n", to_timestamp(token.t0).c_str(), to_timestamp(token.t1).c_str(), whisper_token_to_str(ctx, token.id));
|
||||
|
||||
//fout << "# " << to_timestamp(token.t0) << " --> " << to_timestamp(token.t1) << " " << whisper_token_to_str(ctx, token.id) << "\n";
|
||||
}
|
||||
|
||||
// TODO: become parameters
|
||||
static const int line_wrap = 60;
|
||||
static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
|
||||
|
||||
if (!is_first) {
|
||||
if (i > 0) {
|
||||
fout << ",";
|
||||
}
|
||||
|
||||
// background text
|
||||
fout << "drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='':enable='between(t," << t0/100.0 << "," << t0/100.0 << ")'";
|
||||
|
||||
is_first = false;
|
||||
bool is_first = true;
|
||||
|
||||
for (int j = 0; j < n; ++j) {
|
||||
const auto & token = tokens[j];
|
||||
@ -654,17 +380,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
||||
}
|
||||
|
||||
ncnt += txt.size();
|
||||
|
||||
if (ncnt > line_wrap) {
|
||||
if (k < j) {
|
||||
txt_bg = "> ";
|
||||
txt_fg = "> ";
|
||||
txt_ul = "\\ \\ ";
|
||||
ncnt = 0;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
::replace_all(txt_bg, "'", "’");
|
||||
@ -673,8 +388,11 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
||||
::replace_all(txt_fg, "\"", "\\\"");
|
||||
}
|
||||
|
||||
// background text
|
||||
fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='" << txt_bg << "':enable='between(t," << token.tt0/100.0 << "," << token.tt1/100.0 << ")'";
|
||||
if (is_first) {
|
||||
// background text
|
||||
fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='" << txt_bg << "':enable='between(t," << t0/100.0 << "," << t1/100.0 << ")'";
|
||||
is_first = false;
|
||||
}
|
||||
|
||||
// foreground text
|
||||
fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2:text='" << txt_fg << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";
|
||||
@ -815,6 +533,10 @@ int main(int argc, char ** argv) {
|
||||
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
|
||||
wparams.offset_ms = params.offset_t_ms;
|
||||
|
||||
wparams.token_timestamps = params.output_wts || params.max_len > 0;
|
||||
wparams.thold_pt = params.word_thold;
|
||||
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
||||
|
||||
// this callback is called on each new segment
|
||||
if (!wparams.print_realtime) {
|
||||
wparams.new_segment_callback = whisper_print_segment_callback;
|
||||
@ -852,7 +574,7 @@ int main(int argc, char ** argv) {
|
||||
// output to WTS file
|
||||
if (params.output_wts) {
|
||||
const auto fname_wts = fname_inp + ".wts";
|
||||
output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, pcmf32);
|
||||
output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user