mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2024-12-19 04:37:51 +00:00
main : provide option for creating JSON output (#615)
* examples : provide option for exporting also as JSON file (ggerganov/whisper.cpp#614) * main : remove leftovers --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
992aa2cd1b
commit
8fcd1a3b32
@ -31,6 +31,7 @@ options:
|
||||
-osrt, --output-srt [false ] output result in a srt file
|
||||
-owts, --output-words [false ] output script for generating karaoke video
|
||||
-ocsv, --output-csv [false ] output result in a CSV file
|
||||
-oj, --output-json [false ] output result in a JSON file
|
||||
-of FNAME, --output-file FNAME [ ] output file path (without file extension)
|
||||
-ps, --print-special [false ] print special tokens
|
||||
-pc, --print-colors [false ] print colors
|
||||
|
@ -73,6 +73,7 @@ struct whisper_params {
|
||||
bool output_srt = false;
|
||||
bool output_wts = false;
|
||||
bool output_csv = false;
|
||||
bool output_jsn = false;
|
||||
bool print_special = false;
|
||||
bool print_colors = false;
|
||||
bool print_progress = false;
|
||||
@ -130,6 +131,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
|
||||
else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
|
||||
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
|
||||
else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; }
|
||||
else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
|
||||
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
||||
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
|
||||
@ -178,6 +180,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
||||
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
|
||||
fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
|
||||
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
|
||||
fprintf(stderr, " -oj, --output-json [%-7s] output result in a JSON file\n", params.output_jsn ? "true" : "false");
|
||||
fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
|
||||
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
||||
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
||||
@ -368,6 +371,129 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
|
||||
std::ofstream fout(fname);
|
||||
int indent = 0;
|
||||
|
||||
auto doindent = [&]() {
|
||||
for (int i = 0; i < indent; i++) fout << "\t";
|
||||
};
|
||||
|
||||
auto start_arr = [&](const char *name) {
|
||||
doindent();
|
||||
fout << "\"" << name << "\": [\n";
|
||||
indent++;
|
||||
};
|
||||
|
||||
auto end_arr = [&](bool end = false) {
|
||||
indent--;
|
||||
doindent();
|
||||
fout << (end ? "]\n" : "},\n");
|
||||
};
|
||||
|
||||
auto start_obj = [&](const char *name = nullptr) {
|
||||
doindent();
|
||||
if (name) {
|
||||
fout << "\"" << name << "\": {\n";
|
||||
} else {
|
||||
fout << "{\n";
|
||||
}
|
||||
indent++;
|
||||
};
|
||||
|
||||
auto end_obj = [&](bool end = false) {
|
||||
indent--;
|
||||
doindent();
|
||||
fout << (end ? "}\n" : "},\n");
|
||||
};
|
||||
|
||||
auto start_value = [&](const char *name) {
|
||||
doindent();
|
||||
fout << "\"" << name << "\": ";
|
||||
};
|
||||
|
||||
auto value_s = [&](const char *name, const char *val, bool end = false) {
|
||||
start_value(name);
|
||||
fout << "\"" << val << (end ? "\"\n" : "\",\n");
|
||||
};
|
||||
|
||||
auto end_value = [&](bool end = false) {
|
||||
fout << (end ? "\n" : ",\n");
|
||||
};
|
||||
|
||||
auto value_i = [&](const char *name, const int64_t val, bool end = false) {
|
||||
start_value(name);
|
||||
fout << val;
|
||||
end_value(end);
|
||||
};
|
||||
|
||||
auto value_b = [&](const char *name, const bool val, bool end = false) {
|
||||
start_value(name);
|
||||
fout << (val ? "true" : "false");
|
||||
end_value(end);
|
||||
};
|
||||
|
||||
if (!fout.is_open()) {
|
||||
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
|
||||
return false;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
||||
start_obj();
|
||||
value_s("systeminfo", whisper_print_system_info());
|
||||
start_obj("model");
|
||||
value_s("type", whisper_model_type_readable(ctx));
|
||||
value_b("multilingual", whisper_is_multilingual(ctx));
|
||||
value_i("vocab", whisper_model_n_vocab(ctx));
|
||||
start_obj("audio");
|
||||
value_i("ctx", whisper_model_n_audio_ctx(ctx));
|
||||
value_i("state", whisper_model_n_audio_state(ctx));
|
||||
value_i("head", whisper_model_n_audio_head(ctx));
|
||||
value_i("layer", whisper_model_n_audio_layer(ctx), true);
|
||||
end_obj();
|
||||
start_obj("text");
|
||||
value_i("ctx", whisper_model_n_text_ctx(ctx));
|
||||
value_i("state", whisper_model_n_text_state(ctx));
|
||||
value_i("head", whisper_model_n_text_head(ctx));
|
||||
value_i("leyer", whisper_model_n_text_layer(ctx), true);
|
||||
end_obj();
|
||||
value_i("mels", whisper_model_n_mels(ctx));
|
||||
value_i("f16", whisper_model_f16(ctx), true);
|
||||
end_obj();
|
||||
start_obj("params");
|
||||
value_s("model", params.model.c_str());
|
||||
value_s("language", params.language.c_str());
|
||||
value_b("translate", params.translate, true);
|
||||
end_obj();
|
||||
start_obj("result");
|
||||
value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
|
||||
end_obj();
|
||||
start_arr("transcription");
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
||||
|
||||
start_obj();
|
||||
start_obj("timestanps");
|
||||
value_s("from", to_timestamp(t0, true).c_str());
|
||||
value_s("to", to_timestamp(t1, true).c_str(), true);
|
||||
end_obj();
|
||||
start_obj("offsets");
|
||||
value_i("from", t0 * 10);
|
||||
value_i("to", t1 * 10, true);
|
||||
end_obj();
|
||||
value_s("text", text, true);
|
||||
end_obj(i == (n_segments - 1));
|
||||
}
|
||||
|
||||
end_arr(true);
|
||||
end_obj(true);
|
||||
return true;
|
||||
}
|
||||
|
||||
// karaoke video generation
|
||||
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
|
||||
// TODO: font parameter adjustments
|
||||
@ -662,6 +788,12 @@ int main(int argc, char ** argv) {
|
||||
const auto fname_csv = fname_out + ".csv";
|
||||
output_csv(ctx, fname_csv.c_str());
|
||||
}
|
||||
|
||||
// output to JSON file
|
||||
if (params.output_jsn) {
|
||||
const auto fname_jsn = fname_out + ".json";
|
||||
output_json(ctx, fname_jsn.c_str(), params);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
65
whisper.cpp
65
whisper.cpp
@ -2919,6 +2919,71 @@ int whisper_lang_auto_detect(
|
||||
return whisper_lang_auto_detect_with_state(ctx, ctx->state, offset_ms, n_threads, lang_probs);
|
||||
}
|
||||
|
||||
int whisper_model_n_vocab(struct whisper_context * ctx) {
|
||||
return ctx->model.hparams.n_vocab;
|
||||
}
|
||||
|
||||
int whisper_model_n_audio_ctx(struct whisper_context * ctx) {
|
||||
return ctx->model.hparams.n_audio_ctx;
|
||||
}
|
||||
|
||||
int whisper_model_n_audio_state(struct whisper_context * ctx) {
|
||||
return ctx->model.hparams.n_audio_state;
|
||||
}
|
||||
|
||||
int whisper_model_n_audio_head(struct whisper_context * ctx) {
|
||||
return ctx->model.hparams.n_audio_head;
|
||||
}
|
||||
|
||||
int whisper_model_n_audio_layer(struct whisper_context * ctx) {
|
||||
return ctx->model.hparams.n_audio_layer;
|
||||
}
|
||||
|
||||
int whisper_model_n_text_ctx(struct whisper_context * ctx) {
|
||||
return ctx->model.hparams.n_text_ctx;
|
||||
}
|
||||
|
||||
int whisper_model_n_text_state(struct whisper_context * ctx) {
|
||||
return ctx->model.hparams.n_text_state;
|
||||
}
|
||||
|
||||
int whisper_model_n_text_head(struct whisper_context * ctx) {
|
||||
return ctx->model.hparams.n_text_head;
|
||||
}
|
||||
|
||||
int whisper_model_n_text_layer(struct whisper_context * ctx) {
|
||||
return ctx->model.hparams.n_text_layer;
|
||||
}
|
||||
|
||||
int whisper_model_n_mels(struct whisper_context * ctx) {
|
||||
return ctx->model.hparams.n_mels;
|
||||
}
|
||||
|
||||
int whisper_model_f16(struct whisper_context * ctx) {
|
||||
return ctx->model.hparams.f16;
|
||||
}
|
||||
|
||||
int whisper_model_type(struct whisper_context * ctx) {
|
||||
return ctx->model.type;
|
||||
}
|
||||
|
||||
const char *whisper_model_type_readable(struct whisper_context * ctx) {
|
||||
switch (ctx->model.type) {
|
||||
case e_model::MODEL_TINY:
|
||||
return "tiny";
|
||||
case e_model::MODEL_BASE:
|
||||
return "base";
|
||||
case e_model::MODEL_SMALL:
|
||||
return "small";
|
||||
case e_model::MODEL_MEDIUM:
|
||||
return "medium";
|
||||
case e_model::MODEL_LARGE:
|
||||
return "large";
|
||||
default:
|
||||
return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
int whisper_n_len_from_state(struct whisper_state * state) {
|
||||
return state->mel.n_len;
|
||||
}
|
||||
|
15
whisper.h
15
whisper.h
@ -248,6 +248,19 @@ extern "C" {
|
||||
WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx);
|
||||
WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
|
||||
|
||||
WHISPER_API int whisper_model_n_vocab (struct whisper_context * ctx);
|
||||
WHISPER_API int whisper_model_n_audio_ctx (struct whisper_context * ctx);
|
||||
WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
|
||||
WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
|
||||
WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
|
||||
WHISPER_API int whisper_model_n_text_ctx (struct whisper_context * ctx);
|
||||
WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
|
||||
WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx);
|
||||
WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
|
||||
WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx);
|
||||
WHISPER_API int whisper_model_f16 (struct whisper_context * ctx);
|
||||
WHISPER_API int whisper_model_type (struct whisper_context * ctx);
|
||||
|
||||
// Token logits obtained from the last call to whisper_decode()
|
||||
// The logits for the last token are stored in the last row
|
||||
// Rows: n_tokens
|
||||
@ -257,6 +270,8 @@ extern "C" {
|
||||
|
||||
// Token Id -> String. Uses the vocabulary in the provided context
|
||||
WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
|
||||
WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
|
||||
|
||||
|
||||
// Special tokens
|
||||
WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
|
||||
|
Loading…
Reference in New Issue
Block a user