mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-06-12 20:18:08 +00:00
whisper : add option to speed up the audio tempo by x2
Using a Phase Vocoder for speeding up the audio tempo by scaling down the frequencies in the frequency domain. This reduces the computation in the Encoder by a factor of 2. The transcription accuracy is degraded, but for slow to normal speech - it seems to be still very good. I think this can find application for real-time transcription - i.e. the "stream" example.
This commit is contained in:
@ -59,6 +59,7 @@ struct whisper_params {
|
||||
|
||||
float word_thold = 0.01f;
|
||||
|
||||
bool speed_up = false;
|
||||
bool verbose = false;
|
||||
bool translate = false;
|
||||
bool output_txt = false;
|
||||
@ -104,6 +105,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
params.max_len = std::stoi(argv[++i]);
|
||||
} else if (arg == "-wt" || arg == "--word-thold") {
|
||||
params.word_thold = std::stof(argv[++i]);
|
||||
} else if (arg == "-su" || arg == "--speed-up") {
|
||||
params.speed_up = true;
|
||||
} else if (arg == "-v" || arg == "--verbose") {
|
||||
params.verbose = true;
|
||||
} else if (arg == "--translate") {
|
||||
@ -161,6 +164,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
||||
fprintf(stderr, " -mc N, --max-context N maximum number of text context tokens to store (default: max)\n");
|
||||
fprintf(stderr, " -ml N, --max-len N maximum segment length in characters (default: %d)\n", params.max_len);
|
||||
fprintf(stderr, " -wt N, --word-thold N word timestamp probability threshold (default: %f)\n", params.word_thold);
|
||||
fprintf(stderr, " -su, --speed-up speed up audio by factor of 2 (faster processing, reduced accuracy, default: %s)\n", params.speed_up ? "true" : "false");
|
||||
fprintf(stderr, " -v, --verbose verbose output\n");
|
||||
fprintf(stderr, " --translate translate from source language to english\n");
|
||||
fprintf(stderr, " -otxt, --output-txt output result in a text file\n");
|
||||
@ -454,7 +458,7 @@ int main(int argc, char ** argv) {
|
||||
std::vector<float> pcmf32;
|
||||
{
|
||||
drwav wav;
|
||||
|
||||
|
||||
if (fname_inp == "-") {
|
||||
std::vector<uint8_t> wav_data;
|
||||
{
|
||||
@ -563,6 +567,8 @@ int main(int argc, char ** argv) {
|
||||
wparams.thold_pt = params.word_thold;
|
||||
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
||||
|
||||
wparams.speed_up = params.speed_up;
|
||||
|
||||
// this callback is called on each new segment
|
||||
if (!wparams.print_realtime) {
|
||||
wparams.new_segment_callback = whisper_print_segment_callback;
|
||||
|
Reference in New Issue
Block a user