From dec40be58fb8cde60b77700a0c2f50bb179d7edd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 29 Oct 2022 14:08:23 +0300 Subject: [PATCH] parallel : print time of audio boundaries + fix timings --- whisper.cpp | 50 ++++++++++++++++++++++++++++++++++++++------------ whisper.h | 3 +++ 2 files changed, 41 insertions(+), 12 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index ff8e1b0e..235f5592 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -1910,14 +1910,19 @@ whisper_vocab::id whisper_sample_timestamp( return probs_id[0].second; } -static std::string to_timestamp(int64_t t) { - int64_t sec = t/100; - int64_t msec = t - sec*100; - int64_t min = sec/60; - sec = sec - min*60; +// 500 -> 00:00:05.000 +// 6000 -> 00:01:00.000 +std::string to_timestamp(int64_t t, bool comma = false) { + int64_t msec = t * 10; + int64_t hr = msec / (1000 * 60 * 60); + msec = msec - hr * (1000 * 60 * 60); + int64_t min = msec / (1000 * 60); + msec = msec - min * (1000 * 60); + int64_t sec = msec / 1000; + msec = msec - sec * 1000; char buf[32]; - snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec); + snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? 
"," : ".", (int) msec); return std::string(buf); } @@ -2727,25 +2732,46 @@ int whisper_full_parallel( // combine results into ctx->result_all for (int i = 0; i < n_processors - 1; ++i) { - auto & result_all = ctxs[i].result_all; + auto & results_i = ctxs[i].result_all; - for (int j = 0; j < (int) result_all.size(); ++j) { - result_all[j].t0 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t; - result_all[j].t1 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t; + for (int j = 0; j < (int) results_i.size(); ++j) { + // correct the segment timestamp taking into account the offset + results_i[j].t0 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t; + results_i[j].t1 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t; + // make sure that segments are not overlapping if (ctx->result_all.size() > 0) { - result_all[j].t0 = std::max(result_all[j].t0, ctx->result_all.back().t1); + results_i[j].t0 = std::max(results_i[j].t0, ctx->result_all.back().t1); } - ctx->result_all.push_back(std::move(result_all[j])); + ctx->result_all.push_back(std::move(results_i[j])); // call the new_segment_callback for each segment if (params.new_segment_callback) { params.new_segment_callback(ctx, params.new_segment_callback_user_data); } } + + ctx->t_mel_us += ctxs[i].t_mel_us; + ctx->t_sample_us += ctxs[i].t_sample_us; + ctx->t_encode_us += ctxs[i].t_encode_us; + ctx->t_decode_us += ctxs[i].t_decode_us; } + // average the timings + ctx->t_mel_us /= n_processors; + ctx->t_sample_us /= n_processors; + ctx->t_encode_us /= n_processors; + ctx->t_decode_us /= n_processors; + + // print information about the audio boundaries + fprintf(stderr, "\n"); + fprintf(stderr, "%s: the audio has been split into %d chunks at the following times:\n", __func__, n_processors); + for (int i = 0; i < n_processors - 1; ++i) { + fprintf(stderr, "%s: split %d - %s\n", __func__, (i + 1), to_timestamp(100*((i + 
1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t).c_str()); + } + fprintf(stderr, "%s: the transcription quality may be degraded near these boundaries\n", __func__); + return ret; } diff --git a/whisper.h b/whisper.h index 0016d0d0..cb2d7470 100644 --- a/whisper.h +++ b/whisper.h @@ -213,6 +213,9 @@ extern "C" { const float * samples, int n_samples); + // Split the input audio in chunks and process each chunk separately using whisper_full() + // It seems this approach can offer some speedup in some cases. + // However, the transcription accuracy can be worse at the beginning and end of each chunk. WHISPER_API int whisper_full_parallel( struct whisper_context * ctx, struct whisper_full_params params,