mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-06-22 08:30:07 +00:00
Some checks failed
Bindings Tests (Ruby) / ubuntu-22 (push) Has been cancelled
CI / determine-tag (push) Has been cancelled
CI / ubuntu-22 (linux/amd64) (push) Has been cancelled
CI / ubuntu-22 (linux/ppc64le) (push) Has been cancelled
CI / ubuntu-22-arm64 (linux/arm64) (push) Has been cancelled
CI / ubuntu-22-arm-v7 (linux/arm/v7) (push) Has been cancelled
CI / macOS-latest (generic/platform=iOS) (push) Has been cancelled
CI / macOS-latest (generic/platform=macOS) (push) Has been cancelled
CI / macOS-latest (generic/platform=tvOS) (push) Has been cancelled
CI / ubuntu-22-gcc (linux/amd64, Debug) (push) Has been cancelled
CI / ubuntu-22-gcc (linux/amd64, Release) (push) Has been cancelled
CI / ubuntu-22-gcc (linux/ppc64le, Debug) (push) Has been cancelled
CI / ubuntu-22-gcc (linux/ppc64le, Release) (push) Has been cancelled
CI / ubuntu-22-gcc-arm64 (linux/arm64, Debug) (push) Has been cancelled
CI / ubuntu-22-gcc-arm64 (linux/arm64, Release) (push) Has been cancelled
CI / ubuntu-22-gcc-arm-v7 (linux/arm/v7, Debug) (push) Has been cancelled
CI / ubuntu-22-gcc-arm-v7 (linux/arm/v7, Release) (push) Has been cancelled
CI / ubuntu-22-clang (linux/amd64, Debug) (push) Has been cancelled
CI / ubuntu-22-clang (linux/amd64, Release) (push) Has been cancelled
CI / ubuntu-22-clang (linux/arm64, Debug) (push) Has been cancelled
CI / ubuntu-22-clang (linux/arm64, Release) (push) Has been cancelled
CI / ubuntu-22-clang (linux/ppc64le, Debug) (push) Has been cancelled
CI / ubuntu-22-clang (linux/ppc64le, Release) (push) Has been cancelled
CI / ubuntu-22-gcc-sanitized (linux/amd64, ADDRESS) (push) Has been cancelled
CI / ubuntu-22-gcc-sanitized (linux/amd64, THREAD) (push) Has been cancelled
CI / ubuntu-22-gcc-sanitized (linux/amd64, UNDEFINED) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl (linux/amd64, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl (linux/arm/v7, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl (linux/arm64, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl (linux/ppc64le, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl-fp16 (linux/amd64, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl-fp16 (linux/arm/v7, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl-fp16 (linux/arm64, icx, icpx, ON) (push) Has been cancelled
CI / ubuntu-22-cmake-sycl-fp16 (linux/ppc64le, icx, icpx, ON) (push) Has been cancelled
CI / windows-msys2 (Release, clang-x86_64, CLANG64) (push) Has been cancelled
CI / windows-msys2 (Release, ucrt-x86_64, UCRT64) (push) Has been cancelled
CI / windows (Win32, Release, win32-x86, x86, 2.28.5, ON) (push) Has been cancelled
CI / windows (x64, Release, win32-x86-64, x64, 2.28.5, ON) (push) Has been cancelled
CI / windows-blas (Win32, ON, x86, 0.3.29, Release, x86, 2.28.5, ON) (push) Has been cancelled
CI / windows-blas (x64, ON, x64_64, 0.3.29, Release, x64, 2.28.5, ON) (push) Has been cancelled
CI / windows-cublas (x64, Release, ON, 11.8.0, ON, 2.28.5) (push) Has been cancelled
CI / windows-cublas (x64, Release, ON, 12.4.0, ON, 2.28.5) (push) Has been cancelled
CI / emscripten (Release) (push) Has been cancelled
CI / ios-xcode-build (Release) (push) Has been cancelled
CI / android (push) Has been cancelled
CI / android_java (push) Has been cancelled
CI / bindings-java (push) Has been cancelled
CI / quantize (push) Has been cancelled
CI / release (push) Has been cancelled
CI / coreml-base-en (push) Has been cancelled
CI / vad (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/main-intel.Dockerfile platform:linux/amd64 tag:main-intel]) (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/main-musa.Dockerfile platform:linux/amd64 tag:main-musa]) (push) Has been cancelled
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/main.Dockerfile platform:linux/amd64 tag:main]) (push) Has been cancelled
Examples WASM / deploy-wasm-github-pages (push) Has been cancelled
This commit adds a conversion from stereo to mono in the
`read_audio_data` function of `common-whisper.cpp`.
The motivation for this change is that prior to commit
7d3da68f79
("examples : use miniaudio for
direct decoding flac, mp3, ogg and wav (#2759)"), there was a step that
read stereo int16 data -> pcm16 (448512 samples), then converted it to
mono (224256 samples), and then also converted it to stereo in `pcmf32s`.
The middle step here seems to have been missed when rewriting the code to
use Miniaudio, which caused issues when transcribing stereo audio files.
For example, currently using the audio sample in the linked issue the
output is:
```console
[00:00:00.000 --> 00:00:03.000] (speaker 1) Sous-titres réalisés para la communauté d'Amara.org
```
And with the change in this commit the output is:
```
[00:00:00.000 --> 00:00:01.500] (speaker 1) *sonnerie de téléphone*
[00:00:01.500 --> 00:00:07.000] (speaker 1) Salut jeune homme !
[00:00:07.000 --> 00:00:08.500] (speaker 0) C'est vrai que je te dérange ?
[00:00:08.500 --> 00:00:10.500] (speaker 1) Ah pas du tout, pas du tout, pas du tout !
[00:00:10.500 --> 00:00:12.500] (speaker 1) J'étais en train de...
[00:00:12.500 --> 00:00:14.500] (speaker 1) de préparer un courrier
```
Resolves: https://github.com/ggml-org/whisper.cpp/issues/3092
176 lines
5.1 KiB
C++
176 lines
5.1 KiB
C++
#define _USE_MATH_DEFINES // for M_PI
|
|
|
|
#include "common-whisper.h"
|
|
|
|
#include "common.h"
|
|
|
|
#include "whisper.h"
|
|
|
|
// third-party utilities
|
|
// use your favorite implementations
|
|
#define STB_VORBIS_HEADER_ONLY
|
|
#include "stb_vorbis.c" /* Enables Vorbis decoding. */
|
|
|
|
#ifdef _WIN32
|
|
#ifndef NOMINMAX
|
|
#define NOMINMAX
|
|
#endif
|
|
#endif
|
|
|
|
#define MA_NO_DEVICE_IO
|
|
#define MA_NO_THREADING
|
|
#define MA_NO_ENCODING
|
|
#define MA_NO_GENERATION
|
|
#define MA_NO_RESOURCE_MANAGER
|
|
#define MA_NO_NODE_GRAPH
|
|
#define MINIAUDIO_IMPLEMENTATION
|
|
#include "miniaudio.h"
|
|
|
|
#ifdef _WIN32
|
|
#include <fcntl.h>
|
|
#include <io.h>
|
|
#endif
|
|
|
|
#include <cstring>
|
|
#include <fstream>
|
|
|
|
#ifdef WHISPER_FFMPEG
|
|
// as implemented in ffmpeg_transcode.cpp; only embedded in the common lib if whisper is built with ffmpeg support
|
|
extern bool ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t> & wav_data);
|
|
#endif
|
|
|
|
bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
|
|
std::vector<uint8_t> audio_data; // used for pipe input from stdin or ffmpeg decoding output
|
|
|
|
ma_result result;
|
|
ma_decoder_config decoder_config;
|
|
ma_decoder decoder;
|
|
|
|
decoder_config = ma_decoder_config_init(ma_format_f32, stereo ? 2 : 1, WHISPER_SAMPLE_RATE);
|
|
|
|
if (fname == "-") {
|
|
#ifdef _WIN32
|
|
_setmode(_fileno(stdin), _O_BINARY);
|
|
#endif
|
|
|
|
uint8_t buf[1024];
|
|
while (true)
|
|
{
|
|
const size_t n = fread(buf, 1, sizeof(buf), stdin);
|
|
if (n == 0) {
|
|
break;
|
|
}
|
|
audio_data.insert(audio_data.end(), buf, buf + n);
|
|
}
|
|
|
|
if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) {
|
|
|
|
fprintf(stderr, "Error: failed to open audio data from stdin (%s)\n", ma_result_description(result));
|
|
|
|
return false;
|
|
}
|
|
|
|
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, audio_data.size());
|
|
}
|
|
else if (((result = ma_decoder_init_file(fname.c_str(), &decoder_config, &decoder)) != MA_SUCCESS)) {
|
|
#if defined(WHISPER_FFMPEG)
|
|
if (ffmpeg_decode_audio(fname, audio_data) != 0) {
|
|
fprintf(stderr, "error: failed to ffmpeg decode '%s'\n", fname.c_str());
|
|
|
|
return false;
|
|
}
|
|
|
|
if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) {
|
|
fprintf(stderr, "error: failed to read audio data as wav (%s)\n", ma_result_description(result));
|
|
|
|
return false;
|
|
}
|
|
#else
|
|
if ((result = ma_decoder_init_memory(fname.c_str(), fname.size(), &decoder_config, &decoder)) != MA_SUCCESS) {
|
|
fprintf(stderr, "error: failed to read audio data as wav (%s)\n", ma_result_description(result));
|
|
|
|
return false;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
ma_uint64 frame_count;
|
|
ma_uint64 frames_read;
|
|
|
|
if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) {
|
|
fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result));
|
|
|
|
return false;
|
|
}
|
|
|
|
pcmf32.resize(stereo ? frame_count*2 : frame_count);
|
|
|
|
if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) {
|
|
fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result));
|
|
|
|
return false;
|
|
}
|
|
|
|
if (stereo) {
|
|
std::vector<float> stereo_data = pcmf32;
|
|
pcmf32.resize(frame_count);
|
|
|
|
for (uint64_t i = 0; i < frame_count; i++) {
|
|
pcmf32[i] = (stereo_data[2*i] + stereo_data[2*i + 1]);
|
|
}
|
|
|
|
pcmf32s.resize(2);
|
|
pcmf32s[0].resize(frame_count);
|
|
pcmf32s[1].resize(frame_count);
|
|
for (uint64_t i = 0; i < frame_count; i++) {
|
|
pcmf32s[0][i] = stereo_data[2*i];
|
|
pcmf32s[1][i] = stereo_data[2*i + 1];
|
|
}
|
|
}
|
|
|
|
ma_decoder_uninit(&decoder);
|
|
|
|
return true;
|
|
}
|
|
|
|
// Formats a timestamp given in units of 10 ms as "HH:MM:SS.mmm"
// (or "HH:MM:SS,mmm" when `comma` is true), for example:
//    500 -> 00:00:05.000
//   6000 -> 00:01:00.000
std::string to_timestamp(int64_t t, bool comma) {
    const int64_t ms_per_sec  = 1000;
    const int64_t ms_per_min  = 60 * ms_per_sec;
    const int64_t ms_per_hour = 60 * ms_per_min;

    const int64_t total_ms = t * 10;

    // peel off each unit from the total number of milliseconds
    const int hours   = (int) (total_ms / ms_per_hour);
    const int minutes = (int) ((total_ms % ms_per_hour) / ms_per_min);
    const int seconds = (int) ((total_ms % ms_per_min) / ms_per_sec);
    const int millis  = (int) (total_ms % ms_per_sec);

    const char * separator = ".";
    if (comma) {
        separator = ",";
    }

    char formatted[32];
    snprintf(formatted, sizeof(formatted), "%02d:%02d:%02d%s%03d", hours, minutes, seconds, separator, millis);

    return std::string(formatted);
}
|
|
|
|
// Converts a timestamp `t` (in units of 1/100 s) to a sample index at `whisper_sample_rate`,
// clamped to the range [0, n_samples - 1] (0 when `n_samples` is 0).
//
// The clamping is done in 64-bit before narrowing to int, so a timestamp far beyond the end of
// the audio maps to the last sample instead of wrapping around to a negative value.
int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate) {
    const int64_t last_sample = (int64_t) n_samples - 1;
    const int64_t sample      = (t*whisper_sample_rate)/100;

    return (int) std::max<int64_t>(0, std::min<int64_t>(last_sample, sample));
}
|
|
|
|
// Writes `text` to the file at `path` and then runs "<command> <voice_id> <path>" via system().
//
// Returns false if the file cannot be opened or if the command exits with a non-zero status,
// true otherwise. A message is printed to stderr in both failure cases.
//
// NOTE(review): `command` and `path` are passed to the shell unquoted - callers are presumably
// expected to supply trusted, shell-safe values; confirm.
bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id) {
    std::ofstream out(path.c_str());
    if (out.fail()) {
        fprintf(stderr, "%s: failed to open speak_file\n", __func__);
        return false;
    }

    // the file has to be fully written and closed before the command reads it
    out.write(text.c_str(), text.size());
    out.close();

    const std::string cmdline = command + " " + std::to_string(voice_id) + " " + path;
    if (system(cmdline.c_str()) != 0) {
        fprintf(stderr, "%s: failed to speak\n", __func__);
        return false;
    }

    return true;
}
|
|
|
|
#undef STB_VORBIS_HEADER_ONLY
|
|
#include "stb_vorbis.c"
|