From c42f67e2d2ed906555ca2222d0622d568cec8f0f Mon Sep 17 00:00:00 2001 From: KITAITI Makoto Date: Fri, 28 Feb 2025 15:09:02 +0900 Subject: [PATCH] ruby : follow audio library change (#2851) * Enable CPU * Follow audio lib change --- .github/workflows/bindings-ruby.yml | 14 +++- bindings/ruby/ext/extconf.rb | 6 +- bindings/ruby/ext/ruby_whisper_transcribe.cpp | 84 +------------------ 3 files changed, 20 insertions(+), 84 deletions(-) diff --git a/.github/workflows/bindings-ruby.yml b/.github/workflows/bindings-ruby.yml index 94ccf835..63f7f615 100644 --- a/.github/workflows/bindings-ruby.yml +++ b/.github/workflows/bindings-ruby.yml @@ -19,7 +19,12 @@ on: - ggml/**/*.m - ggml/**/*.metal - scripts/get-flags.mk - - examples/dr_wav.h + - examples/common.h + - examples/common.cpp + - examples/common-whisper.h + - examples/common-whisper.cpp + - examples/stb_vorbis.c + - examples/miniaudio.h pull_request: paths: - bindings/ruby/** @@ -39,7 +44,12 @@ on: - ggml/**/*.m - ggml/**/*.metal - scripts/get-flags.mk - - examples/dr_wav.h + - examples/common.h + - examples/common.cpp + - examples/common-whisper.h + - examples/common-whisper.cpp + - examples/stb_vorbis.c + - examples/miniaudio.h jobs: ubuntu-22: diff --git a/bindings/ruby/ext/extconf.rb b/bindings/ruby/ext/extconf.rb index af50904d..c474d434 100644 --- a/bindings/ruby/ext/extconf.rb +++ b/bindings/ruby/ext/extconf.rb @@ -35,7 +35,7 @@ if $GGML_METAL $GGML_METAL_EMBED_LIBRARY = true end -$MK_CPPFLAGS = '-Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -Iexamples' +$MK_CPPFLAGS = '-Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -Iexamples -DGGML_USE_CPU' $MK_CFLAGS = '-std=c11 -fPIC' $MK_CXXFLAGS = '-std=c++17 -fPIC' $MK_NVCCFLAGS = '-std=c++17' @@ -171,7 +171,9 @@ $OBJ_GGML << 'ggml/src/ggml-cpu/ggml-cpu-traits.o' $OBJ_WHISPER << - 'src/whisper.o' + 'src/whisper.o' << + 'examples/common.o' << + 'examples/common-whisper.o' $objs = $OBJ_GGML + $OBJ_WHISPER + $OBJ_COMMON + $OBJ_SDL $objs << diff --git a/bindings/ruby/ext/ruby_whisper_transcribe.cpp b/bindings/ruby/ext/ruby_whisper_transcribe.cpp index d50ed063..00b9d2e1 100644 --- a/bindings/ruby/ext/ruby_whisper_transcribe.cpp +++ b/bindings/ruby/ext/ruby_whisper_transcribe.cpp @@ -1,7 +1,6 @@ #include #include "ruby_whisper.h" -#define DR_WAV_IMPLEMENTATION -#include "dr_wav.h" +#include "common-whisper.h" #include #include @@ -47,84 +46,9 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) { std::vector pcmf32; // mono-channel F32 PCM std::vector> pcmf32s; // stereo-channel F32 PCM - // WAV input - this is directly from main.cpp example - { - drwav wav; - std::vector wav_data; // used for pipe input from stdin - - if (fname_inp == "-") { - { - uint8_t buf[1024]; - while (true) { - const size_t n = fread(buf, 1, sizeof(buf), stdin); - if (n == 0) { - break; - } - wav_data.insert(wav_data.end(), buf, buf + n); - } - } - - if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) { - fprintf(stderr, "error: failed to open WAV file from stdin\n"); - return self; - } - - fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size()); - } else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) { - fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str()); - return self; - } - - if (wav.channels != 1 && wav.channels != 2) { - fprintf(stderr, "WAV file '%s' must be mono or stereo\n", fname_inp.c_str()); - return self; - } - - if (rwp->diarize && wav.channels != 2 && rwp->params.print_timestamps == false) { - fprintf(stderr, "WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", fname_inp.c_str()); - return self; - } - - if (wav.sampleRate != WHISPER_SAMPLE_RATE) { - fprintf(stderr, "WAV file '%s' must be %i kHz\n", fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000); - return self; - } - - if (wav.bitsPerSample != 16) { - fprintf(stderr, "WAV file '%s' must be 16-bit\n", fname_inp.c_str()); - return self; - } - - const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8); - - std::vector pcm16; - pcm16.resize(n*wav.channels); - drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); - drwav_uninit(&wav); - - // convert to mono, float - pcmf32.resize(n); - if (wav.channels == 1) { - for (uint64_t i = 0; i < n; i++) { - pcmf32[i] = float(pcm16[i])/32768.0f; - } - } else { - for (uint64_t i = 0; i < n; i++) { - pcmf32[i] = float((int32_t)pcm16[2*i] + pcm16[2*i + 1])/65536.0f; - } - } - - if (rwp->diarize) { - // convert to stereo, float - pcmf32s.resize(2); - - pcmf32s[0].resize(n); - pcmf32s[1].resize(n); - for (uint64_t i = 0; i < n; i++) { - pcmf32s[0][i] = float(pcm16[2*i])/32768.0f; - pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f; - } - } + if (!read_audio_data(fname_inp, pcmf32, pcmf32s, rwp->diarize)) { + fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str()); + return self; } { static bool is_aborted = false; // NOTE: this should be atomic to avoid data race