mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-01-25 22:00:25 +00:00
021eef1000
* Add tests for Whisper::Context#full * Add Whisper::Context#full * Add tests for Whisper::Error * Add document of Whisper::Context#full [skip ci] * Add additional signature for Whisper::Context#full * Add description to Whisper::Context#full * Add test for Whisper::Context#full_parallel * Add Whisper::Context#full_parallel * Hide Whisper's instance methods from Ruby code * Add class to test MemoryView * Build test class before running test * Add test for MemoryView * Make Whisper::Context#full and #full_parallel accept MemoryView * Use Ruby 3.1 on CI * Add comment on samples data type * Update README * Update README * Remove unused code
1874 lines
59 KiB
C++
1874 lines
59 KiB
C++
#include <ruby.h>
|
|
#include <ruby/memory_view.h>
|
|
#include "ruby_whisper.h"
|
|
#define DR_WAV_IMPLEMENTATION
|
|
#include "dr_wav.h"
|
|
#include <cmath>
|
|
#include <fstream>
|
|
#include <cstdio>
|
|
#include <string>
|
|
#include <thread>
|
|
#include <vector>
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
// Complete body of a boolean attribute writer for Whisper::Params.
// Fetches the wrapped ruby_whisper_params from `self`, stores false in
// params.<prop> when `value` is nil or false and true for anything else,
// then returns `value` unchanged. It declares a local `rwp` and ends with a
// `return`, so it has to be the only thing inside the function body.
#define BOOL_PARAMS_SETTER(self, prop, value) \
  ruby_whisper_params *rwp; \
  Data_Get_Struct(self, ruby_whisper_params, rwp); \
  rwp->params.prop = !(value == Qnil || value == Qfalse); \
  return value;
|
|
|
|
// Complete body of a boolean attribute reader for Whisper::Params.
// Fetches the wrapped ruby_whisper_params from `self` and returns Qtrue or
// Qfalse depending on params.<prop>. It declares a local `rwp` and ends with
// a `return`, so it has to be the only thing inside the function body.
#define BOOL_PARAMS_GETTER(self, prop) \
  ruby_whisper_params *rwp; \
  Data_Get_Struct(self, ruby_whisper_params, rwp); \
  return rwp->params.prop ? Qtrue : Qfalse;
|
|
|
|
VALUE mWhisper;
|
|
VALUE cContext;
|
|
VALUE cParams;
|
|
VALUE eError;
|
|
|
|
static ID id_to_s;
|
|
static ID id_call;
|
|
static ID id___method__;
|
|
static ID id_to_enum;
|
|
static ID id_length;
|
|
static ID id_next;
|
|
static ID id_new;
|
|
|
|
static bool is_log_callback_finalized = false;
|
|
|
|
/*
|
|
* call-seq:
|
|
* lang_max_id -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_s_lang_max_id(VALUE self) {
|
|
return INT2NUM(whisper_lang_max_id());
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* lang_id(lang_name) -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_s_lang_id(VALUE self, VALUE lang) {
|
|
const char * lang_str = StringValueCStr(lang);
|
|
const int id = whisper_lang_id(lang_str);
|
|
if (-1 == id) {
|
|
rb_raise(rb_eArgError, "language not found: %s", lang_str);
|
|
}
|
|
return INT2NUM(id);
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* lang_str(lang_id) -> String
|
|
*/
|
|
static VALUE ruby_whisper_s_lang_str(VALUE self, VALUE id) {
|
|
const int lang_id = NUM2INT(id);
|
|
const char * str = whisper_lang_str(lang_id);
|
|
if (nullptr == str) {
|
|
rb_raise(rb_eIndexError, "id %d outside of language id", lang_id);
|
|
}
|
|
return rb_str_new2(str);
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* lang_str(lang_id) -> String
|
|
*/
|
|
static VALUE ruby_whisper_s_lang_str_full(VALUE self, VALUE id) {
|
|
const int lang_id = NUM2INT(id);
|
|
const char * str_full = whisper_lang_str_full(lang_id);
|
|
if (nullptr == str_full) {
|
|
rb_raise(rb_eIndexError, "id %d outside of language id", lang_id);
|
|
}
|
|
return rb_str_new2(str_full);
|
|
}
|
|
|
|
// Finalizer installed on the log callback object by ruby_whisper_s_log_set.
// It only raises the flag that makes the native log hook stop calling into
// Ruby; the +id+ argument is not used.
static VALUE ruby_whisper_s_finalize_log_callback(VALUE self, VALUE id) {
  (void)id;
  is_log_callback_finalized = true;
  return Qnil;
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* log_set ->(level, buffer, user_data) { ... }, user_data -> nil
|
|
*/
|
|
static VALUE ruby_whisper_s_log_set(VALUE self, VALUE log_callback, VALUE user_data) {
|
|
VALUE old_callback = rb_iv_get(self, "log_callback");
|
|
if (!NIL_P(old_callback)) {
|
|
rb_undefine_finalizer(old_callback);
|
|
}
|
|
|
|
rb_iv_set(self, "log_callback", log_callback);
|
|
rb_iv_set(self, "user_data", user_data);
|
|
|
|
VALUE finalize_log_callback = rb_funcall(mWhisper, rb_intern("method"), 1, rb_str_new2("finalize_log_callback"));
|
|
rb_define_finalizer(log_callback, finalize_log_callback);
|
|
|
|
whisper_log_set([](ggml_log_level level, const char * buffer, void * user_data) {
|
|
if (is_log_callback_finalized) {
|
|
return;
|
|
}
|
|
VALUE log_callback = rb_iv_get(mWhisper, "log_callback");
|
|
VALUE udata = rb_iv_get(mWhisper, "user_data");
|
|
rb_funcall(log_callback, id_call, 3, INT2NUM(level), rb_str_new2(buffer), udata);
|
|
}, nullptr);
|
|
|
|
return Qnil;
|
|
}
|
|
|
|
// Releases the whisper context held by +rw+, if there is one, and clears the
// pointer so that a second call does nothing. The struct itself is not freed.
static void ruby_whisper_free(ruby_whisper *rw) {
  if (rw->context == NULL) {
    return;
  }
  whisper_free(rw->context);
  rw->context = NULL;
}
|
|
|
|
// Counterpart of ruby_whisper_free for parameter structs. It currently has
// nothing to release and leaves +rwp+ untouched.
static void ruby_whisper_params_free(ruby_whisper_params *rwp) {
  (void)rwp;
}
|
|
|
|
// GC mark function for wrapped ruby_whisper structs. Nothing is marked at
// the moment; any Ruby reference added to ruby_whisper has to be passed to
// rb_gc_mark here.
void rb_whisper_mark(ruby_whisper *rw) {
  (void)rw;
}
|
|
|
|
// GC free function for wrapped ruby_whisper structs: releases the whisper
// context and then the struct itself.
void rb_whisper_free(ruby_whisper *rw) {
  ruby_whisper_free(rw);
  // The struct is obtained with ALLOC() in ruby_whisper_allocate, i.e. from
  // Ruby's allocator, so it has to go back through xfree() rather than free().
  xfree(rw);
}
|
|
|
|
// Marks the three Ruby references held by a callback container (the single
// callback, the array of additional callbacks and the user data) so that the
// GC keeps them alive.
void rb_whisper_callbcack_container_mark(ruby_whisper_callback_container *rwc) {
  const VALUE held[] = { rwc->user_data, rwc->callback, rwc->callbacks };
  for (const VALUE reference : held) {
    rb_gc_mark(reference);
  }
}
|
|
|
|
// GC mark function for wrapped ruby_whisper_params structs: forwards to the
// container mark function for the new-segment, progress and abort callbacks.
void rb_whisper_params_mark(ruby_whisper_params *rwp) {
  ruby_whisper_callback_container * const containers[] = {
    rwp->new_segment_callback_container,
    rwp->progress_callback_container,
    rwp->abort_callback_container,
  };
  for (ruby_whisper_callback_container * container : containers) {
    rb_whisper_callbcack_container_mark(container);
  }
}
|
|
|
|
// GC free function for wrapped ruby_whisper_params structs.
//
// The Ruby objects referenced by the callback containers (callback,
// callbacks, user_data) belong to the garbage collector and are not touched
// here. The containers themselves are plain structs that
// ruby_whisper_params_allocate obtains through ALLOC(), so they are released
// together with the params struct; previously they were leaked.
void rb_whisper_params_free(ruby_whisper_params *rwp) {
  ruby_whisper_params_free(rwp);
  // ALLOC() memory comes from Ruby's allocator and must be returned with
  // xfree(), not free().
  xfree(rwp->new_segment_callback_container);
  xfree(rwp->progress_callback_container);
  xfree(rwp->abort_callback_container);
  xfree(rwp);
}
|
|
|
|
// Allocator for the context class: wraps a fresh ruby_whisper whose whisper
// context is still NULL. The context is loaded later by
// ruby_whisper_initialize.
static VALUE ruby_whisper_allocate(VALUE klass) {
  ruby_whisper *wrapped = ALLOC(ruby_whisper);
  wrapped->context = NULL;
  return Data_Wrap_Struct(klass, rb_whisper_mark, rb_whisper_free, wrapped);
}
|
|
|
|
// Creates an empty callback container: no context, nil callback, nil user
// data and a new empty Ruby array for additional callbacks. The caller owns
// the returned struct.
static ruby_whisper_callback_container * rb_whisper_callback_container_allocate() {
  ruby_whisper_callback_container *fresh = ALLOC(ruby_whisper_callback_container);
  fresh->context = nullptr;
  fresh->user_data = Qnil;
  fresh->callback = Qnil;
  fresh->callbacks = rb_ary_new();
  return fresh;
}
|
|
|
|
// Allocator for the params class: wraps a ruby_whisper_params initialized
// with whisper's default greedy-sampling parameters and three empty callback
// containers (new segment, progress, abort).
static VALUE ruby_whisper_params_allocate(VALUE klass) {
  ruby_whisper_params *rwp = ALLOC(ruby_whisper_params);
  rwp->params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
  // ALLOC() does not zero the memory and ruby_whisper_transcribe reads this
  // flag, so give it a defined value.
  rwp->diarize = false;

  // Each container creates a Ruby array. Until the struct is wrapped below
  // nothing marks those arrays, so keep them in guarded locals to make sure
  // an intervening GC run cannot collect them.
  rwp->new_segment_callback_container = rb_whisper_callback_container_allocate();
  VALUE new_segment_callbacks = rwp->new_segment_callback_container->callbacks;
  rwp->progress_callback_container = rb_whisper_callback_container_allocate();
  VALUE progress_callbacks = rwp->progress_callback_container->callbacks;
  rwp->abort_callback_container = rb_whisper_callback_container_allocate();
  VALUE abort_callbacks = rwp->abort_callback_container->callbacks;

  VALUE wrapped = Data_Wrap_Struct(klass, rb_whisper_params_mark, rb_whisper_params_free, rwp);

  RB_GC_GUARD(new_segment_callbacks);
  RB_GC_GUARD(progress_callbacks);
  RB_GC_GUARD(abort_callbacks);
  return wrapped;
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* new("path/to/model.bin") -> Whisper::Context
|
|
*/
|
|
static VALUE ruby_whisper_initialize(int argc, VALUE *argv, VALUE self) {
|
|
ruby_whisper *rw;
|
|
VALUE whisper_model_file_path;
|
|
|
|
// TODO: we can support init from buffer here too maybe another ruby object to expose
|
|
rb_scan_args(argc, argv, "01", &whisper_model_file_path);
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
|
|
if (!rb_respond_to(whisper_model_file_path, id_to_s)) {
|
|
rb_raise(rb_eRuntimeError, "Expected file path to model to initialize Whisper::Context");
|
|
}
|
|
rw->context = whisper_init_from_file_with_params(StringValueCStr(whisper_model_file_path), whisper_context_default_params());
|
|
if (rw->context == nullptr) {
|
|
rb_raise(rb_eRuntimeError, "error: failed to initialize whisper context");
|
|
}
|
|
return self;
|
|
}
|
|
|
|
// High level API
|
|
static VALUE rb_whisper_segment_initialize(VALUE context, int index);
|
|
|
|
/*
|
|
* transcribe a single file
|
|
* can emit to a block results
|
|
*
|
|
* params = Whisper::Params.new
|
|
* params.duration = 60_000
|
|
* whisper.transcribe "path/to/audio.wav", params do |text|
|
|
* puts text
|
|
* end
|
|
*
|
|
* call-seq:
|
|
* transcribe(path_to_audio, params) {|text| ...}
|
|
**/
|
|
static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
|
|
ruby_whisper *rw;
|
|
ruby_whisper_params *rwp;
|
|
VALUE wave_file_path, blk, params;
|
|
|
|
rb_scan_args(argc, argv, "02&", &wave_file_path, ¶ms, &blk);
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
Data_Get_Struct(params, ruby_whisper_params, rwp);
|
|
|
|
if (!rb_respond_to(wave_file_path, id_to_s)) {
|
|
rb_raise(rb_eRuntimeError, "Expected file path to wave file");
|
|
}
|
|
|
|
std::string fname_inp = StringValueCStr(wave_file_path);
|
|
|
|
std::vector<float> pcmf32; // mono-channel F32 PCM
|
|
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
|
|
|
// WAV input - this is directly from main.cpp example
|
|
{
|
|
drwav wav;
|
|
std::vector<uint8_t> wav_data; // used for pipe input from stdin
|
|
|
|
if (fname_inp == "-") {
|
|
{
|
|
uint8_t buf[1024];
|
|
while (true) {
|
|
const size_t n = fread(buf, 1, sizeof(buf), stdin);
|
|
if (n == 0) {
|
|
break;
|
|
}
|
|
wav_data.insert(wav_data.end(), buf, buf + n);
|
|
}
|
|
}
|
|
|
|
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
|
|
fprintf(stderr, "error: failed to open WAV file from stdin\n");
|
|
return self;
|
|
}
|
|
|
|
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
|
|
} else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
|
|
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
|
|
return self;
|
|
}
|
|
|
|
if (wav.channels != 1 && wav.channels != 2) {
|
|
fprintf(stderr, "WAV file '%s' must be mono or stereo\n", fname_inp.c_str());
|
|
return self;
|
|
}
|
|
|
|
if (rwp->diarize && wav.channels != 2 && rwp->params.print_timestamps == false) {
|
|
fprintf(stderr, "WAV file '%s' must be stereo for diarization and timestamps have to be enabled\n", fname_inp.c_str());
|
|
return self;
|
|
}
|
|
|
|
if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
|
|
fprintf(stderr, "WAV file '%s' must be %i kHz\n", fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000);
|
|
return self;
|
|
}
|
|
|
|
if (wav.bitsPerSample != 16) {
|
|
fprintf(stderr, "WAV file '%s' must be 16-bit\n", fname_inp.c_str());
|
|
return self;
|
|
}
|
|
|
|
const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
|
|
|
|
std::vector<int16_t> pcm16;
|
|
pcm16.resize(n*wav.channels);
|
|
drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
|
|
drwav_uninit(&wav);
|
|
|
|
// convert to mono, float
|
|
pcmf32.resize(n);
|
|
if (wav.channels == 1) {
|
|
for (uint64_t i = 0; i < n; i++) {
|
|
pcmf32[i] = float(pcm16[i])/32768.0f;
|
|
}
|
|
} else {
|
|
for (uint64_t i = 0; i < n; i++) {
|
|
pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
|
|
}
|
|
}
|
|
|
|
if (rwp->diarize) {
|
|
// convert to stereo, float
|
|
pcmf32s.resize(2);
|
|
|
|
pcmf32s[0].resize(n);
|
|
pcmf32s[1].resize(n);
|
|
for (uint64_t i = 0; i < n; i++) {
|
|
pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
|
|
pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
|
|
}
|
|
}
|
|
}
|
|
{
|
|
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
|
|
|
|
rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
|
|
bool is_aborted = *(bool*)user_data;
|
|
return !is_aborted;
|
|
};
|
|
rwp->params.encoder_begin_callback_user_data = &is_aborted;
|
|
}
|
|
|
|
if (!NIL_P(rwp->new_segment_callback_container->callback) || 0 != RARRAY_LEN(rwp->new_segment_callback_container->callbacks)) {
|
|
rwp->params.new_segment_callback = [](struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data) {
|
|
const ruby_whisper_callback_container *container = (ruby_whisper_callback_container *)user_data;
|
|
|
|
// Currently, doesn't support state because
|
|
// those require to resolve GC-related problems.
|
|
if (!NIL_P(container->callback)) {
|
|
rb_funcall(container->callback, id_call, 4, *container->context, Qnil, INT2NUM(n_new), container->user_data);
|
|
}
|
|
const long callbacks_len = RARRAY_LEN(container->callbacks);
|
|
if (0 == callbacks_len) {
|
|
return;
|
|
}
|
|
const int n_segments = whisper_full_n_segments_from_state(state);
|
|
for (int i = n_new; i > 0; i--) {
|
|
int i_segment = n_segments - i;
|
|
VALUE segment = rb_whisper_segment_initialize(*container->context, i_segment);
|
|
for (int j = 0; j < callbacks_len; j++) {
|
|
VALUE cb = rb_ary_entry(container->callbacks, j);
|
|
rb_funcall(cb, id_call, 1, segment);
|
|
}
|
|
}
|
|
};
|
|
rwp->new_segment_callback_container->context = &self;
|
|
rwp->params.new_segment_callback_user_data = rwp->new_segment_callback_container;
|
|
}
|
|
|
|
if (!NIL_P(rwp->progress_callback_container->callback) || 0 != RARRAY_LEN(rwp->progress_callback_container->callbacks)) {
|
|
rwp->params.progress_callback = [](struct whisper_context *ctx, struct whisper_state * /*state*/, int progress_cur, void *user_data) {
|
|
const ruby_whisper_callback_container *container = (ruby_whisper_callback_container *)user_data;
|
|
const VALUE progress = INT2NUM(progress_cur);
|
|
// Currently, doesn't support state because
|
|
// those require to resolve GC-related problems.
|
|
if (!NIL_P(container->callback)) {
|
|
rb_funcall(container->callback, id_call, 4, *container->context, Qnil, progress, container->user_data);
|
|
}
|
|
const long callbacks_len = RARRAY_LEN(container->callbacks);
|
|
if (0 == callbacks_len) {
|
|
return;
|
|
}
|
|
for (int j = 0; j < callbacks_len; j++) {
|
|
VALUE cb = rb_ary_entry(container->callbacks, j);
|
|
rb_funcall(cb, id_call, 1, progress);
|
|
}
|
|
};
|
|
rwp->progress_callback_container->context = &self;
|
|
rwp->params.progress_callback_user_data = rwp->progress_callback_container;
|
|
}
|
|
|
|
if (!NIL_P(rwp->abort_callback_container->callback) || 0 != RARRAY_LEN(rwp->abort_callback_container->callbacks)) {
|
|
rwp->params.abort_callback = [](void * user_data) {
|
|
const ruby_whisper_callback_container *container = (ruby_whisper_callback_container *)user_data;
|
|
if (!NIL_P(container->callback)) {
|
|
VALUE result = rb_funcall(container->callback, id_call, 1, container->user_data);
|
|
if (!NIL_P(result) && Qfalse != result) {
|
|
return true;
|
|
}
|
|
}
|
|
const long callbacks_len = RARRAY_LEN(container->callbacks);
|
|
if (0 == callbacks_len) {
|
|
return false;
|
|
}
|
|
for (int j = 0; j < callbacks_len; j++) {
|
|
VALUE cb = rb_ary_entry(container->callbacks, j);
|
|
VALUE result = rb_funcall(cb, id_call, 1, container->user_data);
|
|
if (!NIL_P(result) && Qfalse != result) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
};
|
|
rwp->abort_callback_container->context = &self;
|
|
rwp->params.abort_callback_user_data = rwp->abort_callback_container;
|
|
}
|
|
|
|
if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
|
|
fprintf(stderr, "failed to process audio\n");
|
|
return self;
|
|
}
|
|
const int n_segments = whisper_full_n_segments(rw->context);
|
|
VALUE output = rb_str_new2("");
|
|
for (int i = 0; i < n_segments; ++i) {
|
|
const char * text = whisper_full_get_segment_text(rw->context, i);
|
|
output = rb_str_concat(output, rb_str_new2(text));
|
|
}
|
|
VALUE idCall = id_call;
|
|
if (blk != Qnil) {
|
|
rb_funcall(blk, idCall, 1, output);
|
|
}
|
|
return self;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* model_n_vocab -> Integer
|
|
*/
|
|
VALUE ruby_whisper_model_n_vocab(VALUE self) {
|
|
ruby_whisper *rw;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
return INT2NUM(whisper_model_n_vocab(rw->context));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* model_n_audio_ctx -> Integer
|
|
*/
|
|
VALUE ruby_whisper_model_n_audio_ctx(VALUE self) {
|
|
ruby_whisper *rw;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
return INT2NUM(whisper_model_n_audio_ctx(rw->context));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* model_n_audio_state -> Integer
|
|
*/
|
|
VALUE ruby_whisper_model_n_audio_state(VALUE self) {
|
|
ruby_whisper *rw;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
return INT2NUM(whisper_model_n_audio_state(rw->context));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* model_n_audio_head -> Integer
|
|
*/
|
|
VALUE ruby_whisper_model_n_audio_head(VALUE self) {
|
|
ruby_whisper *rw;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
return INT2NUM(whisper_model_n_audio_head(rw->context));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* model_n_audio_layer -> Integer
|
|
*/
|
|
VALUE ruby_whisper_model_n_audio_layer(VALUE self) {
|
|
ruby_whisper *rw;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
return INT2NUM(whisper_model_n_audio_layer(rw->context));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* model_n_text_ctx -> Integer
|
|
*/
|
|
VALUE ruby_whisper_model_n_text_ctx(VALUE self) {
|
|
ruby_whisper *rw;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
return INT2NUM(whisper_model_n_text_ctx(rw->context));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* model_n_text_state -> Integer
|
|
*/
|
|
VALUE ruby_whisper_model_n_text_state(VALUE self) {
|
|
ruby_whisper *rw;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
return INT2NUM(whisper_model_n_text_state(rw->context));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* model_n_text_head -> Integer
|
|
*/
|
|
VALUE ruby_whisper_model_n_text_head(VALUE self) {
|
|
ruby_whisper *rw;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
return INT2NUM(whisper_model_n_text_head(rw->context));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* model_n_text_layer -> Integer
|
|
*/
|
|
VALUE ruby_whisper_model_n_text_layer(VALUE self) {
|
|
ruby_whisper *rw;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
return INT2NUM(whisper_model_n_text_layer(rw->context));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* model_n_mels -> Integer
|
|
*/
|
|
VALUE ruby_whisper_model_n_mels(VALUE self) {
|
|
ruby_whisper *rw;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
return INT2NUM(whisper_model_n_mels(rw->context));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* model_ftype -> Integer
|
|
*/
|
|
VALUE ruby_whisper_model_ftype(VALUE self) {
|
|
ruby_whisper *rw;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
return INT2NUM(whisper_model_ftype(rw->context));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* model_type -> String
|
|
*/
|
|
VALUE ruby_whisper_model_type(VALUE self) {
|
|
ruby_whisper *rw;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
return rb_str_new2(whisper_model_type_readable(rw->context));
|
|
}
|
|
|
|
/*
|
|
* Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
|
* Not thread safe for same context
|
|
* Uses the specified decoding strategy to obtain the text.
|
|
*
|
|
* call-seq:
|
|
* full(params, samples, n_samples) -> nil
|
|
* full(params, samples) -> nil
|
|
*
|
|
* The second argument +samples+ must be an array of samples, respond to :length, or be a MemoryView of an array of float. It must be 32 bit float PCM audio data.
|
|
*/
|
|
VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) {
|
|
if (argc < 2 || argc > 3) {
|
|
rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc);
|
|
}
|
|
|
|
ruby_whisper *rw;
|
|
ruby_whisper_params *rwp;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
VALUE params = argv[0];
|
|
Data_Get_Struct(params, ruby_whisper_params, rwp);
|
|
VALUE samples = argv[1];
|
|
int n_samples;
|
|
rb_memory_view_t view;
|
|
const bool memory_view_available_p = rb_memory_view_available_p(samples);
|
|
if (argc == 3) {
|
|
n_samples = NUM2INT(argv[2]);
|
|
if (TYPE(samples) == T_ARRAY) {
|
|
if (RARRAY_LEN(samples) < n_samples) {
|
|
rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(samples), n_samples);
|
|
}
|
|
}
|
|
// Should check when samples.respond_to?(:length)?
|
|
} else {
|
|
if (TYPE(samples) == T_ARRAY) {
|
|
n_samples = RARRAY_LEN(samples);
|
|
} else if (memory_view_available_p) {
|
|
if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) {
|
|
view.obj = Qnil;
|
|
rb_raise(rb_eArgError, "unable to get a memory view");
|
|
}
|
|
n_samples = view.byte_size / view.item_size;
|
|
} else if (rb_respond_to(samples, id_length)) {
|
|
n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
|
|
} else {
|
|
rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of flaot when n_samples is not given");
|
|
}
|
|
}
|
|
float * c_samples = (float *)malloc(n_samples * sizeof(float));
|
|
if (memory_view_available_p) {
|
|
c_samples = (float *)view.data;
|
|
} else {
|
|
if (TYPE(samples) == T_ARRAY) {
|
|
for (int i = 0; i < n_samples; i++) {
|
|
c_samples[i] = RFLOAT_VALUE(rb_ary_entry(samples, i));
|
|
}
|
|
} else {
|
|
// TODO: use rb_block_call
|
|
VALUE iter = rb_funcall(samples, id_to_enum, 1, rb_str_new2("each"));
|
|
for (int i = 0; i < n_samples; i++) {
|
|
// TODO: check if iter is exhausted and raise ArgumentError appropriately
|
|
VALUE sample = rb_funcall(iter, id_next, 0);
|
|
c_samples[i] = RFLOAT_VALUE(sample);
|
|
}
|
|
}
|
|
}
|
|
const int result = whisper_full(rw->context, rwp->params, c_samples, n_samples);
|
|
if (0 == result) {
|
|
return Qnil;
|
|
} else {
|
|
rb_exc_raise(rb_funcall(eError, id_new, 1, result));
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
|
|
* Result is stored in the default state of the context
|
|
* Not thread safe if executed in parallel on the same context.
|
|
* It seems this approach can offer some speedup in some cases.
|
|
* However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
|
*
|
|
* call-seq:
|
|
* full_parallel(params, samples) -> nil
|
|
* full_parallel(params, samples, n_samples) -> nil
|
|
* full_parallel(params, samples, n_samples, n_processors) -> nil
|
|
* full_parallel(params, samples, nil, n_processors) -> nil
|
|
*/
|
|
static VALUE ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) {
|
|
if (argc < 2 || argc > 4) {
|
|
rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc);
|
|
}
|
|
|
|
ruby_whisper *rw;
|
|
ruby_whisper_params *rwp;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
VALUE params = argv[0];
|
|
Data_Get_Struct(params, ruby_whisper_params, rwp);
|
|
VALUE samples = argv[1];
|
|
int n_samples;
|
|
int n_processors;
|
|
rb_memory_view_t view;
|
|
const bool memory_view_available_p = rb_memory_view_available_p(samples);
|
|
switch (argc) {
|
|
case 2:
|
|
n_processors = 1;
|
|
break;
|
|
case 3:
|
|
n_processors = 1;
|
|
break;
|
|
case 4:
|
|
n_processors = NUM2INT(argv[3]);
|
|
break;
|
|
}
|
|
if (argc >= 3 && !NIL_P(argv[2])) {
|
|
n_samples = NUM2INT(argv[2]);
|
|
if (TYPE(samples) == T_ARRAY) {
|
|
if (RARRAY_LEN(samples) < n_samples) {
|
|
rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(samples), n_samples);
|
|
}
|
|
}
|
|
// Should check when samples.respond_to?(:length)?
|
|
} else if (memory_view_available_p) {
|
|
if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) {
|
|
view.obj = Qnil;
|
|
rb_raise(rb_eArgError, "unable to get a memory view");
|
|
}
|
|
n_samples = view.byte_size / view.item_size;
|
|
} else {
|
|
if (TYPE(samples) == T_ARRAY) {
|
|
n_samples = RARRAY_LEN(samples);
|
|
} else if (rb_respond_to(samples, id_length)) {
|
|
n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
|
|
} else {
|
|
rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of flaot when n_samples is not given");
|
|
}
|
|
}
|
|
float * c_samples = (float *)malloc(n_samples * sizeof(float));
|
|
if (memory_view_available_p) {
|
|
c_samples = (float *)view.data;
|
|
} else {
|
|
if (TYPE(samples) == T_ARRAY) {
|
|
for (int i = 0; i < n_samples; i++) {
|
|
c_samples[i] = RFLOAT_VALUE(rb_ary_entry(samples, i));
|
|
}
|
|
} else {
|
|
// FIXME: use rb_block_call
|
|
VALUE iter = rb_funcall(samples, id_to_enum, 1, rb_str_new2("each"));
|
|
for (int i = 0; i < n_samples; i++) {
|
|
// TODO: check if iter is exhausted and raise ArgumentError
|
|
VALUE sample = rb_funcall(iter, id_next, 0);
|
|
c_samples[i] = RFLOAT_VALUE(sample);
|
|
}
|
|
}
|
|
}
|
|
const int result = whisper_full_parallel(rw->context, rwp->params, c_samples, n_samples, n_processors);
|
|
if (0 == result) {
|
|
return Qnil;
|
|
} else {
|
|
rb_exc_raise(rb_funcall(eError, id_new, 1, result));
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Number of segments.
|
|
*
|
|
* call-seq:
|
|
* full_n_segments -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_full_n_segments(VALUE self) {
|
|
ruby_whisper *rw;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
return INT2NUM(whisper_full_n_segments(rw->context));
|
|
}
|
|
|
|
/*
|
|
* Language ID, which can be converted to string by Whisper.lang_str and Whisper.lang_str_full.
|
|
*
|
|
* call-seq:
|
|
* full_lang_id -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_full_lang_id(VALUE self) {
|
|
ruby_whisper *rw;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
return INT2NUM(whisper_full_lang_id(rw->context));
|
|
}
|
|
|
|
static int ruby_whisper_full_check_segment_index(const ruby_whisper * rw, const VALUE i_segment) {
|
|
const int c_i_segment = NUM2INT(i_segment);
|
|
if (c_i_segment < 0 || c_i_segment >= whisper_full_n_segments(rw->context)) {
|
|
rb_raise(rb_eIndexError, "segment index %d out of range", c_i_segment);
|
|
}
|
|
return c_i_segment;
|
|
}
|
|
|
|
/*
|
|
* Start time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
|
|
*
|
|
* full_get_segment_t0(3) # => 1668 (16680 ms)
|
|
*
|
|
* call-seq:
|
|
* full_get_segment_t0(segment_index) -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_full_get_segment_t0(VALUE self, VALUE i_segment) {
|
|
ruby_whisper *rw;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
|
|
const int64_t t0 = whisper_full_get_segment_t0(rw->context, c_i_segment);
|
|
return INT2NUM(t0);
|
|
}
|
|
|
|
/*
|
|
* End time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
|
|
*
|
|
* full_get_segment_t1(3) # => 1668 (16680 ms)
|
|
*
|
|
* call-seq:
|
|
* full_get_segment_t1(segment_index) -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_full_get_segment_t1(VALUE self, VALUE i_segment) {
|
|
ruby_whisper *rw;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
|
|
const int64_t t1 = whisper_full_get_segment_t1(rw->context, c_i_segment);
|
|
return INT2NUM(t1);
|
|
}
|
|
|
|
/*
|
|
* Whether the next segment indexed by +segment_index+ is predicated as a speaker turn.
|
|
*
|
|
* full_get_segment_speacker_turn_next(3) # => true
|
|
*
|
|
* call-seq:
|
|
* full_get_segment_speacker_turn_next(segment_index) -> bool
|
|
*/
|
|
static VALUE ruby_whisper_full_get_segment_speaker_turn_next(VALUE self, VALUE i_segment) {
|
|
ruby_whisper *rw;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
|
|
const bool speaker_turn_next = whisper_full_get_segment_speaker_turn_next(rw->context, c_i_segment);
|
|
return speaker_turn_next ? Qtrue : Qfalse;
|
|
}
|
|
|
|
/*
|
|
* Text of a segment indexed by +segment_index+.
|
|
*
|
|
* full_get_segment_text(3) # => "ask not what your country can do for you, ..."
|
|
*
|
|
* call-seq:
|
|
* full_get_segment_text(segment_index) -> String
|
|
*/
|
|
static VALUE ruby_whisper_full_get_segment_text(VALUE self, VALUE i_segment) {
|
|
ruby_whisper *rw;
|
|
Data_Get_Struct(self, ruby_whisper, rw);
|
|
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
|
|
const char * text = whisper_full_get_segment_text(rw->context, c_i_segment);
|
|
return rb_str_new2(text);
|
|
}
|
|
|
|
/*
|
|
* params.language = "auto" | "en", etc...
|
|
*
|
|
* call-seq:
|
|
* language = lang_name -> lang_name
|
|
*/
|
|
static VALUE ruby_whisper_params_set_language(VALUE self, VALUE value) {
|
|
ruby_whisper_params *rwp;
|
|
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
|
if (value == Qfalse || value == Qnil) {
|
|
rwp->params.language = "auto";
|
|
} else {
|
|
rwp->params.language = StringValueCStr(value);
|
|
}
|
|
return value;
|
|
}
|
|
/*
|
|
* call-seq:
|
|
* language -> String
|
|
*/
|
|
static VALUE ruby_whisper_params_get_language(VALUE self) {
|
|
ruby_whisper_params *rwp;
|
|
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
|
if (rwp->params.language) {
|
|
return rb_str_new2(rwp->params.language);
|
|
} else {
|
|
return rb_str_new2("auto");
|
|
}
|
|
}
|
|
/*
|
|
* call-seq:
|
|
* translate = do_translate -> do_translate
|
|
*/
|
|
static VALUE ruby_whisper_params_set_translate(VALUE self, VALUE value) {
|
|
BOOL_PARAMS_SETTER(self, translate, value)
|
|
}
|
|
/*
|
|
* call-seq:
|
|
* translate -> bool
|
|
*/
|
|
static VALUE ruby_whisper_params_get_translate(VALUE self) {
|
|
BOOL_PARAMS_GETTER(self, translate)
|
|
}
|
|
/*
|
|
* call-seq:
|
|
* no_context = dont_use_context -> dont_use_context
|
|
*/
|
|
static VALUE ruby_whisper_params_set_no_context(VALUE self, VALUE value) {
|
|
BOOL_PARAMS_SETTER(self, no_context, value)
|
|
}
|
|
/*
|
|
* If true, does not use past transcription (if any) as initial prompt for the decoder.
|
|
*
|
|
* call-seq:
|
|
* no_context -> bool
|
|
*/
|
|
static VALUE ruby_whisper_params_get_no_context(VALUE self) {
|
|
BOOL_PARAMS_GETTER(self, no_context)
|
|
}
|
|
/*
|
|
* call-seq:
|
|
* single_segment = force_single -> force_single
|
|
*/
|
|
static VALUE ruby_whisper_params_set_single_segment(VALUE self, VALUE value) {
|
|
BOOL_PARAMS_SETTER(self, single_segment, value)
|
|
}
|
|
/*
|
|
* If true, forces single segment output (useful for streaming).
|
|
*
|
|
* call-seq:
|
|
* single_segment -> bool
|
|
*/
|
|
static VALUE ruby_whisper_params_get_single_segment(VALUE self) {
|
|
BOOL_PARAMS_GETTER(self, single_segment)
|
|
}
|
|
/*
|
|
* call-seq:
|
|
* print_special = force_print -> force_print
|
|
*/
|
|
static VALUE ruby_whisper_params_set_print_special(VALUE self, VALUE value) {
|
|
BOOL_PARAMS_SETTER(self, print_special, value)
|
|
}
|
|
/*
|
|
* If true, prints special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.).
|
|
*
|
|
* call-seq:
|
|
* print_special -> bool
|
|
*/
|
|
static VALUE ruby_whisper_params_get_print_special(VALUE self) {
|
|
BOOL_PARAMS_GETTER(self, print_special)
|
|
}
|
|
/*
|
|
* call-seq:
|
|
* print_progress = force_print -> force_print
|
|
*/
|
|
static VALUE ruby_whisper_params_set_print_progress(VALUE self, VALUE value) {
|
|
BOOL_PARAMS_SETTER(self, print_progress, value)
|
|
}
|
|
/*
|
|
* If true, prints progress information.
|
|
*
|
|
* call-seq:
|
|
* print_progress -> bool
|
|
*/
|
|
static VALUE ruby_whisper_params_get_print_progress(VALUE self) {
|
|
BOOL_PARAMS_GETTER(self, print_progress)
|
|
}
|
|
/*
|
|
* call-seq:
|
|
* print_realtime = force_print -> force_print
|
|
*/
|
|
static VALUE ruby_whisper_params_set_print_realtime(VALUE self, VALUE value) {
|
|
BOOL_PARAMS_SETTER(self, print_realtime, value)
|
|
}
|
|
/*
|
|
* If true, prints results from within whisper.cpp. (avoid it, use callback instead)
|
|
* call-seq:
|
|
* print_realtime -> bool
|
|
*/
|
|
static VALUE ruby_whisper_params_get_print_realtime(VALUE self) {
|
|
BOOL_PARAMS_GETTER(self, print_realtime)
|
|
}
|
|
/*
|
|
* call-seq:
|
|
* print_timestamps = force_print -> force_print
|
|
*/
|
|
static VALUE ruby_whisper_params_set_print_timestamps(VALUE self, VALUE value) {
|
|
BOOL_PARAMS_SETTER(self, print_timestamps, value)
|
|
}
|
|
/*
|
|
* If true, prints timestamps for each text segment when printing realtime.
|
|
*
|
|
* call-seq:
|
|
* print_timestamps -> bool
|
|
*/
|
|
static VALUE ruby_whisper_params_get_print_timestamps(VALUE self) {
|
|
BOOL_PARAMS_GETTER(self, print_timestamps)
|
|
}
|
|
/*
|
|
* call-seq:
|
|
* suppress_blank = force_suppress -> force_suppress
|
|
*/
|
|
static VALUE ruby_whisper_params_set_suppress_blank(VALUE self, VALUE value) {
|
|
BOOL_PARAMS_SETTER(self, suppress_blank, value)
|
|
}
|
|
/*
|
|
* If true, suppresses blank outputs.
|
|
*
|
|
* call-seq:
|
|
* suppress_blank -> bool
|
|
*/
|
|
static VALUE ruby_whisper_params_get_suppress_blank(VALUE self) {
  // Unwrap the native params and map the C flag onto Ruby true/false.
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  return rwp->params.suppress_blank ? Qtrue : Qfalse;
}
|
|
/*
|
|
* call-seq:
|
|
* suppress_non_speech_tokens = force_suppress -> force_suppress
|
|
*/
|
|
static VALUE ruby_whisper_params_set_suppress_non_speech_tokens(VALUE self, VALUE value) {
  // nil and false clear the flag; any other Ruby object sets it.
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  rwp->params.suppress_non_speech_tokens = RTEST(value);
  return value;
}
|
|
/*
|
|
* If true, suppresses non-speech-tokens.
|
|
*
|
|
* call-seq:
|
|
* suppress_non_speech_tokens -> bool
|
|
*/
|
|
static VALUE ruby_whisper_params_get_suppress_non_speech_tokens(VALUE self) {
  // Unwrap the native params and map the C flag onto Ruby true/false.
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  return rwp->params.suppress_non_speech_tokens ? Qtrue : Qfalse;
}
|
|
/*
|
|
* If true, enables token-level timestamps.
|
|
*
|
|
* call-seq:
|
|
* token_timestamps -> bool
|
|
*/
|
|
static VALUE ruby_whisper_params_get_token_timestamps(VALUE self) {
  // Unwrap the native params and map the C flag onto Ruby true/false.
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  return rwp->params.token_timestamps ? Qtrue : Qfalse;
}
|
|
/*
|
|
* call-seq:
|
|
* token_timestamps = force_timestamps -> force_timestamps
|
|
*/
|
|
static VALUE ruby_whisper_params_set_token_timestamps(VALUE self, VALUE value) {
  // nil and false clear the flag; any other Ruby object sets it.
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  rwp->params.token_timestamps = RTEST(value);
  return value;
}
|
|
/*
|
|
* If true, split on word rather than on token (when used with max_len).
|
|
*
|
|
* call-seq:
|
|
* split_on_word -> bool
|
|
*/
|
|
static VALUE ruby_whisper_params_get_split_on_word(VALUE self) {
  // Unwrap the native params and map the C flag onto Ruby true/false.
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  return rwp->params.split_on_word ? Qtrue : Qfalse;
}
|
|
/*
|
|
* call-seq:
|
|
* split_on_word = force_split -> force_split
|
|
*/
|
|
static VALUE ruby_whisper_params_set_split_on_word(VALUE self, VALUE value) {
  // nil and false clear the flag; any other Ruby object sets it.
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  rwp->params.split_on_word = RTEST(value);
  return value;
}
|
|
/*
|
|
* Tokens to provide to the whisper decoder as initial prompt
|
|
* these are prepended to any existing text context from a previous call
|
|
* use whisper_tokenize() to convert text to tokens.
|
|
* Maximum of whisper_n_text_ctx()/2 tokens are used (typically 224).
|
|
*
|
|
* call-seq:
|
|
* initial_prompt -> String
|
|
*/
|
|
static VALUE ruby_whisper_params_get_initial_prompt(VALUE self) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);

  // An unset prompt (null pointer) is reported to Ruby as nil.
  const char *prompt = rwp->params.initial_prompt;
  if (prompt == nullptr) {
    return Qnil;
  }
  // Otherwise copy the NUL-terminated C string into a new Ruby String.
  return rb_str_new2(prompt);
}
|
|
/*
|
|
* call-seq:
|
|
* initial_prompt = prompt -> prompt
|
|
*/
|
|
static VALUE ruby_whisper_params_set_initial_prompt(VALUE self, VALUE value) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);

  // nil resets the prompt, mirroring the getter, which reports a null
  // pointer as nil.
  if (NIL_P(value)) {
    rwp->params.initial_prompt = nullptr;
    rb_ivar_set(self, rb_intern("initial_prompt"), Qnil);
    return value;
  }

  // params.initial_prompt only borrows the String's buffer. Keep a frozen
  // copy referenced from self (hidden ivar, not visible from Ruby) so the
  // buffer is neither garbage-collected nor mutated while params points at it.
  // Raises TypeError for non-String values and ArgumentError if the string
  // contains a NUL byte, as before.
  VALUE prompt = rb_str_new_frozen(StringValue(value));
  rwp->params.initial_prompt = StringValueCStr(prompt);
  rb_ivar_set(self, rb_intern("initial_prompt"), prompt);
  // NOTE(review): GC compaction could still relocate a short (embedded)
  // string; storing an owned C copy in ruby_whisper_params would be the
  // fully robust fix, but that struct is declared outside this block.
  return value;
}
|
|
/*
|
|
* If true, enables diarization.
|
|
*
|
|
* call-seq:
|
|
* diarize -> bool
|
|
*/
|
|
static VALUE ruby_whisper_params_get_diarize(VALUE self) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // The diarize flag is read from the wrapper struct itself, not rwp->params.
  return rwp->diarize ? Qtrue : Qfalse;
}
|
|
/*
|
|
* call-seq:
|
|
* diarize = force_diarize -> force_diarize
|
|
*/
|
|
static VALUE ruby_whisper_params_set_diarize(VALUE self, VALUE value) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // nil and false turn diarization off; any other Ruby object turns it on.
  // The flag is stored on the wrapper struct itself, not in rwp->params.
  // (The stray line-continuation backslash of the previous version is gone.)
  rwp->diarize = RTEST(value);
  return value;
}
|
|
|
|
/*
|
|
* Start offset in ms.
|
|
*
|
|
* call-seq:
|
|
* offset -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_params_get_offset(VALUE self) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // Expose params.offset_ms as a Ruby Integer.
  const int offset_ms = rwp->params.offset_ms;
  return INT2NUM(offset_ms);
}
|
|
/*
|
|
* call-seq:
|
|
* offset = offset_ms -> offset_ms
|
|
*/
|
|
static VALUE ruby_whisper_params_set_offset(VALUE self, VALUE value) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // NUM2INT raises if value cannot be converted to a C int.
  const int offset_ms = NUM2INT(value);
  rwp->params.offset_ms = offset_ms;
  return value;
}
|
|
/*
|
|
* Audio duration to process in ms.
|
|
*
|
|
* call-seq:
|
|
* duration -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_params_get_duration(VALUE self) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // Expose params.duration_ms as a Ruby Integer.
  const int duration_ms = rwp->params.duration_ms;
  return INT2NUM(duration_ms);
}
|
|
/*
|
|
* call-seq:
|
|
* duration = duration_ms -> duration_ms
|
|
*/
|
|
static VALUE ruby_whisper_params_set_duration(VALUE self, VALUE value) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // NUM2INT raises if value cannot be converted to a C int.
  const int duration_ms = NUM2INT(value);
  rwp->params.duration_ms = duration_ms;
  return value;
}
|
|
|
|
/*
|
|
* Max tokens to use from past text as prompt for the decoder.
|
|
*
|
|
* call-seq:
|
|
* max_text_tokens -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_params_get_max_text_tokens(VALUE self) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // The Ruby-facing name max_text_tokens maps to params.n_max_text_ctx.
  const int n_tokens = rwp->params.n_max_text_ctx;
  return INT2NUM(n_tokens);
}
|
|
/*
|
|
* call-seq:
|
|
* max_text_tokens = n_tokens -> n_tokens
|
|
*/
|
|
static VALUE ruby_whisper_params_set_max_text_tokens(VALUE self, VALUE value) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // The Ruby-facing name max_text_tokens maps to params.n_max_text_ctx.
  // NUM2INT raises if value cannot be converted to a C int.
  const int n_tokens = NUM2INT(value);
  rwp->params.n_max_text_ctx = n_tokens;
  return value;
}
|
|
/*
|
|
* call-seq:
|
|
* temperature -> Float
|
|
*/
|
|
static VALUE ruby_whisper_params_get_temperature(VALUE self) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // Widen to double and wrap as a Ruby Float.
  const double temperature = rwp->params.temperature;
  return DBL2NUM(temperature);
}
|
|
/*
|
|
* call-seq:
|
|
* temperature = temp -> temp
|
|
*/
|
|
static VALUE ruby_whisper_params_set_temperature(VALUE self, VALUE value) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // NUM2DBL accepts any Ruby numeric (Float, Integer, Rational, ...) and
  // raises TypeError for everything else. RFLOAT_VALUE must only be applied
  // to an actual Float; with e.g. `params.temperature = 0` it would read the
  // Integer as if it were a Float object.
  rwp->params.temperature = NUM2DBL(value);
  return value;
}
|
|
/*
|
|
* See https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
|
|
*
|
|
* call-seq:
|
|
* max_initial_ts -> Float
|
|
*/
|
|
static VALUE ruby_whisper_params_get_max_initial_ts(VALUE self) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // Widen to double and wrap as a Ruby Float.
  const double max_initial_ts = rwp->params.max_initial_ts;
  return DBL2NUM(max_initial_ts);
}
|
|
/*
|
|
* call-seq:
|
|
* max_initial_ts = timestamp -> timestamp
|
|
*/
|
|
static VALUE ruby_whisper_params_set_max_initial_ts(VALUE self, VALUE value) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // NUM2DBL accepts any Ruby numeric and raises TypeError otherwise;
  // RFLOAT_VALUE is only valid on an actual Float object.
  rwp->params.max_initial_ts = NUM2DBL(value);
  return value;
}
|
|
/*
|
|
* call-seq:
|
|
* length_penalty -> Float
|
|
*/
|
|
static VALUE ruby_whisper_params_get_length_penalty(VALUE self) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // Widen to double and wrap as a Ruby Float.
  const double length_penalty = rwp->params.length_penalty;
  return DBL2NUM(length_penalty);
}
|
|
/*
|
|
* call-seq:
|
|
* length_penalty = penalty -> penalty
|
|
*/
|
|
static VALUE ruby_whisper_params_set_length_penalty(VALUE self, VALUE value) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // NUM2DBL accepts any Ruby numeric and raises TypeError otherwise;
  // RFLOAT_VALUE is only valid on an actual Float object.
  rwp->params.length_penalty = NUM2DBL(value);
  return value;
}
|
|
/*
|
|
* call-seq:
|
|
* temperature_inc -> Float
|
|
*/
|
|
static VALUE ruby_whisper_params_get_temperature_inc(VALUE self) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // Widen to double and wrap as a Ruby Float.
  const double temperature_inc = rwp->params.temperature_inc;
  return DBL2NUM(temperature_inc);
}
|
|
/*
|
|
* call-seq:
|
|
* temperature_inc = inc -> inc
|
|
*/
|
|
static VALUE ruby_whisper_params_set_temperature_inc(VALUE self, VALUE value) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // NUM2DBL accepts any Ruby numeric and raises TypeError otherwise;
  // RFLOAT_VALUE is only valid on an actual Float object.
  rwp->params.temperature_inc = NUM2DBL(value);
  return value;
}
|
|
/*
|
|
* Similar to OpenAI's "compression_ratio_threshold"
|
|
*
|
|
* call-seq:
|
|
* entropy_thold -> Float
|
|
*/
|
|
static VALUE ruby_whisper_params_get_entropy_thold(VALUE self) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // Widen to double and wrap as a Ruby Float.
  const double entropy_thold = rwp->params.entropy_thold;
  return DBL2NUM(entropy_thold);
}
|
|
/*
|
|
* call-seq:
|
|
* entropy_thold = threshold -> threshold
|
|
*/
|
|
static VALUE ruby_whisper_params_set_entropy_thold(VALUE self, VALUE value) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // NUM2DBL accepts any Ruby numeric and raises TypeError otherwise;
  // RFLOAT_VALUE is only valid on an actual Float object.
  rwp->params.entropy_thold = NUM2DBL(value);
  return value;
}
|
|
/*
|
|
* call-seq:
|
|
* logprob_thold -> Float
|
|
*/
|
|
static VALUE ruby_whisper_params_get_logprob_thold(VALUE self) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // Widen to double and wrap as a Ruby Float.
  const double logprob_thold = rwp->params.logprob_thold;
  return DBL2NUM(logprob_thold);
}
|
|
/*
|
|
* call-seq:
|
|
* logprob_thold = threshold -> threshold
|
|
*/
|
|
static VALUE ruby_whisper_params_set_logprob_thold(VALUE self, VALUE value) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // NUM2DBL accepts any Ruby numeric and raises TypeError otherwise;
  // RFLOAT_VALUE is only valid on an actual Float object.
  rwp->params.logprob_thold = NUM2DBL(value);
  return value;
}
|
|
/*
|
|
* Sets new segment callback, called for every newly generated text segment.
|
|
*
|
|
* params.new_segment_callback = ->(context, _, n_new, user_data) {
|
|
* # ...
|
|
* }
|
|
*
|
|
* call-seq:
|
|
* new_segment_callback = callback -> callback
|
|
*/
|
|
static VALUE ruby_whisper_params_set_new_segment_callback(VALUE self, VALUE value) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);

  // The object is stored as-is in the new-segment container; no type or
  // arity check is performed here.
  rwp->new_segment_callback_container->callback = value;

  return value;
}
|
|
/*
|
|
* Sets user data passed to the last argument of new segment callback.
|
|
*
|
|
* call-seq:
|
|
* new_segment_callback_user_data = user_data -> user_data
|
|
*/
|
|
static VALUE ruby_whisper_params_set_new_segment_callback_user_data(VALUE self, VALUE value) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);

  // Any Ruby object is accepted and stored in the new-segment container.
  rwp->new_segment_callback_container->user_data = value;

  return value;
}
|
|
/*
|
|
* Sets progress callback, called on each progress update.
|
|
*
|
|
* params.progress_callback = ->(context, _, progress, user_data) {
|
|
* # ...
|
|
* }
|
|
*
|
|
* call-seq:
|
|
* progress_callback = callback -> callback
|
|
*/
|
|
static VALUE ruby_whisper_params_set_progress_callback(VALUE self, VALUE value) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);

  // The object is stored as-is in the progress container; no type or
  // arity check is performed here.
  rwp->progress_callback_container->callback = value;

  return value;
}
|
|
/*
|
|
* Sets user data passed to the last argument of progress callback.
|
|
*
|
|
* call-seq:
|
|
* progress_callback_user_data = user_data -> user_data
|
|
*/
|
|
static VALUE ruby_whisper_params_set_progress_callback_user_data(VALUE self, VALUE value) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);

  // Any Ruby object is accepted and stored in the progress container.
  rwp->progress_callback_container->user_data = value;

  return value;
}
|
|
/*
|
|
* Sets abort callback, called to check if the process should be aborted.
|
|
*
|
|
* params.abort_callback = ->(user_data) {
|
|
* # ...
|
|
* }
|
|
*
|
|
* call-seq:
|
|
* abort_callback = callback -> callback
|
|
*/
|
|
static VALUE ruby_whisper_params_set_abort_callback(VALUE self, VALUE value) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);

  // The object is stored as-is in the abort container; no type or
  // arity check is performed here.
  rwp->abort_callback_container->callback = value;

  return value;
}
|
|
/*
|
|
* Sets user data passed to the last argument of abort callback.
|
|
*
|
|
* call-seq:
|
|
* abort_callback_user_data = user_data -> user_data
|
|
*/
|
|
static VALUE ruby_whisper_params_set_abort_callback_user_data(VALUE self, VALUE value) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);

  // Any Ruby object is accepted and stored in the abort container.
  rwp->abort_callback_container->user_data = value;

  return value;
}
|
|
|
|
// High level API
|
|
|
|
typedef struct {
|
|
VALUE context;
|
|
int index;
|
|
} ruby_whisper_segment;
|
|
|
|
typedef struct {
|
|
VALUE context;
|
|
} ruby_whisper_model;
|
|
|
|
// Ruby class objects for the high-level API; assigned in Init_whisper.
VALUE cSegment; // Whisper::Segment
VALUE cModel;   // Whisper::Model
|
|
|
|
static void rb_whisper_segment_mark(ruby_whisper_segment *rws) {
|
|
rb_gc_mark(rws->context);
|
|
}
|
|
|
|
static VALUE ruby_whisper_segment_allocate(VALUE klass) {
  ruby_whisper_segment *rws;
  rws = ALLOC(ruby_whisper_segment);
  // ALLOC does not clear memory. Give the fields defined values so that the
  // mark function never hands an indeterminate VALUE to the GC when an
  // object is allocated but not (yet) initialized, e.g. through
  // Whisper::Segment.allocate.
  rws->context = Qnil;
  rws->index = 0;
  return Data_Wrap_Struct(klass, rb_whisper_segment_mark, RUBY_DEFAULT_FREE, rws);
}
|
|
|
|
// Builds a Whisper::Segment that refers to segment `index` of `context`
// (a Whisper::Context object). Called from C only; returns the new object.
static VALUE rb_whisper_segment_initialize(VALUE context, int index) {
  ruby_whisper_segment *rws;
  const VALUE segment = ruby_whisper_segment_allocate(cSegment);
  Data_Get_Struct(segment, ruby_whisper_segment, rws);
  rws->context = context;
  rws->index = index;
  return segment;
}
|
|
|
|
/*
|
|
* Yields each Whisper::Segment:
|
|
*
|
|
* whisper.transcribe("path/to/audio.wav", params)
|
|
* whisper.each_segment do |segment|
|
|
* puts segment.text
|
|
* end
|
|
*
|
|
* Returns an Enumerator if no block given:
|
|
*
|
|
* whisper.transcribe("path/to/audio.wav", params)
|
|
* enum = whisper.each_segment
|
|
* enum.to_a # => [#<Whisper::Segment>, ...]
|
|
*
|
|
* call-seq:
|
|
* each_segment {|segment| ... }
|
|
* each_segment -> Enumerator
|
|
*/
|
|
static VALUE ruby_whisper_each_segment(VALUE self) {
  // Without a block, hand back an Enumerator for this same method.
  if (!rb_block_given_p()) {
    return rb_funcall(self, id_to_enum, 1, rb_funcall(self, id___method__, 0));
  }

  ruby_whisper *rw;
  Data_Get_Struct(self, ruby_whisper, rw);

  // The segment count is read once, before the first yield.
  const int total = whisper_full_n_segments(rw->context);
  int idx = 0;
  while (idx < total) {
    // Each yielded Segment refers back to self and its index.
    rb_yield(rb_whisper_segment_initialize(self, idx));
    ++idx;
  }

  return self;
}
|
|
|
|
/*
|
|
* Hook called on new segment. Yields each Whisper::Segment.
|
|
*
|
|
* params.on_new_segment do |segment|
|
|
* # ...
|
|
* end
|
|
*
|
|
* call-seq:
|
|
* on_new_segment {|segment| ... }
|
|
*/
|
|
static VALUE ruby_whisper_params_on_new_segment(VALUE self) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // Capture the caller's block as a Proc and append it to the list of
  // new-segment hooks. Always returns nil.
  rb_ary_push(rwp->new_segment_callback_container->callbacks, rb_block_proc());
  return Qnil;
}
|
|
|
|
/*
|
|
* Hook called on progress update. Yields each progress Integer between 0 and 100.
|
|
*
|
|
* params.on_progress do |progress|
|
|
* # ...
|
|
* end
|
|
*
|
|
* call-seq:
|
|
* on_progress {|progress| ... }
|
|
*/
|
|
static VALUE ruby_whisper_params_on_progress(VALUE self) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // Capture the caller's block as a Proc and append it to the list of
  // progress hooks. Always returns nil.
  rb_ary_push(rwp->progress_callback_container->callbacks, rb_block_proc());
  return Qnil;
}
|
|
|
|
/*
|
|
* Calls the block to decide whether to abort. Return +true+ from the block when you want to abort.
|
|
*
|
|
* params.abort_on do
|
|
* if some_condition
|
|
* true # abort
|
|
* else
|
|
* false # continue
|
|
* end
|
|
* end
|
|
*
|
|
* call-seq:
|
|
* abort_on { ... }
|
|
*/
|
|
static VALUE ruby_whisper_params_abort_on(VALUE self) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
  // Capture the caller's block as a Proc and append it to the list of
  // abort hooks. Always returns nil.
  rb_ary_push(rwp->abort_callback_container->callbacks, rb_block_proc());
  return Qnil;
}
|
|
|
|
/*
|
|
* Start time in milliseconds.
|
|
*
|
|
* call-seq:
|
|
* start_time -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_segment_get_start_time(VALUE self) {
  // Resolve the segment, then the Context it belongs to.
  ruby_whisper_segment *rws;
  Data_Get_Struct(self, ruby_whisper_segment, rws);
  ruby_whisper *rw;
  Data_Get_Struct(rws->context, ruby_whisper, rw);
  const int64_t t0 = whisper_full_get_segment_t0(rw->context, rws->index);
  // able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
  // The product is 64-bit, so convert with LL2NUM; INT2NUM takes a C int and
  // would silently narrow the value first.
  return LL2NUM(t0 * 10);
}
|
|
|
|
/*
|
|
* End time in milliseconds.
|
|
*
|
|
* call-seq:
|
|
* end_time -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_segment_get_end_time(VALUE self) {
  // Resolve the segment, then the Context it belongs to.
  ruby_whisper_segment *rws;
  Data_Get_Struct(self, ruby_whisper_segment, rws);
  ruby_whisper *rw;
  Data_Get_Struct(rws->context, ruby_whisper, rw);
  const int64_t t1 = whisper_full_get_segment_t1(rw->context, rws->index);
  // able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
  // The product is 64-bit, so convert with LL2NUM; INT2NUM takes a C int and
  // would silently narrow the value first.
  return LL2NUM(t1 * 10);
}
|
|
|
|
/*
|
|
* Whether the next segment is predicted as a speaker turn.
|
|
*
|
|
* call-seq:
|
|
* speaker_next_turn? -> bool
|
|
*/
|
|
static VALUE ruby_whisper_segment_get_speaker_turn_next(VALUE self) {
  // Resolve the segment, then the Context it belongs to.
  ruby_whisper_segment *segment;
  Data_Get_Struct(self, ruby_whisper_segment, segment);
  ruby_whisper *owner;
  Data_Get_Struct(segment->context, ruby_whisper, owner);

  if (whisper_full_get_segment_speaker_turn_next(owner->context, segment->index)) {
    return Qtrue;
  }
  return Qfalse;
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* text -> String
|
|
*/
|
|
static VALUE ruby_whisper_segment_get_text(VALUE self) {
  // Resolve the segment, then the Context it belongs to.
  ruby_whisper_segment *segment;
  Data_Get_Struct(self, ruby_whisper_segment, segment);
  ruby_whisper *owner;
  Data_Get_Struct(segment->context, ruby_whisper, owner);

  // Copy the C string returned by whisper into a new Ruby String.
  return rb_str_new2(whisper_full_get_segment_text(owner->context, segment->index));
}
|
|
|
|
static void rb_whisper_model_mark(ruby_whisper_model *rwm) {
|
|
rb_gc_mark(rwm->context);
|
|
}
|
|
|
|
static VALUE ruby_whisper_model_allocate(VALUE klass) {
  ruby_whisper_model *rwm;
  rwm = ALLOC(ruby_whisper_model);
  // ALLOC does not clear memory. Give the field a defined value so that the
  // mark function never hands an indeterminate VALUE to the GC when an
  // object is allocated but not (yet) initialized, e.g. through
  // Whisper::Model.allocate.
  rwm->context = Qnil;
  return Data_Wrap_Struct(klass, rb_whisper_model_mark, RUBY_DEFAULT_FREE, rwm);
}
|
|
|
|
// Builds a Whisper::Model that refers to `context` (a Whisper::Context
// object). Called from C only; returns the new object.
static VALUE rb_whisper_model_initialize(VALUE context) {
  ruby_whisper_model *rwm;
  const VALUE model = ruby_whisper_model_allocate(cModel);
  Data_Get_Struct(model, ruby_whisper_model, rwm);
  rwm->context = context;
  return model;
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* model -> Whisper::Model
|
|
*/
|
|
static VALUE ruby_whisper_get_model(VALUE self) {
  // Every call wraps the receiver in a newly allocated Whisper::Model.
  const VALUE model = rb_whisper_model_initialize(self);
  return model;
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* n_vocab -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_c_model_n_vocab(VALUE self) {
  // A Model only references its Context; resolve that first.
  ruby_whisper_model *model;
  Data_Get_Struct(self, ruby_whisper_model, model);
  ruby_whisper *owner;
  Data_Get_Struct(model->context, ruby_whisper, owner);

  const int n_vocab = whisper_model_n_vocab(owner->context);
  return INT2NUM(n_vocab);
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* n_audio_ctx -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_c_model_n_audio_ctx(VALUE self) {
  // A Model only references its Context; resolve that first.
  ruby_whisper_model *model;
  Data_Get_Struct(self, ruby_whisper_model, model);
  ruby_whisper *owner;
  Data_Get_Struct(model->context, ruby_whisper, owner);

  const int n_audio_ctx = whisper_model_n_audio_ctx(owner->context);
  return INT2NUM(n_audio_ctx);
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* n_audio_state -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_c_model_n_audio_state(VALUE self) {
  // A Model only references its Context; resolve that first.
  ruby_whisper_model *model;
  Data_Get_Struct(self, ruby_whisper_model, model);
  ruby_whisper *owner;
  Data_Get_Struct(model->context, ruby_whisper, owner);

  const int n_audio_state = whisper_model_n_audio_state(owner->context);
  return INT2NUM(n_audio_state);
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* n_audio_head -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_c_model_n_audio_head(VALUE self) {
  // A Model only references its Context; resolve that first.
  ruby_whisper_model *model;
  Data_Get_Struct(self, ruby_whisper_model, model);
  ruby_whisper *owner;
  Data_Get_Struct(model->context, ruby_whisper, owner);

  const int n_audio_head = whisper_model_n_audio_head(owner->context);
  return INT2NUM(n_audio_head);
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* n_audio_layer -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_c_model_n_audio_layer(VALUE self) {
  // A Model only references its Context; resolve that first.
  ruby_whisper_model *model;
  Data_Get_Struct(self, ruby_whisper_model, model);
  ruby_whisper *owner;
  Data_Get_Struct(model->context, ruby_whisper, owner);

  const int n_audio_layer = whisper_model_n_audio_layer(owner->context);
  return INT2NUM(n_audio_layer);
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* n_text_ctx -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_c_model_n_text_ctx(VALUE self) {
  // A Model only references its Context; resolve that first.
  ruby_whisper_model *model;
  Data_Get_Struct(self, ruby_whisper_model, model);
  ruby_whisper *owner;
  Data_Get_Struct(model->context, ruby_whisper, owner);

  const int n_text_ctx = whisper_model_n_text_ctx(owner->context);
  return INT2NUM(n_text_ctx);
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* n_text_state -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_c_model_n_text_state(VALUE self) {
  // A Model only references its Context; resolve that first.
  ruby_whisper_model *model;
  Data_Get_Struct(self, ruby_whisper_model, model);
  ruby_whisper *owner;
  Data_Get_Struct(model->context, ruby_whisper, owner);

  const int n_text_state = whisper_model_n_text_state(owner->context);
  return INT2NUM(n_text_state);
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* n_text_head -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_c_model_n_text_head(VALUE self) {
  // A Model only references its Context; resolve that first.
  ruby_whisper_model *model;
  Data_Get_Struct(self, ruby_whisper_model, model);
  ruby_whisper *owner;
  Data_Get_Struct(model->context, ruby_whisper, owner);

  const int n_text_head = whisper_model_n_text_head(owner->context);
  return INT2NUM(n_text_head);
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* n_text_layer -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_c_model_n_text_layer(VALUE self) {
  // A Model only references its Context; resolve that first.
  ruby_whisper_model *model;
  Data_Get_Struct(self, ruby_whisper_model, model);
  ruby_whisper *owner;
  Data_Get_Struct(model->context, ruby_whisper, owner);

  const int n_text_layer = whisper_model_n_text_layer(owner->context);
  return INT2NUM(n_text_layer);
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* n_mels -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_c_model_n_mels(VALUE self) {
  // A Model only references its Context; resolve that first.
  ruby_whisper_model *model;
  Data_Get_Struct(self, ruby_whisper_model, model);
  ruby_whisper *owner;
  Data_Get_Struct(model->context, ruby_whisper, owner);

  const int n_mels = whisper_model_n_mels(owner->context);
  return INT2NUM(n_mels);
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ftype -> Integer
|
|
*/
|
|
static VALUE ruby_whisper_c_model_ftype(VALUE self) {
  // A Model only references its Context; resolve that first.
  ruby_whisper_model *model;
  Data_Get_Struct(self, ruby_whisper_model, model);
  ruby_whisper *owner;
  Data_Get_Struct(model->context, ruby_whisper, owner);

  // The ftype is exposed as the raw Integer reported by whisper.
  const int ftype = whisper_model_ftype(owner->context);
  return INT2NUM(ftype);
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* type -> String
|
|
*/
|
|
static VALUE ruby_whisper_c_model_type(VALUE self) {
  // A Model only references its Context; resolve that first.
  ruby_whisper_model *model;
  Data_Get_Struct(self, ruby_whisper_model, model);
  ruby_whisper *owner;
  Data_Get_Struct(model->context, ruby_whisper, owner);

  // Copy the readable type name into a new Ruby String.
  const char *type_name = whisper_model_type_readable(owner->context);
  return rb_str_new2(type_name);
}
|
|
|
|
// Whisper::Error#initialize(code): maps an Integer status code to a
// human-readable message, passes that message to the superclass
// initializer and stores the original code in @code.
// NOTE(review): the codes are presumably the return values of whisper_full();
// confirm against whisper.cpp when adding new ones.
static VALUE ruby_whisper_error_initialize(VALUE self, VALUE code) {
  const int c_code = NUM2INT(code);
  // String literals are const; a plain `char *` is ill-formed in C++11.
  const char *raw_message;
  switch (c_code) {
  case -2:
    raw_message = "failed to compute log mel spectrogram";
    break;
  case -3:
    raw_message = "failed to auto-detect language";
    break;
  case -4:
    raw_message = "too many decoders requested";
    break;
  case -5:
    raw_message = "audio_ctx is larger than the maximum allowed";
    break;
  case -6:
    raw_message = "failed to encode";
    break;
  case -7:
    raw_message = "whisper_kv_cache_init() failed for self-attention cache";
    break;
  case -8:
  case -9:
    // Both codes share the same message.
    raw_message = "failed to decode";
    break;
  default:
    raw_message = "unknown error";
    break;
  }
  const VALUE message = rb_str_new2(raw_message);
  rb_call_super(1, &message);
  rb_iv_set(self, "@code", code);

  return self;
}
|
|
|
|
|
|
void Init_whisper() {
|
|
id_to_s = rb_intern("to_s");
|
|
id_call = rb_intern("call");
|
|
id___method__ = rb_intern("__method__");
|
|
id_to_enum = rb_intern("to_enum");
|
|
id_length = rb_intern("length");
|
|
id_next = rb_intern("next");
|
|
id_new = rb_intern("new");
|
|
|
|
mWhisper = rb_define_module("Whisper");
|
|
cContext = rb_define_class_under(mWhisper, "Context", rb_cObject);
|
|
cParams = rb_define_class_under(mWhisper, "Params", rb_cObject);
|
|
eError = rb_define_class_under(mWhisper, "Error", rb_eStandardError);
|
|
|
|
rb_define_const(mWhisper, "LOG_LEVEL_NONE", INT2NUM(GGML_LOG_LEVEL_NONE));
|
|
rb_define_const(mWhisper, "LOG_LEVEL_INFO", INT2NUM(GGML_LOG_LEVEL_INFO));
|
|
rb_define_const(mWhisper, "LOG_LEVEL_WARN", INT2NUM(GGML_LOG_LEVEL_WARN));
|
|
rb_define_const(mWhisper, "LOG_LEVEL_ERROR", INT2NUM(GGML_LOG_LEVEL_ERROR));
|
|
rb_define_const(mWhisper, "LOG_LEVEL_DEBUG", INT2NUM(GGML_LOG_LEVEL_DEBUG));
|
|
rb_define_const(mWhisper, "LOG_LEVEL_CONT", INT2NUM(GGML_LOG_LEVEL_CONT));
|
|
|
|
rb_define_singleton_method(mWhisper, "lang_max_id", ruby_whisper_s_lang_max_id, 0);
|
|
rb_define_singleton_method(mWhisper, "lang_id", ruby_whisper_s_lang_id, 1);
|
|
rb_define_singleton_method(mWhisper, "lang_str", ruby_whisper_s_lang_str, 1);
|
|
rb_define_singleton_method(mWhisper, "lang_str_full", ruby_whisper_s_lang_str_full, 1);
|
|
rb_define_singleton_method(mWhisper, "log_set", ruby_whisper_s_log_set, 2);
|
|
rb_define_singleton_method(mWhisper, "finalize_log_callback", ruby_whisper_s_finalize_log_callback, 1);
|
|
|
|
rb_define_alloc_func(cContext, ruby_whisper_allocate);
|
|
rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1);
|
|
|
|
rb_define_method(cContext, "transcribe", ruby_whisper_transcribe, -1);
|
|
rb_define_method(cContext, "model_n_vocab", ruby_whisper_model_n_vocab, 0);
|
|
rb_define_method(cContext, "model_n_audio_ctx", ruby_whisper_model_n_audio_ctx, 0);
|
|
rb_define_method(cContext, "model_n_audio_state", ruby_whisper_model_n_audio_state, 0);
|
|
rb_define_method(cContext, "model_n_audio_head", ruby_whisper_model_n_audio_head, 0);
|
|
rb_define_method(cContext, "model_n_audio_layer", ruby_whisper_model_n_audio_layer, 0);
|
|
rb_define_method(cContext, "model_n_text_ctx", ruby_whisper_model_n_text_ctx, 0);
|
|
rb_define_method(cContext, "model_n_text_state", ruby_whisper_model_n_text_state, 0);
|
|
rb_define_method(cContext, "model_n_text_head", ruby_whisper_model_n_text_head, 0);
|
|
rb_define_method(cContext, "model_n_text_layer", ruby_whisper_model_n_text_layer, 0);
|
|
rb_define_method(cContext, "model_n_mels", ruby_whisper_model_n_mels, 0);
|
|
rb_define_method(cContext, "model_ftype", ruby_whisper_model_ftype, 0);
|
|
rb_define_method(cContext, "model_type", ruby_whisper_model_type, 0);
|
|
rb_define_method(cContext, "full_n_segments", ruby_whisper_full_n_segments, 0);
|
|
rb_define_method(cContext, "full_lang_id", ruby_whisper_full_lang_id, 0);
|
|
rb_define_method(cContext, "full_get_segment_t0", ruby_whisper_full_get_segment_t0, 1);
|
|
rb_define_method(cContext, "full_get_segment_t1", ruby_whisper_full_get_segment_t1, 1);
|
|
rb_define_method(cContext, "full_get_segment_speaker_turn_next", ruby_whisper_full_get_segment_speaker_turn_next, 1);
|
|
rb_define_method(cContext, "full_get_segment_text", ruby_whisper_full_get_segment_text, 1);
|
|
rb_define_method(cContext, "full", ruby_whisper_full, -1);
|
|
rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1);
|
|
|
|
rb_define_alloc_func(cParams, ruby_whisper_params_allocate);
|
|
|
|
rb_define_method(cParams, "language=", ruby_whisper_params_set_language, 1);
|
|
rb_define_method(cParams, "language", ruby_whisper_params_get_language, 0);
|
|
rb_define_method(cParams, "translate=", ruby_whisper_params_set_translate, 1);
|
|
rb_define_method(cParams, "translate", ruby_whisper_params_get_translate, 0);
|
|
rb_define_method(cParams, "no_context=", ruby_whisper_params_set_no_context, 1);
|
|
rb_define_method(cParams, "no_context", ruby_whisper_params_get_no_context, 0);
|
|
rb_define_method(cParams, "single_segment=", ruby_whisper_params_set_single_segment, 1);
|
|
rb_define_method(cParams, "single_segment", ruby_whisper_params_get_single_segment, 0);
|
|
rb_define_method(cParams, "print_special", ruby_whisper_params_get_print_special, 0);
|
|
rb_define_method(cParams, "print_special=", ruby_whisper_params_set_print_special, 1);
|
|
rb_define_method(cParams, "print_progress", ruby_whisper_params_get_print_progress, 0);
|
|
rb_define_method(cParams, "print_progress=", ruby_whisper_params_set_print_progress, 1);
|
|
rb_define_method(cParams, "print_realtime", ruby_whisper_params_get_print_realtime, 0);
|
|
rb_define_method(cParams, "print_realtime=", ruby_whisper_params_set_print_realtime, 1);
|
|
rb_define_method(cParams, "print_timestamps", ruby_whisper_params_get_print_timestamps, 0);
|
|
rb_define_method(cParams, "print_timestamps=", ruby_whisper_params_set_print_timestamps, 1);
|
|
rb_define_method(cParams, "suppress_blank", ruby_whisper_params_get_suppress_blank, 0);
|
|
rb_define_method(cParams, "suppress_blank=", ruby_whisper_params_set_suppress_blank, 1);
|
|
rb_define_method(cParams, "suppress_non_speech_tokens", ruby_whisper_params_get_suppress_non_speech_tokens, 0);
|
|
rb_define_method(cParams, "suppress_non_speech_tokens=", ruby_whisper_params_set_suppress_non_speech_tokens, 1);
|
|
rb_define_method(cParams, "token_timestamps", ruby_whisper_params_get_token_timestamps, 0);
|
|
rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1);
|
|
rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0);
|
|
rb_define_method(cParams, "split_on_word=", ruby_whisper_params_set_split_on_word, 1);
|
|
rb_define_method(cParams, "initial_prompt", ruby_whisper_params_get_initial_prompt, 0);
|
|
rb_define_method(cParams, "initial_prompt=", ruby_whisper_params_set_initial_prompt, 1);
|
|
rb_define_method(cParams, "diarize", ruby_whisper_params_get_diarize, 0);
|
|
rb_define_method(cParams, "diarize=", ruby_whisper_params_set_diarize, 1);
|
|
|
|
rb_define_method(cParams, "offset", ruby_whisper_params_get_offset, 0);
|
|
rb_define_method(cParams, "offset=", ruby_whisper_params_set_offset, 1);
|
|
rb_define_method(cParams, "duration", ruby_whisper_params_get_duration, 0);
|
|
rb_define_method(cParams, "duration=", ruby_whisper_params_set_duration, 1);
|
|
|
|
rb_define_method(cParams, "max_text_tokens", ruby_whisper_params_get_max_text_tokens, 0);
|
|
rb_define_method(cParams, "max_text_tokens=", ruby_whisper_params_set_max_text_tokens, 1);
|
|
rb_define_method(cParams, "temperature", ruby_whisper_params_get_temperature, 0);
|
|
rb_define_method(cParams, "temperature=", ruby_whisper_params_set_temperature, 1);
|
|
rb_define_method(cParams, "max_initial_ts", ruby_whisper_params_get_max_initial_ts, 0);
|
|
rb_define_method(cParams, "max_initial_ts=", ruby_whisper_params_set_max_initial_ts, 1);
|
|
rb_define_method(cParams, "length_penalty", ruby_whisper_params_get_length_penalty, 0);
|
|
rb_define_method(cParams, "length_penalty=", ruby_whisper_params_set_length_penalty, 1);
|
|
rb_define_method(cParams, "temperature_inc", ruby_whisper_params_get_temperature_inc, 0);
|
|
rb_define_method(cParams, "temperature_inc=", ruby_whisper_params_set_temperature_inc, 1);
|
|
rb_define_method(cParams, "entropy_thold", ruby_whisper_params_get_entropy_thold, 0);
|
|
rb_define_method(cParams, "entropy_thold=", ruby_whisper_params_set_entropy_thold, 1);
|
|
rb_define_method(cParams, "logprob_thold", ruby_whisper_params_get_logprob_thold, 0);
|
|
rb_define_method(cParams, "logprob_thold=", ruby_whisper_params_set_logprob_thold, 1);
|
|
|
|
rb_define_method(cParams, "new_segment_callback=", ruby_whisper_params_set_new_segment_callback, 1);
|
|
rb_define_method(cParams, "new_segment_callback_user_data=", ruby_whisper_params_set_new_segment_callback_user_data, 1);
|
|
rb_define_method(cParams, "progress_callback=", ruby_whisper_params_set_progress_callback, 1);
|
|
rb_define_method(cParams, "progress_callback_user_data=", ruby_whisper_params_set_progress_callback_user_data, 1);
|
|
rb_define_method(cParams, "abort_callback=", ruby_whisper_params_set_abort_callback, 1);
|
|
rb_define_method(cParams, "abort_callback_user_data=", ruby_whisper_params_set_abort_callback_user_data, 1);
|
|
|
|
rb_define_attr(eError, "code", true, false);
|
|
rb_define_method(eError, "initialize", ruby_whisper_error_initialize, 1);
|
|
|
|
// High level
|
|
cSegment = rb_define_class_under(mWhisper, "Segment", rb_cObject);
|
|
|
|
rb_define_alloc_func(cSegment, ruby_whisper_segment_allocate);
|
|
rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0);
|
|
rb_define_method(cParams, "on_new_segment", ruby_whisper_params_on_new_segment, 0);
|
|
rb_define_method(cParams, "on_progress", ruby_whisper_params_on_progress, 0);
|
|
rb_define_method(cParams, "abort_on", ruby_whisper_params_abort_on, 0);
|
|
rb_define_method(cSegment, "start_time", ruby_whisper_segment_get_start_time, 0);
|
|
rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
|
|
rb_define_method(cSegment, "speaker_next_turn?", ruby_whisper_segment_get_speaker_turn_next, 0);
|
|
rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
|
|
|
|
cModel = rb_define_class_under(mWhisper, "Model", rb_cObject);
|
|
rb_define_alloc_func(cModel, ruby_whisper_model_allocate);
|
|
rb_define_method(cContext, "model", ruby_whisper_get_model, 0);
|
|
rb_define_method(cModel, "n_vocab", ruby_whisper_c_model_n_vocab, 0);
|
|
rb_define_method(cModel, "n_audio_ctx", ruby_whisper_c_model_n_audio_ctx, 0);
|
|
rb_define_method(cModel, "n_audio_state", ruby_whisper_c_model_n_audio_state, 0);
|
|
rb_define_method(cModel, "n_audio_head", ruby_whisper_c_model_n_audio_head, 0);
|
|
rb_define_method(cModel, "n_audio_layer", ruby_whisper_c_model_n_audio_layer, 0);
|
|
rb_define_method(cModel, "n_text_ctx", ruby_whisper_c_model_n_text_ctx, 0);
|
|
rb_define_method(cModel, "n_text_state", ruby_whisper_c_model_n_text_state, 0);
|
|
rb_define_method(cModel, "n_text_head", ruby_whisper_c_model_n_text_head, 0);
|
|
rb_define_method(cModel, "n_text_layer", ruby_whisper_c_model_n_text_layer, 0);
|
|
rb_define_method(cModel, "n_mels", ruby_whisper_c_model_n_mels, 0);
|
|
rb_define_method(cModel, "ftype", ruby_whisper_c_model_ftype, 0);
|
|
rb_define_method(cModel, "type", ruby_whisper_c_model_type, 0);
|
|
}
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|