diff --git a/bindings/ruby/README.md b/bindings/ruby/README.md index 6de00fb2..5ba88e6f 100644 --- a/bindings/ruby/README.md +++ b/bindings/ruby/README.md @@ -70,17 +70,6 @@ end Some models are prepared up-front: -```ruby -base_en = Whisper::Model.pre_converted_models["base.en"] -whisper = Whisper::Context.new(base_en) -``` - -At first time you use a model, it is downloaded automatically. After that, downloaded cached file is used. To clear cache, call `#clear_cache`: - -```ruby -Whisper::Model.pre_converted_models["base"].clear_cache -``` - You also can use shorthand for pre-converted models: ```ruby @@ -105,6 +94,19 @@ puts Whisper::Model.pre_converted_models.keys # : ``` +You can also retrieve each model: + +```ruby +base_en = Whisper::Model.pre_converted_models["base.en"] +whisper = Whisper::Context.new(base_en) +``` + +The first time you use a model, it is downloaded automatically. After that, the downloaded cached file is used. To clear the cache, call `#clear_cache`: + +```ruby +Whisper::Model.pre_converted_models["base"].clear_cache +``` + You can also use local model files you prepared: ```ruby @@ -163,6 +165,16 @@ For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisp API --- +### Transcription ### + +By default, `Whisper::Context#transcribe` works in a single thread. You can make it work in parallel by passing the `n_processors` option: + +```ruby +whisper.transcribe("path/to/audio.wav", params, n_processors: Etc.nprocessors) +``` + +Note that transcription accuracy might occasionally be lower when it runs in parallel. + ### Segments ### Once `Whisper::Context#transcribe` called, you can retrieve segments by `#each_segment`: @@ -297,6 +309,11 @@ First call of `rake test` builds an extension and downloads a model for testing. If something seems wrong on build, running `rake clean` solves some cases.
+### Need help ### + +* Windows support +* Refinement of C/C++ code, especially memory management + License ------- diff --git a/bindings/ruby/Rakefile b/bindings/ruby/Rakefile index bc6f8433..08a2312a 100644 --- a/bindings/ruby/Rakefile +++ b/bindings/ruby/Rakefile @@ -67,17 +67,15 @@ file LIB_FILE => [SO_FILE, "lib"] do |t| end CLEAN.include LIB_FILE -Rake::TestTask.new do |t| - t.test_files = FileList["tests/test_*.rb"] -end +Rake::TestTask.new -TEST_MEMORY_VIEW = "tests/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}" -file TEST_MEMORY_VIEW => "tests/jfk_reader/jfk_reader.c" do |t| - chdir "tests/jfk_reader" do +TEST_MEMORY_VIEW = "test/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}" +file TEST_MEMORY_VIEW => "test/jfk_reader/jfk_reader.c" do |t| + chdir "test/jfk_reader" do ruby "extconf.rb" sh "make" end end -CLEAN.include "tests/jfk_reader/jfk_reader.{o,#{RbConfig::CONFIG['DLEXT']}}" +CLEAN.include "test/jfk_reader/jfk_reader.{o,#{RbConfig::CONFIG['DLEXT']}}" task test: [LIB_FILE, TEST_MEMORY_VIEW] diff --git a/bindings/ruby/ext/ruby_whisper.c b/bindings/ruby/ext/ruby_whisper.c index e88aa29c..a1c2c520 100644 --- a/bindings/ruby/ext/ruby_whisper.c +++ b/bindings/ruby/ext/ruby_whisper.c @@ -24,6 +24,7 @@ ID id_URI; ID id_pre_converted_models; ID id_coreml_compiled_models; ID id_cache; +ID id_n_processors; static bool is_log_callback_finalized = false; @@ -142,6 +143,7 @@ void Init_whisper() { id_pre_converted_models = rb_intern("pre_converted_models"); id_coreml_compiled_models = rb_intern("coreml_compiled_models"); id_cache = rb_intern("cache"); + id_n_processors = rb_intern("n_processors"); mWhisper = rb_define_module("Whisper"); mVAD = rb_define_module_under(mWhisper, "VAD"); diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index 75aa8dc9..cb58c8d4 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -13,6 +13,7 @@ extern ID id_URI; extern ID 
id_pre_converted_models; extern ID id_coreml_compiled_models; extern ID id_cache; +extern ID id_n_processors; extern VALUE cContext; extern VALUE eError; @@ -24,6 +25,8 @@ extern VALUE rb_whisper_model_s_new(VALUE context); extern VALUE rb_whisper_segment_s_new(VALUE context, int index); extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context); +ID transcribe_option_names[1]; + static void ruby_whisper_free(ruby_whisper *rw) { @@ -633,6 +636,8 @@ init_ruby_whisper_context(VALUE *mWhisper) { cContext = rb_define_class_under(*mWhisper, "Context", rb_cObject); + transcribe_option_names[0] = id_n_processors; + rb_define_alloc_func(cContext, ruby_whisper_allocate); rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1); diff --git a/bindings/ruby/ext/ruby_whisper_transcribe.cpp b/bindings/ruby/ext/ruby_whisper_transcribe.cpp index d12d2de9..71c4b49b 100644 --- a/bindings/ruby/ext/ruby_whisper_transcribe.cpp +++ b/bindings/ruby/ext/ruby_whisper_transcribe.cpp @@ -13,6 +13,7 @@ extern const rb_data_type_t ruby_whisper_params_type; extern ID id_to_s; extern ID id_call; +extern ID transcribe_option_names[1]; extern void prepare_transcription(ruby_whisper_params * rwp, VALUE * self); @@ -34,9 +35,14 @@ VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) { ruby_whisper *rw; ruby_whisper_params *rwp; - VALUE wave_file_path, blk, params; + VALUE wave_file_path, blk, params, kws; + VALUE opts[1]; + + rb_scan_args_kw(RB_SCAN_ARGS_LAST_HASH_KEYWORDS, argc, argv, "2:&", &wave_file_path, &params, &kws, &blk); + rb_get_kwargs(kws, transcribe_option_names, 0, 1, opts); + + int n_processors = opts[0] == Qundef ?
1 : NUM2INT(opts[0]); - rb_scan_args(argc, argv, "02&", &wave_file_path, &params, &blk); TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw); TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp); @@ -66,7 +72,7 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) { prepare_transcription(rwp, &self); - if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) { + if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), n_processors) != 0) { fprintf(stderr, "failed to process audio\n"); return self; } @@ -76,9 +82,8 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) { const char * text = whisper_full_get_segment_text(rw->context, i); output = rb_str_concat(output, rb_str_new2(text)); } - VALUE idCall = id_call; if (blk != Qnil) { - rb_funcall(blk, idCall, 1, output); + rb_funcall(blk, id_call, 1, output); } return self; } diff --git a/bindings/ruby/sig/whisper.rbs b/bindings/ruby/sig/whisper.rbs index 6f8be29a..f9d09631 100644 --- a/bindings/ruby/sig/whisper.rbs +++ b/bindings/ruby/sig/whisper.rbs @@ -25,19 +25,19 @@ module Whisper def self.system_info_str: () -> String class Context - def self.new: (path | ::URI::HTTP) -> instance + def self.new: (String | path | ::URI::HTTP) -> instance # transcribe a single file # can emit to a block results # - # params = Whisper::Params.new - # params.duration = 60_000 - # whisper.transcribe "path/to/audio.wav", params do |text| - # puts text - # end + # params = Whisper::Params.new + # params.duration = 60_000 + # whisper.transcribe "path/to/audio.wav", params do |text| + # puts text + # end # - def transcribe: (string, Params) -> self - | (string, Params) { (String) -> void } -> self + def transcribe: (string, Params, ?n_processors: Integer) -> self + | (string, Params, ?n_processors: Integer) { (String) -> void } -> self def model_n_vocab: () -> Integer def model_n_audio_ctx: () -> Integer @@ -50,16 +50,16 @@
module Whisper # Yields each Whisper::Segment: # - # whisper.transcribe("path/to/audio.wav", params) - # whisper.each_segment do |segment| - # puts segment.text - # end + # whisper.transcribe("path/to/audio.wav", params) + # whisper.each_segment do |segment| + # puts segment.text + # end # # Returns an Enumerator if no block given: # - # whisper.transcribe("path/to/audio.wav", params) - # enum = whisper.each_segment - # enum.to_a # => [#<Whisper::Segment>, ...] + # whisper.transcribe("path/to/audio.wav", params) + # enum = whisper.each_segment + # enum.to_a # => [#<Whisper::Segment>, ...] # def each_segment: { (Segment) -> void } -> void | () -> Enumerator[Segment] @@ -74,25 +74,25 @@ module Whisper # Start time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds). # - # full_get_segment_t0(3) # => 1668 (16680 ms) + # full_get_segment_t0(3) # => 1668 (16680 ms) # def full_get_segment_t0: (Integer) -> Integer # End time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds). # - # full_get_segment_t1(3) # => 1668 (16680 ms) + # full_get_segment_t1(3) # => 1668 (16680 ms) # def full_get_segment_t1: (Integer) -> Integer # Whether the next segment indexed by +segment_index+ is predicated as a speaker turn. # - # full_get_segment_speacker_turn_next(3) # => true + # full_get_segment_speaker_turn_next(3) # => true # def full_get_segment_speaker_turn_next: (Integer) -> (true | false) # Text of a segment indexed by +segment_index+. # - # full_get_segment_text(3) # => "ask not what your country can do for you, ..." + # full_get_segment_text(3) # => "ask not what your country can do for you, ..." # def full_get_segment_text: (Integer) -> String @@ -282,9 +282,9 @@ module Whisper # Sets new segment callback, called for every newly generated text segment. # - # params.new_segment_callback = ->(context, _, n_new, user_data) { - # # ... - # } + # params.new_segment_callback = ->(context, _, n_new, user_data) { + # # ...
+ # } # def new_segment_callback=: (new_segment_callback) -> new_segment_callback def new_segment_callback: () -> (new_segment_callback | nil) @@ -297,9 +297,9 @@ module Whisper # Sets progress callback, called on each progress update. # - # params.new_segment_callback = ->(context, _, progress, user_data) { - # # ... - # } + # params.progress_callback = ->(context, _, progress, user_data) { + # # ... + # } # # +progress+ is an Integer between 0 and 100. # @@ -327,9 +327,9 @@ module Whisper # Sets abort callback, called to check if the process should be aborted. # - # params.abort_callback = ->(user_data) { - # # ... - # } + # params.abort_callback = ->(user_data) { + # # ... + # } # # def abort_callback=: (abort_callback) -> abort_callback @@ -358,9 +358,9 @@ module Whisper # Hook called on new segment. Yields each Whisper::Segment. # - # whisper.on_new_segment do |segment| - # # ... - # end + # whisper.on_new_segment do |segment| + # # ... + # end # def on_new_segment: { (Segment) -> void } -> void @@ -374,13 +374,13 @@ module Whisper # Call block to determine whether abort or not. Return +true+ when you want to abort.
# - # params.abort_on do - # if some_condition - # true # abort - # else - # false # continue + # params.abort_on do + # if some_condition + # true # abort + # else + # false # continue + # end # end - # end # def abort_on: { (Object user_data) -> boolish } -> void end diff --git a/bindings/ruby/tests/helper.rb b/bindings/ruby/test/helper.rb similarity index 100% rename from bindings/ruby/tests/helper.rb rename to bindings/ruby/test/helper.rb diff --git a/bindings/ruby/tests/jfk_reader/.gitignore b/bindings/ruby/test/jfk_reader/.gitignore similarity index 100% rename from bindings/ruby/tests/jfk_reader/.gitignore rename to bindings/ruby/test/jfk_reader/.gitignore diff --git a/bindings/ruby/tests/jfk_reader/extconf.rb b/bindings/ruby/test/jfk_reader/extconf.rb similarity index 100% rename from bindings/ruby/tests/jfk_reader/extconf.rb rename to bindings/ruby/test/jfk_reader/extconf.rb diff --git a/bindings/ruby/tests/jfk_reader/jfk_reader.c b/bindings/ruby/test/jfk_reader/jfk_reader.c similarity index 100% rename from bindings/ruby/tests/jfk_reader/jfk_reader.c rename to bindings/ruby/test/jfk_reader/jfk_reader.c diff --git a/bindings/ruby/tests/test_callback.rb b/bindings/ruby/test/test_callback.rb similarity index 100% rename from bindings/ruby/tests/test_callback.rb rename to bindings/ruby/test/test_callback.rb diff --git a/bindings/ruby/tests/test_error.rb b/bindings/ruby/test/test_error.rb similarity index 100% rename from bindings/ruby/tests/test_error.rb rename to bindings/ruby/test/test_error.rb diff --git a/bindings/ruby/tests/test_model.rb b/bindings/ruby/test/test_model.rb similarity index 100% rename from bindings/ruby/tests/test_model.rb rename to bindings/ruby/test/test_model.rb diff --git a/bindings/ruby/tests/test_package.rb b/bindings/ruby/test/test_package.rb similarity index 100% rename from bindings/ruby/tests/test_package.rb rename to bindings/ruby/test/test_package.rb diff --git a/bindings/ruby/tests/test_params.rb 
b/bindings/ruby/test/test_params.rb similarity index 100% rename from bindings/ruby/tests/test_params.rb rename to bindings/ruby/test/test_params.rb diff --git a/bindings/ruby/tests/test_segment.rb b/bindings/ruby/test/test_segment.rb similarity index 100% rename from bindings/ruby/tests/test_segment.rb rename to bindings/ruby/test/test_segment.rb diff --git a/bindings/ruby/tests/test_vad.rb b/bindings/ruby/test/test_vad.rb similarity index 100% rename from bindings/ruby/tests/test_vad.rb rename to bindings/ruby/test/test_vad.rb diff --git a/bindings/ruby/tests/test_vad_params.rb b/bindings/ruby/test/test_vad_params.rb similarity index 100% rename from bindings/ruby/tests/test_vad_params.rb rename to bindings/ruby/test/test_vad_params.rb diff --git a/bindings/ruby/tests/test_whisper.rb b/bindings/ruby/test/test_whisper.rb similarity index 91% rename from bindings/ruby/tests/test_whisper.rb rename to bindings/ruby/test/test_whisper.rb index d915041f..8f1e69db 100644 --- a/bindings/ruby/tests/test_whisper.rb +++ b/bindings/ruby/test/test_whisper.rb @@ -20,6 +20,24 @@ class TestWhisper < TestBase } end + def test_transcribe_non_parallel + @whisper = Whisper::Context.new("base.en") + params = Whisper::Params.new + + @whisper.transcribe(AUDIO, params, n_processors: 1) {|text| + assert_match(/ask not what your country can do for you, ask what you can do for your country/, text) + } + end + + def test_transcribe_n_processors + @whisper = Whisper::Context.new("base.en") + params = Whisper::Params.new + + @whisper.transcribe(AUDIO, params, n_processors: 4) {|text| + assert_match(/ask not what your country can do for you[,.] 
ask what you can do for your country/i, text) + } + end + sub_test_case "After transcription" do def test_full_n_segments assert_equal 1, whisper.full_n_segments diff --git a/bindings/ruby/whispercpp.gemspec b/bindings/ruby/whispercpp.gemspec index 06bef943..b838aa9f 100644 --- a/bindings/ruby/whispercpp.gemspec +++ b/bindings/ruby/whispercpp.gemspec @@ -4,7 +4,7 @@ Gem::Specification.new do |s| s.name = "whispercpp" s.authors = ["Georgi Gerganov", "Todd A. Fisher"] s.version = '1.3.3' - s.date = '2025-06-01' + s.date = '2025-06-03' s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby} s.email = 'todd.fisher@gmail.com' s.extra_rdoc_files = ['LICENSE', 'README.md'] @@ -21,7 +21,7 @@ Gem::Specification.new do |s| } s.summary = %q{Ruby whisper.cpp bindings} - s.test_files = s.files.select {|file| file.start_with? "tests/"} + s.test_files = s.files.select {|file| file.start_with? "test/"} s.extensions << 'ext/extconf.rb' s.required_ruby_version = '>= 3.1.0'