From fbead675497cd9385e2f3953d5d7174e09e8ca91 Mon Sep 17 00:00:00 2001 From: KITAITI Makoto Date: Tue, 10 Jun 2025 13:10:17 +0900 Subject: [PATCH] ruby : output format (#3237) * Fix a typo * Don't allocate output string unless needed * Add methods to output SRT and WebVTT * Add tests for output methods * Make constants for output private * Add signatures for output methods * Add document on output methods * Fix method name: Segment#speaker_next_turn? -> #speaker_turn_next? * Add Whisper::Segment#deconstruct_keys * Add test for Whisper::Segment#deconstruct_keys * Add signature of Whisper::Segment#deconstruct_keys * Use parentheses to suppress warning * Update date --- bindings/ruby/README.md | 30 ++++++- bindings/ruby/ext/ruby_whisper.c | 2 + bindings/ruby/ext/ruby_whisper_context.c | 2 +- bindings/ruby/ext/ruby_whisper_segment.c | 79 ++++++++++++++++++- bindings/ruby/ext/ruby_whisper_transcribe.cpp | 7 +- bindings/ruby/lib/whisper/context.rb | 15 ++++ bindings/ruby/lib/whisper/segment.rb | 58 ++++++++++++++ bindings/ruby/sig/whisper.rbs | 24 +++++- bindings/ruby/test/test_segment.rb | 62 +++++++++++++++ bindings/ruby/test/test_whisper.rb | 46 ++++++++++- bindings/ruby/whispercpp.gemspec | 2 +- 11 files changed, 317 insertions(+), 10 deletions(-) create mode 100644 bindings/ruby/lib/whisper/context.rb create mode 100644 bindings/ruby/lib/whisper/segment.rb diff --git a/bindings/ruby/README.md b/bindings/ruby/README.md index 5ba88e6f..fff6efc7 100644 --- a/bindings/ruby/README.md +++ b/bindings/ruby/README.md @@ -162,6 +162,32 @@ Whisper::Params.new( For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#voice-activity-detection-vad). +### Output ### + +whispercpp supports SRT and WebVTT output: + +```ruby +puts whisper.transcribe("path/to/audio.wav", Whisper::Params.new).to_webvtt +# => +WEBVTT + +1 +00:00:00.000 --> 00:00:03.860 + My thought I have nobody by a beauty and will as you poured. 
+ +2 +00:00:03.860 --> 00:00:09.840 + Mr. Rochester is sub in that so-don't find simplest, and devoted about, to let might in + +3 +00:00:09.840 --> 00:00:09.940 + a + +``` + +You may call `#to_srt`, too + + API --- @@ -196,7 +222,7 @@ whisper ed: format_time(segment.end_time), text: segment.text } - line << " (speaker turned)" if segment.speaker_next_turn? + line << " (speaker turned)" if segment.speaker_turn_next? puts line end @@ -212,7 +238,7 @@ params.on_new_segment do |segment| ed: format_time(segment.end_time), text: segment.text } - line << " (speaker turned)" if segment.speaker_next_turn? + line << " (speaker turned)" if segment.speaker_turn_next? puts line end diff --git a/bindings/ruby/ext/ruby_whisper.c b/bindings/ruby/ext/ruby_whisper.c index a1c2c520..35c196ab 100644 --- a/bindings/ruby/ext/ruby_whisper.c +++ b/bindings/ruby/ext/ruby_whisper.c @@ -170,5 +170,7 @@ void Init_whisper() { init_ruby_whisper_model(&mWhisper); init_ruby_whisper_vad_params(&mVAD); + rb_require("whisper/context"); + rb_require("whisper/segment"); rb_require("whisper/model/uri"); } diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c index cb58c8d4..bc0c6e99 100644 --- a/bindings/ruby/ext/ruby_whisper_context.c +++ b/bindings/ruby/ext/ruby_whisper_context.c @@ -664,7 +664,7 @@ init_ruby_whisper_context(VALUE *mWhisper) rb_define_method(cContext, "full", ruby_whisper_full, -1); rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1); - // High leve + // High level rb_define_method(cContext, "full_get_segment", ruby_whisper_full_get_segment, 1); rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0); diff --git a/bindings/ruby/ext/ruby_whisper_segment.c b/bindings/ruby/ext/ruby_whisper_segment.c index ce54a52d..a303187c 100644 --- a/bindings/ruby/ext/ruby_whisper_segment.c +++ b/bindings/ruby/ext/ruby_whisper_segment.c @@ -1,6 +1,15 @@ #include <ruby.h> #include "ruby_whisper.h" +#define N_KEY_NAMES 5 
+static VALUE sym_start_time; +static VALUE sym_end_time; +static VALUE sym_text; +static VALUE sym_no_speech_prob; +static VALUE sym_speaker_turn_next; +static VALUE key_names; + extern const rb_data_type_t ruby_whisper_type; extern VALUE cSegment; @@ -129,15 +138,83 @@ ruby_whisper_segment_get_no_speech_prob(VALUE self) return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index)); } +/* + * call-seq: + * deconstruct_keys(keys) -> hash + * + * Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next + * + * whisper.each_segment do |segment| + * segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:} + * + * puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})" + * end + */ +static VALUE +ruby_whisper_segment_deconstruct_keys(VALUE self, VALUE keys) +{ + ruby_whisper_segment *rws; + TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws); + ruby_whisper *rw; + TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw); + + VALUE hash = rb_hash_new(); + long n_keys; + if (NIL_P(keys)) { + keys = key_names; + n_keys = N_KEY_NAMES; + } else { + n_keys = RARRAY_LEN(keys); + if (n_keys > N_KEY_NAMES) { + return hash; + } + } + for (int i = 0; i < n_keys; i++) { + VALUE key = rb_ary_entry(keys, i); + if (key == sym_start_time) { + rb_hash_aset(hash, key, ruby_whisper_segment_get_start_time(self)); + } + if (key == sym_end_time) { + rb_hash_aset(hash, key, ruby_whisper_segment_get_end_time(self)); + } + if (key == sym_text) { + rb_hash_aset(hash, key, ruby_whisper_segment_get_text(self)); + } + if (key == sym_no_speech_prob) { + rb_hash_aset(hash, key, ruby_whisper_segment_get_no_speech_prob(self)); + } + if (key == sym_speaker_turn_next) { + rb_hash_aset(hash, key, ruby_whisper_segment_get_speaker_turn_next(self)); + } + } + + return hash; +} + void init_ruby_whisper_segment(VALUE 
*mWhisper, VALUE *cContext) { cSegment = rb_define_class_under(*mWhisper, "Segment", rb_cObject); + sym_start_time = ID2SYM(rb_intern("start_time")); + sym_end_time = ID2SYM(rb_intern("end_time")); + sym_text = ID2SYM(rb_intern("text")); + sym_no_speech_prob = ID2SYM(rb_intern("no_speech_prob")); + sym_speaker_turn_next = ID2SYM(rb_intern("speaker_turn_next")); + key_names = rb_ary_new3( + N_KEY_NAMES, + sym_start_time, + sym_end_time, + sym_text, + sym_no_speech_prob, + sym_speaker_turn_next + ); + rb_define_alloc_func(cSegment, ruby_whisper_segment_allocate); rb_define_method(cSegment, "start_time", ruby_whisper_segment_get_start_time, 0); rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0); - rb_define_method(cSegment, "speaker_next_turn?", ruby_whisper_segment_get_speaker_turn_next, 0); + rb_define_method(cSegment, "speaker_turn_next?", ruby_whisper_segment_get_speaker_turn_next, 0); rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0); rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0); + rb_define_method(cSegment, "deconstruct_keys", ruby_whisper_segment_deconstruct_keys, 1); } diff --git a/bindings/ruby/ext/ruby_whisper_transcribe.cpp b/bindings/ruby/ext/ruby_whisper_transcribe.cpp index 71c4b49b..dc64af00 100644 --- a/bindings/ruby/ext/ruby_whisper_transcribe.cpp +++ b/bindings/ruby/ext/ruby_whisper_transcribe.cpp @@ -76,15 +76,16 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) { fprintf(stderr, "failed to process audio\n"); return self; } + if (NIL_P(blk)) { + return self; + } const int n_segments = whisper_full_n_segments(rw->context); VALUE output = rb_str_new2(""); for (int i = 0; i < n_segments; ++i) { const char * text = whisper_full_get_segment_text(rw->context, i); output = rb_str_concat(output, rb_str_new2(text)); } - if (blk != Qnil) { - rb_funcall(blk, id_call, 1, output); - } + rb_funcall(blk, id_call, 1, output); return self; } #ifdef __cplusplus diff 
--git a/bindings/ruby/lib/whisper/context.rb b/bindings/ruby/lib/whisper/context.rb new file mode 100644 index 00000000..c3a134b7 --- /dev/null +++ b/bindings/ruby/lib/whisper/context.rb @@ -0,0 +1,15 @@ +module Whisper + class Context + def to_srt + each_segment.with_index.reduce("") {|srt, (segment, index)| + srt << "#{index + 1}\n#{segment.to_srt_cue}\n" + } + end + + def to_webvtt + each_segment.with_index.reduce("WEBVTT\n\n") {|webvtt, (segment, index)| + webvtt << "#{index + 1}\n#{segment.to_webvtt_cue}\n" + } + end + end +end diff --git a/bindings/ruby/lib/whisper/segment.rb b/bindings/ruby/lib/whisper/segment.rb new file mode 100644 index 00000000..dc187dca --- /dev/null +++ b/bindings/ruby/lib/whisper/segment.rb @@ -0,0 +1,58 @@ +module Whisper + class Segment + SRT_ESCAPES = { + "&" => "&amp;", + "<" => "&lt;", + ">" => "&gt;", + } + SRT_ESCAPES_RE = Regexp.union(SRT_ESCAPES.keys) + private_constant :SRT_ESCAPES, :SRT_ESCAPES_RE + + def to_srt_cue + "#{srt_start_time} --> #{srt_end_time}\n#{srt_text}\n" + end + + def to_webvtt_cue + "#{webvtt_start_time} --> #{webvtt_end_time}\n#{webvtt_text}\n" + end + + private + + def time_to_a(time) + sec, decimal_part = time.divmod(1000) + min, sec = sec.divmod(60) + hour, min = min.divmod(60) + [hour, min, sec, decimal_part] + end + + def srt_time(time) + "%02d:%02d:%02d,%03d" % time_to_a(time) + end + + def srt_start_time + srt_time(start_time) + end + + def srt_end_time + srt_time(end_time) + end + + def srt_text + text.gsub(SRT_ESCAPES_RE, SRT_ESCAPES) + end + + def webvtt_time(time) + "%02d:%02d:%02d.%03d" % time_to_a(time) + end + + def webvtt_start_time + webvtt_time(start_time) + end + + def webvtt_end_time + webvtt_time(end_time) + end + + alias webvtt_text srt_text + end +end diff --git a/bindings/ruby/sig/whisper.rbs b/bindings/ruby/sig/whisper.rbs index f9d09631..c73e6ad6 100644 --- a/bindings/ruby/sig/whisper.rbs +++ b/bindings/ruby/sig/whisper.rbs @@ -116,6 +116,9 @@ module Whisper def full_parallel: (Params, 
Array[Float], ?Integer n_samples) -> self | (Params, _Samples, ?Integer n_samples) -> self | (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self + + def to_srt: () -> String + def to_webvtt: () -> String end class Params @@ -415,6 +418,14 @@ module Whisper end class Segment + type deconstructed_keys = { + start_time: (Integer | nil), + end_time: (Integer | nil), + text: (String | nil), + no_speech_prob: (Float | nil), + speaker_turn_next: (true | false | nil) + } + # Start time in milliseconds. # def start_time: () -> Integer @@ -424,10 +435,21 @@ module Whisper def end_time: () -> Integer # Whether the next segment is predicted as a speaker turn. - def speaker_next_turn?: () -> (true | false) + def speaker_turn_next?: () -> (true | false) def text: () -> String def no_speech_prob: () -> Float + def to_srt_cue: () -> String + def to_webvtt_cue: () -> String + + # Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next + # + # whisper.each_segment do |segment| + # segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:} + # + # puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? 
', speaker turns next' : ''})" + # end + def deconstruct_keys: (Array[:start_time | :end_time | :text | :no_speech_prob | :speaker_turn_next] | nil) -> deconstructed_keys end module VAD diff --git a/bindings/ruby/test/test_segment.rb b/bindings/ruby/test/test_segment.rb index e8b99870..5b63e0c4 100644 --- a/bindings/ruby/test/test_segment.rb +++ b/bindings/ruby/test/test_segment.rb @@ -71,4 +71,66 @@ class TestSegment < TestBase end whisper.transcribe(AUDIO, params) end + + def test_pattern_matching + segment = whisper.each_segment.first + segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:} + + assert_equal segment.start_time, start_time + assert_equal segment.end_time, end_time + assert_equal segment.text, text + assert_equal segment.no_speech_prob, no_speech_prob + assert_equal segment.speaker_turn_next?, speaker_turn_next + end + + def test_pattern_matching_partial + segment = whisper.each_segment.first + segment => {start_time:, end_time:, text:} + + assert_equal segment.start_time, start_time + assert_equal segment.end_time, end_time + assert_equal segment.text, text + end + + def test_deconstruct_keys + segment = whisper.each_segment.first + expected = { + start_time: segment.start_time, + end_time: segment.end_time, + text: segment.text, + no_speech_prob: segment.no_speech_prob, + speaker_turn_next: segment.speaker_turn_next? 
+ } + assert_equal expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next]) + end + + def test_deconstruct_keys_non_existent + omit "Undefined behavior" + + segment = whisper.each_segment.first + + assert_equal({}, segment.deconstruct_keys([:non_existent])) + end + + def test_deconstruct_keys_too_many_keys + omit "Undefined behavior" + + segment = whisper.each_segment.first + + assert_equal({}, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next, :extra_key])) + end + + def test_deconstruct_keys_includes_non_existent_keys_not_too_many + omit "Undefined behavior" + + segment = whisper.each_segment.first + + expected = { + start_time: segment.start_time, + end_time: segment.end_time, + text: segment.text, + no_speech_prob: segment.no_speech_prob + } + assert_equal(expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :non_existent])) + end end diff --git a/bindings/ruby/test/test_whisper.rb b/bindings/ruby/test/test_whisper.rb index 8f1e69db..e429c543 100644 --- a/bindings/ruby/test/test_whisper.rb +++ b/bindings/ruby/test/test_whisper.rb @@ -113,7 +113,7 @@ class TestWhisper < TestBase end def test_system_info_str - assert_match /\AWHISPER : COREML = \d | OPENVINO = \d |/, Whisper.system_info_str + assert_match(/\AWHISPER : COREML = \d | OPENVINO = \d |/, Whisper.system_info_str) end def test_log_set @@ -245,4 +245,48 @@ class TestWhisper < TestBase assert_match(/for your country/i, text) end end + + def test_to_srt + whisper = Whisper::Context.new("base.en") + whisper.transcribe AUDIO, @params + + lines = whisper.to_srt.lines + assert_match(/\A\d+\n/, lines[0]) + assert_match(/\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n/, lines[1]) + assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[2]) + end + + def test_to_webvtt + whisper = Whisper::Context.new("base.en") + whisper.transcribe AUDIO, 
@params + + lines = whisper.to_webvtt.lines + assert_equal "WEBVTT\n", lines[0] + assert_equal "\n", lines[1] + assert_match(/\A\d+\n/, lines[2]) + assert_match(/\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}\n/, lines[3]) + assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[4]) + end + + sub_test_case "Format needs escape" do + def setup + @whisper = Whisper::Context.new("base.en") + @whisper.transcribe AUDIO, Whisper::Params.new + segment = @whisper.each_segment.first + segment.define_singleton_method :text do + "& so my fellow Americans --> ask not what your country can do for you <-- ask what you can do for your country." + end + @whisper.define_singleton_method :each_segment do + Enumerator.new(3) {|yielder| 3.times {yielder << segment}} + end + end + + def test_to_srt_escape + assert_equal "&amp; so my fellow Americans --&gt; ask not what your country can do for you &lt;-- ask what you can do for your country.\n", @whisper.to_srt.lines[2] + end + + def test_to_webvtt_escape + assert_equal "&amp; so my fellow Americans --&gt; ask not what your country can do for you &lt;-- ask what you can do for your country.\n", @whisper.to_webvtt.lines[4] + end + end end diff --git a/bindings/ruby/whispercpp.gemspec b/bindings/ruby/whispercpp.gemspec index b838aa9f..0a2a0c5f 100644 --- a/bindings/ruby/whispercpp.gemspec +++ b/bindings/ruby/whispercpp.gemspec @@ -4,7 +4,7 @@ Gem::Specification.new do |s| s.name = "whispercpp" s.authors = ["Georgi Gerganov", "Todd A. Fisher"] s.version = '1.3.3' - s.date = '2025-06-03' + s.date = '2025-06-10' s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby} s.email = 'todd.fisher@gmail.com' s.extra_rdoc_files = ['LICENSE', 'README.md']