From fbead675497cd9385e2f3953d5d7174e09e8ca91 Mon Sep 17 00:00:00 2001
From: KITAITI Makoto <KitaitiMakoto@gmail.com>
Date: Tue, 10 Jun 2025 13:10:17 +0900
Subject: [PATCH] ruby : output format (#3237)

* Fix a typo

* Don't allocate output string unless needed

* Add methods to output SRT and WebVTT

* Add tests for output methods

* Make constants for output private

* Add signatures for output methods

* Add document on output methods

* Fix method name: Segment#speaker_next_turn? -> #speaker_turn_next?

* Add Whisper::Segment#deconstruct_keys

* Add test for Whisper::Segment#deconstruct_keys

* Add signature of Whisper::Segment#deconstruct_keys

* Use parentheses to suppress warning

* Update date
---
 bindings/ruby/README.md                       | 30 ++++++-
 bindings/ruby/ext/ruby_whisper.c              |  2 +
 bindings/ruby/ext/ruby_whisper_context.c      |  2 +-
 bindings/ruby/ext/ruby_whisper_segment.c      | 79 ++++++++++++++++++-
 bindings/ruby/ext/ruby_whisper_transcribe.cpp |  7 +-
 bindings/ruby/lib/whisper/context.rb          | 15 ++++
 bindings/ruby/lib/whisper/segment.rb          | 58 ++++++++++++++
 bindings/ruby/sig/whisper.rbs                 | 24 +++++-
 bindings/ruby/test/test_segment.rb            | 62 +++++++++++++++
 bindings/ruby/test/test_whisper.rb            | 46 ++++++++++-
 bindings/ruby/whispercpp.gemspec              |  2 +-
 11 files changed, 317 insertions(+), 10 deletions(-)
 create mode 100644 bindings/ruby/lib/whisper/context.rb
 create mode 100644 bindings/ruby/lib/whisper/segment.rb

diff --git a/bindings/ruby/README.md b/bindings/ruby/README.md
index 5ba88e6f..fff6efc7 100644
--- a/bindings/ruby/README.md
+++ b/bindings/ruby/README.md
@@ -162,6 +162,32 @@ Whisper::Params.new(
 
 For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#voice-activity-detection-vad).
 
+### Output ###
+
+whispercpp supports SRT and WebVTT output:
+
+```ruby
+puts whisper.transcribe("path/to/audio.wav", Whisper::Params.new).to_webvtt
+# =>
+WEBVTT
+
+1
+00:00:00.000 --> 00:00:03.860
+ My thought I have nobody by a beauty and will as you poured.
+
+2
+00:00:03.860 --> 00:00:09.840
+ Mr. Rochester is sub in that so-don't find simplest, and devoted about, to let might in
+
+3
+00:00:09.840 --> 00:00:09.940
+ a
+
+```
+
+You may call `#to_srt`, too.
+
+
 API
 ---
 
@@ -196,7 +222,7 @@ whisper
       ed: format_time(segment.end_time),
       text: segment.text
     }
-    line << " (speaker turned)" if segment.speaker_next_turn?
+    line << " (speaker turned)" if segment.speaker_turn_next?
     puts line
   end
 
@@ -212,7 +238,7 @@ params.on_new_segment do |segment|
     ed: format_time(segment.end_time),
     text: segment.text
   }
-  line << " (speaker turned)" if segment.speaker_next_turn?
+  line << " (speaker turned)" if segment.speaker_turn_next?
   puts line
 end
 
diff --git a/bindings/ruby/ext/ruby_whisper.c b/bindings/ruby/ext/ruby_whisper.c
index a1c2c520..35c196ab 100644
--- a/bindings/ruby/ext/ruby_whisper.c
+++ b/bindings/ruby/ext/ruby_whisper.c
@@ -170,5 +170,7 @@ void Init_whisper() {
   init_ruby_whisper_model(&mWhisper);
   init_ruby_whisper_vad_params(&mVAD);
 
+  rb_require("whisper/context");
+  rb_require("whisper/segment");
   rb_require("whisper/model/uri");
 }
diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c
index cb58c8d4..bc0c6e99 100644
--- a/bindings/ruby/ext/ruby_whisper_context.c
+++ b/bindings/ruby/ext/ruby_whisper_context.c
@@ -664,7 +664,7 @@ init_ruby_whisper_context(VALUE *mWhisper)
   rb_define_method(cContext, "full", ruby_whisper_full, -1);
   rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1);
 
-  // High leve
+  // High level
   rb_define_method(cContext, "full_get_segment", ruby_whisper_full_get_segment, 1);
   rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0);
 
diff --git a/bindings/ruby/ext/ruby_whisper_segment.c b/bindings/ruby/ext/ruby_whisper_segment.c
index ce54a52d..a303187c 100644
--- a/bindings/ruby/ext/ruby_whisper_segment.c
+++ b/bindings/ruby/ext/ruby_whisper_segment.c
@@ -1,6 +1,15 @@
 #include <ruby.h>
 #include "ruby_whisper.h"
 
+#define N_KEY_NAMES 5
+
+static VALUE sym_start_time;
+static VALUE sym_end_time;
+static VALUE sym_text;
+static VALUE sym_no_speech_prob;
+static VALUE sym_speaker_turn_next;
+static VALUE key_names;
+
 extern const rb_data_type_t ruby_whisper_type;
 
 extern VALUE cSegment;
@@ -129,15 +138,83 @@ ruby_whisper_segment_get_no_speech_prob(VALUE self)
   return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
 }
 
+/*
+ * call-seq:
+ *   deconstruct_keys(keys) -> hash
+ *
+ * Pattern-matching hook. +keys+ is +nil+ (all keys) or an Array of Symbols (else TypeError).
+ * Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
+ * Unknown keys are skipped; asking for more than five keys returns an empty hash.
+ *
+ *   whisper.each_segment do |segment|
+ *     segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
+ *     puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
+ *   end
+ */
+static VALUE
+ruby_whisper_segment_deconstruct_keys(VALUE self, VALUE keys)
+{
+  ruby_whisper_segment *rws;
+  TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
+  ruby_whisper *rw;
+  TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+
+  VALUE hash = rb_hash_new();
+  if (NIL_P(keys)) {
+    keys = key_names;  // nil asks for every supported key
+  } else {
+    Check_Type(keys, T_ARRAY);  // RARRAY_LEN() on a non-Array is undefined behavior
+  }
+  const long n_keys = RARRAY_LEN(keys);
+  if (n_keys > N_KEY_NAMES) {
+    return hash;  // more keys than exist cannot all be provided
+  }
+  for (long i = 0; i < n_keys; i++) {
+    VALUE key = rb_ary_entry(keys, i);
+    if (key == sym_start_time) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_start_time(self));
+    }
+    if (key == sym_end_time) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_end_time(self));
+    }
+    if (key == sym_text) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_text(self));
+    }
+    if (key == sym_no_speech_prob) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_no_speech_prob(self));
+    }
+    if (key == sym_speaker_turn_next) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_speaker_turn_next(self));
+    }
+  }
+
+  return hash;
+}
+
 void
 init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
 {
   cSegment  = rb_define_class_under(*mWhisper, "Segment", rb_cObject);
 
+  sym_start_time = ID2SYM(rb_intern("start_time"));
+  sym_end_time = ID2SYM(rb_intern("end_time"));
+  sym_text = ID2SYM(rb_intern("text"));
+  sym_no_speech_prob = ID2SYM(rb_intern("no_speech_prob"));
+  sym_speaker_turn_next = ID2SYM(rb_intern("speaker_turn_next"));
+  // key_names lives in a C static, which the GC does not scan: register it before assigning.
+  rb_global_variable(&key_names);
+  key_names = rb_ary_new3(
+    N_KEY_NAMES,
+    sym_start_time, sym_end_time,
+    sym_text, sym_no_speech_prob,
+    sym_speaker_turn_next
+  );
+
   rb_define_alloc_func(cSegment, ruby_whisper_segment_allocate);
   rb_define_method(cSegment, "start_time", ruby_whisper_segment_get_start_time, 0);
   rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
-  rb_define_method(cSegment, "speaker_next_turn?", ruby_whisper_segment_get_speaker_turn_next, 0);
+  rb_define_method(cSegment, "speaker_turn_next?", ruby_whisper_segment_get_speaker_turn_next, 0);
   rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
   rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0);
+  rb_define_method(cSegment, "deconstruct_keys", ruby_whisper_segment_deconstruct_keys, 1);
 }
diff --git a/bindings/ruby/ext/ruby_whisper_transcribe.cpp b/bindings/ruby/ext/ruby_whisper_transcribe.cpp
index 71c4b49b..dc64af00 100644
--- a/bindings/ruby/ext/ruby_whisper_transcribe.cpp
+++ b/bindings/ruby/ext/ruby_whisper_transcribe.cpp
@@ -76,15 +76,16 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
     fprintf(stderr, "failed to process audio\n");
     return self;
   }
+  if (NIL_P(blk)) {
+    return self;
+  }
   const int n_segments = whisper_full_n_segments(rw->context);
   VALUE output = rb_str_new2("");
   for (int i = 0; i < n_segments; ++i) {
     const char * text = whisper_full_get_segment_text(rw->context, i);
     output = rb_str_concat(output, rb_str_new2(text));
   }
-  if (blk != Qnil) {
-    rb_funcall(blk, id_call, 1, output);
-  }
+  rb_funcall(blk, id_call, 1, output);
   return self;
 }
 #ifdef __cplusplus
diff --git a/bindings/ruby/lib/whisper/context.rb b/bindings/ruby/lib/whisper/context.rb
new file mode 100644
index 00000000..c3a134b7
--- /dev/null
+++ b/bindings/ruby/lib/whisper/context.rb
@@ -0,0 +1,25 @@
+module Whisper
+  class Context
+    # Renders every segment as one SubRip (SRT) document.
+    #
+    # Cues are numbered from 1. With no segments the result is an empty String.
+    # The accumulator is +"" rather than a bare literal so that appending to it
+    # stays valid when string literals are frozen (Ruby 3.4 can already warn).
+    def to_srt
+      each_segment.with_index(1).reduce(+"") {|srt, (segment, number)|
+        srt << "#{number}\n#{segment.to_srt_cue}\n"
+      }
+    end
+
+    # Renders every segment as one WebVTT document.
+    #
+    # The result starts with the "WEBVTT" line and a blank line, followed by
+    # cues numbered from 1. The accumulator is unfrozen (+"...") for the same
+    # reason as in #to_srt.
+    def to_webvtt
+      each_segment.with_index(1).reduce(+"WEBVTT\n\n") {|webvtt, (segment, number)|
+        webvtt << "#{number}\n#{segment.to_webvtt_cue}\n"
+      }
+    end
+  end
+end
diff --git a/bindings/ruby/lib/whisper/segment.rb b/bindings/ruby/lib/whisper/segment.rb
new file mode 100644
index 00000000..dc187dca
--- /dev/null
+++ b/bindings/ruby/lib/whisper/segment.rb
@@ -0,0 +1,68 @@
+module Whisper
+  class Segment
+    # Characters replaced with entities in cue text. Escaping ">" also
+    # keeps the "-->" timing separator from appearing inside a cue.
+    SRT_ESCAPES = {
+      "&" => "&amp;",
+      "<" => "&lt;",
+      ">" => "&gt;",
+    }.freeze
+    SRT_ESCAPES_RE = Regexp.union(SRT_ESCAPES.keys)
+    private_constant :SRT_ESCAPES, :SRT_ESCAPES_RE
+
+    # Returns the segment as SRT cue lines: "HH:MM:SS,mmm --> HH:MM:SS,mmm",
+    # then the escaped text, each newline-terminated. No cue number is included.
+    def to_srt_cue
+      "#{srt_start_time} --> #{srt_end_time}\n#{srt_text}\n"
+    end
+
+    # Same layout as #to_srt_cue with WebVTT timestamps ("HH:MM:SS.mmm").
+    def to_webvtt_cue
+      "#{webvtt_start_time} --> #{webvtt_end_time}\n#{webvtt_text}\n"
+    end
+
+    private
+
+    # Splits +time+ (milliseconds) into [hours, minutes, seconds, milliseconds].
+    def time_to_a(time)
+      sec, decimal_part = time.divmod(1000)
+      min, sec = sec.divmod(60)
+      hour, min = min.divmod(60)
+      [hour, min, sec, decimal_part]
+    end
+
+    # Formats milliseconds as an SRT timestamp (comma before the fraction).
+    def srt_time(time)
+      "%02d:%02d:%02d,%03d" % time_to_a(time)
+    end
+
+    def srt_start_time
+      srt_time(start_time)
+    end
+
+    def srt_end_time
+      srt_time(end_time)
+    end
+
+    # Segment text with "&", "<" and ">" replaced by their entities.
+    def srt_text
+      text.gsub(SRT_ESCAPES_RE, SRT_ESCAPES)
+    end
+
+    # Formats milliseconds as a WebVTT timestamp (dot before the fraction).
+    def webvtt_time(time)
+      "%02d:%02d:%02d.%03d" % time_to_a(time)
+    end
+
+    def webvtt_start_time
+      webvtt_time(start_time)
+    end
+
+    def webvtt_end_time
+      webvtt_time(end_time)
+    end
+
+    # WebVTT cue text uses the same escaping as SRT.
+    alias webvtt_text srt_text
+  end
+end
diff --git a/bindings/ruby/sig/whisper.rbs b/bindings/ruby/sig/whisper.rbs
index f9d09631..c73e6ad6 100644
--- a/bindings/ruby/sig/whisper.rbs
+++ b/bindings/ruby/sig/whisper.rbs
@@ -116,6 +116,9 @@ module Whisper
     def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
                      | (Params, _Samples, ?Integer n_samples) -> self
                      | (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
+
+    def to_srt: () -> String
+    def to_webvtt: () -> String
   end
 
   class Params
@@ -415,6 +418,14 @@ module Whisper
   end
 
   class Segment
+    type deconstructed_keys = {
+      start_time: (Integer | nil),
+      end_time: (Integer | nil),
+      text: (String | nil),
+      no_speech_prob: (Float | nil),
+      speaker_turn_next: (true | false | nil)
+    }
+
     # Start time in milliseconds.
     #
     def start_time: () -> Integer
@@ -424,10 +435,21 @@ module Whisper
     def end_time: () -> Integer
 
     # Whether the next segment is predicted as a speaker turn.
-    def speaker_next_turn?: () -> (true | false)
+    def speaker_turn_next?: () -> (true | false)
 
     def text: () -> String
     def no_speech_prob: () -> Float
+    def to_srt_cue: () -> String
+    def to_webvtt_cue: () -> String
+
+    #  Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
+    #
+    #      whisper.each_segment do |segment|
+    #        segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
+    #
+    #        puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
+    #      end
+    def deconstruct_keys: (Array[:start_time | :end_time | :text | :no_speech_prob | :speaker_turn_next] | nil) -> deconstructed_keys
   end
 
   module VAD
diff --git a/bindings/ruby/test/test_segment.rb b/bindings/ruby/test/test_segment.rb
index e8b99870..5b63e0c4 100644
--- a/bindings/ruby/test/test_segment.rb
+++ b/bindings/ruby/test/test_segment.rb
@@ -71,4 +71,66 @@ class TestSegment < TestBase
     end
     whisper.transcribe(AUDIO, params)
   end
+
+  def test_pattern_matching
+    segment = whisper.each_segment.first
+    segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
+
+    assert_equal segment.start_time, start_time
+    assert_equal segment.end_time, end_time
+    assert_equal segment.text, text
+    assert_equal segment.no_speech_prob, no_speech_prob
+    assert_equal segment.speaker_turn_next?, speaker_turn_next
+  end
+
+  def test_pattern_matching_partial
+    segment = whisper.each_segment.first
+    segment => {start_time:, end_time:, text:}
+
+    assert_equal segment.start_time, start_time
+    assert_equal segment.end_time, end_time
+    assert_equal segment.text, text
+  end
+
+  def test_deconstruct_keys
+    segment = whisper.each_segment.first
+    expected = {
+      start_time: segment.start_time,
+      end_time: segment.end_time,
+      text: segment.text,
+      no_speech_prob: segment.no_speech_prob,
+      speaker_turn_next: segment.speaker_turn_next?
+    }
+    assert_equal expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next])
+  end
+
+  def test_deconstruct_keys_non_existent
+    omit "Undefined behavior"
+
+    segment = whisper.each_segment.first
+
+    assert_equal({}, segment.deconstruct_keys([:non_existent]))
+  end
+
+  def test_deconstruct_keys_too_many_keys
+    omit "Undefined behavior"
+
+    segment = whisper.each_segment.first
+
+    assert_equal({}, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next, :extra_key]))
+  end
+
+  def test_deconstruct_keys_includes_non_existent_keys_not_too_many
+    omit "Undefined behavior"
+
+    segment = whisper.each_segment.first
+
+    expected = {
+      start_time: segment.start_time,
+      end_time: segment.end_time,
+      text: segment.text,
+      no_speech_prob: segment.no_speech_prob
+    }
+    assert_equal(expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :non_existent]))
+  end
 end
diff --git a/bindings/ruby/test/test_whisper.rb b/bindings/ruby/test/test_whisper.rb
index 8f1e69db..e429c543 100644
--- a/bindings/ruby/test/test_whisper.rb
+++ b/bindings/ruby/test/test_whisper.rb
@@ -113,7 +113,7 @@ class TestWhisper < TestBase
   end
 
   def test_system_info_str
-    assert_match /\AWHISPER : COREML = \d | OPENVINO = \d |/, Whisper.system_info_str
+    assert_match(/\AWHISPER : COREML = \d \| OPENVINO = \d \|/, Whisper.system_info_str)
   end
 
   def test_log_set
@@ -245,4 +245,48 @@ class TestWhisper < TestBase
       assert_match(/for your country/i, text)
     end
   end
+
+  def test_to_srt
+    whisper = Whisper::Context.new("base.en")
+    whisper.transcribe AUDIO, @params
+
+    lines = whisper.to_srt.lines
+    assert_match(/\A\d+\n/, lines[0])
+    assert_match(/\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n/, lines[1])
+    assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[2])
+  end
+
+  def test_to_webvtt
+    whisper = Whisper::Context.new("base.en")
+    whisper.transcribe AUDIO, @params
+
+    lines = whisper.to_webvtt.lines
+    assert_equal "WEBVTT\n", lines[0]
+    assert_equal "\n", lines[1]
+    assert_match(/\A\d+\n/, lines[2])
+    assert_match(/\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}\n/, lines[3])
+    assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[4])
+  end
+
+  sub_test_case "Format needs escape" do
+    def setup
+      @whisper = Whisper::Context.new("base.en")
+      @whisper.transcribe AUDIO, Whisper::Params.new
+      segment = @whisper.each_segment.first
+      segment.define_singleton_method :text do
+        "& so my fellow Americans --> ask not what your country can do for you <-- ask what you can do for your country."
+      end
+      @whisper.define_singleton_method :each_segment do
+        Enumerator.new(3) {|yielder| 3.times {yielder << segment}}
+      end
+    end
+
+    def test_to_srt_escape
+      assert_equal "&amp; so my fellow Americans --&gt; ask not what your country can do for you &lt;-- ask what you can do for your country.\n", @whisper.to_srt.lines[2]
+    end
+
+    def test_to_webvtt_escape
+      assert_equal "&amp; so my fellow Americans --&gt; ask not what your country can do for you &lt;-- ask what you can do for your country.\n", @whisper.to_webvtt.lines[4]
+    end
+  end
 end
diff --git a/bindings/ruby/whispercpp.gemspec b/bindings/ruby/whispercpp.gemspec
index b838aa9f..0a2a0c5f 100644
--- a/bindings/ruby/whispercpp.gemspec
+++ b/bindings/ruby/whispercpp.gemspec
@@ -4,7 +4,7 @@ Gem::Specification.new do |s|
   s.name    = "whispercpp"
   s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
   s.version = '1.3.3'
-  s.date    = '2025-06-03'
+  s.date    = '2025-06-10'
   s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
   s.email   = 'todd.fisher@gmail.com'
   s.extra_rdoc_files = ['LICENSE', 'README.md']