mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-06-11 11:41:34 +00:00
ruby : output format (#3237)
* Fix a typo * Don't allocate output string unless needed * Add methods to output SRT and WebVTT * Add tests for output methods * Make constants for output private * Add signatures for output methods * Add document on output methods * Fix method name: Segment#speaker_next_turn? -> #speacker_turn_next? * Add Whisper::Segment#descotruct_keys * Add test for Whisper::Context#descotruct_keys * Add signature of Whisper::Segment#deconstruct_keys * Use parentheses to suppress warning * Update date
This commit is contained in:
parent
d78f081423
commit
fbead67549
@ -162,6 +162,32 @@ Whisper::Params.new(
|
||||
|
||||
For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#voice-activity-detection-vad).
|
||||
|
||||
### Output ###
|
||||
|
||||
whispercpp supports SRT and WebVTT output:
|
||||
|
||||
```ruby
|
||||
puts whisper.transcribe("path/to/audio.wav", Whisper::Params.new).to_webvtt
|
||||
# =>
|
||||
WEBVTT
|
||||
|
||||
1
|
||||
00:00:00.000 --> 00:00:03.860
|
||||
My thought I have nobody by a beauty and will as you poured.
|
||||
|
||||
2
|
||||
00:00:03.860 --> 00:00:09.840
|
||||
Mr. Rochester is sub in that so-don't find simplest, and devoted about, to let might in
|
||||
|
||||
3
|
||||
00:00:09.840 --> 00:00:09.940
|
||||
a
|
||||
|
||||
```
|
||||
|
||||
You may call `#to_srt`, too
|
||||
|
||||
|
||||
API
|
||||
---
|
||||
|
||||
@ -196,7 +222,7 @@ whisper
|
||||
ed: format_time(segment.end_time),
|
||||
text: segment.text
|
||||
}
|
||||
line << " (speaker turned)" if segment.speaker_next_turn?
|
||||
line << " (speaker turned)" if segment.speaker_turn_next?
|
||||
puts line
|
||||
end
|
||||
|
||||
@ -212,7 +238,7 @@ params.on_new_segment do |segment|
|
||||
ed: format_time(segment.end_time),
|
||||
text: segment.text
|
||||
}
|
||||
line << " (speaker turned)" if segment.speaker_next_turn?
|
||||
line << " (speaker turned)" if segment.speaker_turn_next?
|
||||
puts line
|
||||
end
|
||||
|
||||
|
@ -170,5 +170,7 @@ void Init_whisper() {
|
||||
init_ruby_whisper_model(&mWhisper);
|
||||
init_ruby_whisper_vad_params(&mVAD);
|
||||
|
||||
rb_require("whisper/context");
|
||||
rb_require("whisper/segment");
|
||||
rb_require("whisper/model/uri");
|
||||
}
|
||||
|
@ -664,7 +664,7 @@ init_ruby_whisper_context(VALUE *mWhisper)
|
||||
rb_define_method(cContext, "full", ruby_whisper_full, -1);
|
||||
rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1);
|
||||
|
||||
// High leve
|
||||
// High level
|
||||
rb_define_method(cContext, "full_get_segment", ruby_whisper_full_get_segment, 1);
|
||||
rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0);
|
||||
|
||||
|
@ -1,6 +1,15 @@
|
||||
#include <ruby.h>
|
||||
#include "ruby_whisper.h"
|
||||
|
||||
#define N_KEY_NAMES 5
|
||||
|
||||
static VALUE sym_start_time;
|
||||
static VALUE sym_end_time;
|
||||
static VALUE sym_text;
|
||||
static VALUE sym_no_speech_prob;
|
||||
static VALUE sym_speaker_turn_next;
|
||||
static VALUE key_names;
|
||||
|
||||
extern const rb_data_type_t ruby_whisper_type;
|
||||
|
||||
extern VALUE cSegment;
|
||||
@ -129,15 +138,83 @@ ruby_whisper_segment_get_no_speech_prob(VALUE self)
|
||||
return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
|
||||
}
|
||||
|
||||
/*
|
||||
* call-seq:
|
||||
* deconstruct_keys(keys) -> hash
|
||||
*
|
||||
* Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
|
||||
*
|
||||
* whisper.each_segment do |segment|
|
||||
* segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
|
||||
*
|
||||
* puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
|
||||
* end
|
||||
*/
|
||||
static VALUE
|
||||
ruby_whisper_segment_deconstruct_keys(VALUE self, VALUE keys)
|
||||
{
|
||||
ruby_whisper_segment *rws;
|
||||
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
|
||||
ruby_whisper *rw;
|
||||
TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
|
||||
|
||||
VALUE hash = rb_hash_new();
|
||||
long n_keys;
|
||||
if (NIL_P(keys)) {
|
||||
keys = key_names;
|
||||
n_keys = N_KEY_NAMES;
|
||||
} else {
|
||||
n_keys = RARRAY_LEN(keys);
|
||||
if (n_keys > N_KEY_NAMES) {
|
||||
return hash;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < n_keys; i++) {
|
||||
VALUE key = rb_ary_entry(keys, i);
|
||||
if (key == sym_start_time) {
|
||||
rb_hash_aset(hash, key, ruby_whisper_segment_get_start_time(self));
|
||||
}
|
||||
if (key == sym_end_time) {
|
||||
rb_hash_aset(hash, key, ruby_whisper_segment_get_end_time(self));
|
||||
}
|
||||
if (key == sym_text) {
|
||||
rb_hash_aset(hash, key, ruby_whisper_segment_get_text(self));
|
||||
}
|
||||
if (key == sym_no_speech_prob) {
|
||||
rb_hash_aset(hash, key, ruby_whisper_segment_get_no_speech_prob(self));
|
||||
}
|
||||
if (key == sym_speaker_turn_next) {
|
||||
rb_hash_aset(hash, key, ruby_whisper_segment_get_speaker_turn_next(self));
|
||||
}
|
||||
}
|
||||
|
||||
return hash;
|
||||
}
|
||||
|
||||
void
|
||||
init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
|
||||
{
|
||||
cSegment = rb_define_class_under(*mWhisper, "Segment", rb_cObject);
|
||||
|
||||
sym_start_time = ID2SYM(rb_intern("start_time"));
|
||||
sym_end_time = ID2SYM(rb_intern("end_time"));
|
||||
sym_text = ID2SYM(rb_intern("text"));
|
||||
sym_no_speech_prob = ID2SYM(rb_intern("no_speech_prob"));
|
||||
sym_speaker_turn_next = ID2SYM(rb_intern("speaker_turn_next"));
|
||||
key_names = rb_ary_new3(
|
||||
N_KEY_NAMES,
|
||||
sym_start_time,
|
||||
sym_end_time,
|
||||
sym_text,
|
||||
sym_no_speech_prob,
|
||||
sym_speaker_turn_next
|
||||
);
|
||||
|
||||
rb_define_alloc_func(cSegment, ruby_whisper_segment_allocate);
|
||||
rb_define_method(cSegment, "start_time", ruby_whisper_segment_get_start_time, 0);
|
||||
rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
|
||||
rb_define_method(cSegment, "speaker_next_turn?", ruby_whisper_segment_get_speaker_turn_next, 0);
|
||||
rb_define_method(cSegment, "speaker_turn_next?", ruby_whisper_segment_get_speaker_turn_next, 0);
|
||||
rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
|
||||
rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0);
|
||||
rb_define_method(cSegment, "deconstruct_keys", ruby_whisper_segment_deconstruct_keys, 1);
|
||||
}
|
||||
|
@ -76,15 +76,16 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
|
||||
fprintf(stderr, "failed to process audio\n");
|
||||
return self;
|
||||
}
|
||||
if (NIL_P(blk)) {
|
||||
return self;
|
||||
}
|
||||
const int n_segments = whisper_full_n_segments(rw->context);
|
||||
VALUE output = rb_str_new2("");
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
const char * text = whisper_full_get_segment_text(rw->context, i);
|
||||
output = rb_str_concat(output, rb_str_new2(text));
|
||||
}
|
||||
if (blk != Qnil) {
|
||||
rb_funcall(blk, id_call, 1, output);
|
||||
}
|
||||
rb_funcall(blk, id_call, 1, output);
|
||||
return self;
|
||||
}
|
||||
#ifdef __cplusplus
|
||||
|
15
bindings/ruby/lib/whisper/context.rb
Normal file
15
bindings/ruby/lib/whisper/context.rb
Normal file
@ -0,0 +1,15 @@
|
||||
module Whisper
|
||||
class Context
|
||||
def to_srt
|
||||
each_segment.with_index.reduce("") {|srt, (segment, index)|
|
||||
srt << "#{index + 1}\n#{segment.to_srt_cue}\n"
|
||||
}
|
||||
end
|
||||
|
||||
def to_webvtt
|
||||
each_segment.with_index.reduce("WEBVTT\n\n") {|webvtt, (segment, index)|
|
||||
webvtt << "#{index + 1}\n#{segment.to_webvtt_cue}\n"
|
||||
}
|
||||
end
|
||||
end
|
||||
end
|
58
bindings/ruby/lib/whisper/segment.rb
Normal file
58
bindings/ruby/lib/whisper/segment.rb
Normal file
@ -0,0 +1,58 @@
|
||||
module Whisper
|
||||
class Segment
|
||||
SRT_ESCAPES = {
|
||||
"&" => "&",
|
||||
"<" => "<",
|
||||
">" => ">",
|
||||
}
|
||||
SRT_ESCAPES_RE = Regexp.union(SRT_ESCAPES.keys)
|
||||
private_constant :SRT_ESCAPES, :SRT_ESCAPES_RE
|
||||
|
||||
def to_srt_cue
|
||||
"#{srt_start_time} --> #{srt_end_time}\n#{srt_text}\n"
|
||||
end
|
||||
|
||||
def to_webvtt_cue
|
||||
"#{webvtt_start_time} --> #{webvtt_end_time}\n#{webvtt_text}\n"
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def time_to_a(time)
|
||||
sec, decimal_part = time.divmod(1000)
|
||||
min, sec = sec.divmod(60)
|
||||
hour, min = min.divmod(60)
|
||||
[hour, min, sec, decimal_part]
|
||||
end
|
||||
|
||||
def srt_time(time)
|
||||
"%02d:%02d:%02d,%03d" % time_to_a(time)
|
||||
end
|
||||
|
||||
def srt_start_time
|
||||
srt_time(start_time)
|
||||
end
|
||||
|
||||
def srt_end_time
|
||||
srt_time(end_time)
|
||||
end
|
||||
|
||||
def srt_text
|
||||
text.gsub(SRT_ESCAPES_RE, SRT_ESCAPES)
|
||||
end
|
||||
|
||||
def webvtt_time(time)
|
||||
"%02d:%02d:%02d.%03d" % time_to_a(time)
|
||||
end
|
||||
|
||||
def webvtt_start_time
|
||||
webvtt_time(start_time)
|
||||
end
|
||||
|
||||
def webvtt_end_time
|
||||
webvtt_time(end_time)
|
||||
end
|
||||
|
||||
alias webvtt_text srt_text
|
||||
end
|
||||
end
|
@ -116,6 +116,9 @@ module Whisper
|
||||
def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
|
||||
| (Params, _Samples, ?Integer n_samples) -> self
|
||||
| (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
|
||||
|
||||
def to_srt: () -> String
|
||||
def to_webvtt: () -> String
|
||||
end
|
||||
|
||||
class Params
|
||||
@ -415,6 +418,14 @@ module Whisper
|
||||
end
|
||||
|
||||
class Segment
|
||||
type deconstructed_keys = {
|
||||
start_time: (Integer | nil),
|
||||
end_time: (Integer | nil),
|
||||
text: (String | nil),
|
||||
no_speech_prob: (Float | nil),
|
||||
speaker_turn_next: (true | false | nil)
|
||||
}
|
||||
|
||||
# Start time in milliseconds.
|
||||
#
|
||||
def start_time: () -> Integer
|
||||
@ -424,10 +435,21 @@ module Whisper
|
||||
def end_time: () -> Integer
|
||||
|
||||
# Whether the next segment is predicted as a speaker turn.
|
||||
def speaker_next_turn?: () -> (true | false)
|
||||
def speaker_turn_next?: () -> (true | false)
|
||||
|
||||
def text: () -> String
|
||||
def no_speech_prob: () -> Float
|
||||
def to_srt_cue: () -> String
|
||||
def to_webvtt_cue: () -> String
|
||||
|
||||
# Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
|
||||
#
|
||||
# whisper.each_segment do |segment|
|
||||
# segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
|
||||
#
|
||||
# puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
|
||||
# end
|
||||
def deconstruct_keys: (Array[:start_time | :end_time | :text | :no_speech_prob | :speaker_turn_next] | nil) -> deconstructed_keys
|
||||
end
|
||||
|
||||
module VAD
|
||||
|
@ -71,4 +71,66 @@ class TestSegment < TestBase
|
||||
end
|
||||
whisper.transcribe(AUDIO, params)
|
||||
end
|
||||
|
||||
def test_pattern_matching
|
||||
segment = whisper.each_segment.first
|
||||
segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
|
||||
|
||||
assert_equal segment.start_time, start_time
|
||||
assert_equal segment.end_time, end_time
|
||||
assert_equal segment.text, text
|
||||
assert_equal segment.no_speech_prob, no_speech_prob
|
||||
assert_equal segment.speaker_turn_next?, speaker_turn_next
|
||||
end
|
||||
|
||||
def test_pattern_matching_partial
|
||||
segment = whisper.each_segment.first
|
||||
segment => {start_time:, end_time:, text:}
|
||||
|
||||
assert_equal segment.start_time, start_time
|
||||
assert_equal segment.end_time, end_time
|
||||
assert_equal segment.text, text
|
||||
end
|
||||
|
||||
def test_deconstruct_keys
|
||||
segment = whisper.each_segment.first
|
||||
expected = {
|
||||
start_time: segment.start_time,
|
||||
end_time: segment.end_time,
|
||||
text: segment.text,
|
||||
no_speech_prob: segment.no_speech_prob,
|
||||
speaker_turn_next: segment.speaker_turn_next?
|
||||
}
|
||||
assert_equal expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next])
|
||||
end
|
||||
|
||||
def test_deconstruct_keys_non_existent
|
||||
omit "Undefined behavior"
|
||||
|
||||
segment = whisper.each_segment.first
|
||||
|
||||
assert_equal({}, segment.deconstruct_keys([:non_existent]))
|
||||
end
|
||||
|
||||
def test_deconstruct_keys_too_many_keys
|
||||
omit "Undefined behavior"
|
||||
|
||||
segment = whisper.each_segment.first
|
||||
|
||||
assert_equal({}, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next, :extra_key]))
|
||||
end
|
||||
|
||||
def test_deconstruct_keys_includes_non_existent_keys_not_too_many
|
||||
omit "Undefined behavior"
|
||||
|
||||
segment = whisper.each_segment.first
|
||||
|
||||
expected = {
|
||||
start_time: segment.start_time,
|
||||
end_time: segment.end_time,
|
||||
text: segment.text,
|
||||
no_speech_prob: segment.no_speech_prob
|
||||
}
|
||||
assert_equal(expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :non_existent]))
|
||||
end
|
||||
end
|
||||
|
@ -113,7 +113,7 @@ class TestWhisper < TestBase
|
||||
end
|
||||
|
||||
def test_system_info_str
|
||||
assert_match /\AWHISPER : COREML = \d | OPENVINO = \d |/, Whisper.system_info_str
|
||||
assert_match(/\AWHISPER : COREML = \d | OPENVINO = \d |/, Whisper.system_info_str)
|
||||
end
|
||||
|
||||
def test_log_set
|
||||
@ -245,4 +245,48 @@ class TestWhisper < TestBase
|
||||
assert_match(/for your country/i, text)
|
||||
end
|
||||
end
|
||||
|
||||
def test_to_srt
|
||||
whisper = Whisper::Context.new("base.en")
|
||||
whisper.transcribe AUDIO, @params
|
||||
|
||||
lines = whisper.to_srt.lines
|
||||
assert_match(/\A\d+\n/, lines[0])
|
||||
assert_match(/\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n/, lines[1])
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[2])
|
||||
end
|
||||
|
||||
def test_to_webvtt
|
||||
whisper = Whisper::Context.new("base.en")
|
||||
whisper.transcribe AUDIO, @params
|
||||
|
||||
lines = whisper.to_webvtt.lines
|
||||
assert_equal "WEBVTT\n", lines[0]
|
||||
assert_equal "\n", lines[1]
|
||||
assert_match(/\A\d+\n/, lines[2])
|
||||
assert_match(/\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}\n/, lines[3])
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[4])
|
||||
end
|
||||
|
||||
sub_test_case "Format needs escape" do
|
||||
def setup
|
||||
@whisper = Whisper::Context.new("base.en")
|
||||
@whisper.transcribe AUDIO, Whisper::Params.new
|
||||
segment = @whisper.each_segment.first
|
||||
segment.define_singleton_method :text do
|
||||
"& so my fellow Americans --> ask not what your country can do for you <-- ask what you can do for your country."
|
||||
end
|
||||
@whisper.define_singleton_method :each_segment do
|
||||
Enumerator.new(3) {|yielder| 3.times {yielder << segment}}
|
||||
end
|
||||
end
|
||||
|
||||
def test_to_srt_escape
|
||||
assert_equal "& so my fellow Americans --> ask not what your country can do for you <-- ask what you can do for your country.\n", @whisper.to_srt.lines[2]
|
||||
end
|
||||
|
||||
def test_to_webvtt_escape
|
||||
assert_equal "& so my fellow Americans --> ask not what your country can do for you <-- ask what you can do for your country.\n", @whisper.to_webvtt.lines[4]
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -4,7 +4,7 @@ Gem::Specification.new do |s|
|
||||
s.name = "whispercpp"
|
||||
s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
|
||||
s.version = '1.3.3'
|
||||
s.date = '2025-06-03'
|
||||
s.date = '2025-06-10'
|
||||
s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
|
||||
s.email = 'todd.fisher@gmail.com'
|
||||
s.extra_rdoc_files = ['LICENSE', 'README.md']
|
||||
|
Loading…
x
Reference in New Issue
Block a user