# Type signatures for the Whisper module: contexts, transcription parameters,
# models, segments, voice activity detection (VAD) parameters and errors.
module Whisper
  # Anything that can be passed as audio samples: it reports its length and
  # yields each sample as a Float.
  interface _Samples
    def length: () -> Integer
    def each: { (Float) -> void } -> void
  end

  type log_callback = ^(Integer level, String message, Object user_data) -> void
  type new_segment_callback = ^(Whisper::Context, void, Integer n_new, Object user_data) -> void
  type progress_callback = ^(Whisper::Context, void, Integer progress, Object user_data) -> void
  type encoder_begin_callback = ^(Whisper::Context, void, Object user_data) -> void
  type abort_callback = ^(Whisper::Context, void, Object user_data) -> boolish

  LOG_LEVEL_NONE: Integer
  LOG_LEVEL_INFO: Integer
  LOG_LEVEL_WARN: Integer
  LOG_LEVEL_ERROR: Integer
  LOG_LEVEL_DEBUG: Integer
  LOG_LEVEL_CONT: Integer

  def self.lang_max_id: () -> Integer
  def self.lang_id: (string name) -> Integer
  def self.lang_str: (Integer id) -> String
  def self.lang_str_full: (Integer id) -> String
  def self.log_set: (log_callback, Object? user_data) -> log_callback
  def self.system_info_str: () -> String

  class Context
    def self.new: (String | path | ::URI::HTTP) -> instance

    # transcribe a single file
    # can emit to a block results
    #
    #   params = Whisper::Params.new
    #   params.duration = 60_000
    #   whisper.transcribe "path/to/audio.wav", params do |text|
    #     puts text
    #   end
    #
    def transcribe: (string, Params, ?n_processors: Integer) -> self
                  | (string, Params, ?n_processors: Integer) { (String) -> void } -> self

    def model_n_vocab: () -> Integer
    def model_n_audio_ctx: () -> Integer
    def model_n_audio_state: () -> Integer
    def model_n_text_head: () -> Integer
    def model_n_text_layer: () -> Integer
    def model_n_mels: () -> Integer
    def model_ftype: () -> Integer
    def model_type: () -> String

    # Yields each Whisper::Segment:
    #
    #   whisper.transcribe("path/to/audio.wav", params)
    #   whisper.each_segment do |segment|
    #     puts segment.text
    #   end
    #
    # Returns an Enumerator if no block given:
    #
    #   whisper.transcribe("path/to/audio.wav", params)
    #   enum = whisper.each_segment
    #   enum.to_a # => [#<Whisper::Segment>, ...]
    #
    def each_segment: { (Segment) -> void } -> void
                    | () -> Enumerator[Segment]

    def model: () -> Model
    def full_get_segment: (Integer nth) -> Segment
    def full_n_segments: () -> Integer

    # Language ID, which can be converted to string by Whisper.lang_str and Whisper.lang_str_full.
    #
    def full_lang_id: () -> Integer

    # Start time of a segment indexed by +segment_index+ in centiseconds (units of 10 milliseconds).
    #
    #   full_get_segment_t0(3) # => 1668 (16680 ms)
    #
    def full_get_segment_t0: (Integer) -> Integer

    # End time of a segment indexed by +segment_index+ in centiseconds (units of 10 milliseconds).
    #
    #   full_get_segment_t1(3) # => 1668 (16680 ms)
    #
    def full_get_segment_t1: (Integer) -> Integer

    # Whether the next segment indexed by +segment_index+ is predicted as a speaker turn.
    #
    #   full_get_segment_speaker_turn_next(3) # => true
    #
    def full_get_segment_speaker_turn_next: (Integer) -> (true | false)

    # Text of a segment indexed by +segment_index+.
    #
    #   full_get_segment_text(3) # => "ask not what your country can do for you, ..."
    #
    def full_get_segment_text: (Integer) -> String

    def full_get_segment_no_speech_prob: (Integer) -> Float

    # Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
    # Not thread safe for same context
    # Uses the specified decoding strategy to obtain the text.
    #
    # The second argument +samples+ must be an array of samples, respond to :length, or be a MemoryView of an array of float. It must be 32 bit float PCM audio data.
    #
    def full: (Params, Array[Float] samples, ?Integer n_samples) -> self
            | (Params, _Samples, ?Integer n_samples) -> self

    # Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
    # Result is stored in the default state of the context
    # Not thread safe if executed in parallel on the same context.
    # It seems this approach can offer some speedup in some cases.
    # However, the transcription accuracy can be worse at the beginning and end of each chunk.
    #
    def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
                     | (Params, _Samples, ?Integer n_samples) -> self
                     | (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self

    def to_srt: () -> String
    def to_webvtt: () -> String
  end

  class Params
    def self.new: (
      ?language: string,
      ?translate: boolish,
      ?no_context: boolish,
      ?single_segment: boolish,
      ?print_special: boolish,
      ?print_progress: boolish,
      ?print_realtime: boolish,
      ?print_timestamps: boolish,
      ?suppress_blank: boolish,
      ?suppress_nst: boolish,
      ?token_timestamps: boolish,
      ?split_on_word: boolish,
      ?initial_prompt: string | nil,
      ?diarize: boolish,
      ?offset: Integer,
      ?duration: Integer,
      ?max_text_tokens: Integer,
      ?temperature: Float,
      ?max_initial_ts: Float,
      ?length_penalty: Float,
      ?temperature_inc: Float,
      ?entropy_thold: Float,
      ?logprob_thold: Float,
      ?no_speech_thold: Float,
      ?new_segment_callback: new_segment_callback,
      ?new_segment_callback_user_data: Object,
      ?progress_callback: progress_callback,
      ?progress_callback_user_data: Object,
      ?encoder_begin_callback: encoder_begin_callback,
      ?encoder_begin_callback_user_data: Object,
      ?abort_callback: abort_callback,
      ?abort_callback_user_data: Object,
      ?vad: boolish,
      ?vad_model_path: path | URI,
      ?vad_params: Whisper::VAD::Params
    ) -> instance

    # params.language = "auto" | "en", etc...
    #
    def language=: (String) -> String # TODO: Enumerate lang names

    def language: () -> String
    def translate=: (boolish) -> boolish
    def translate: () -> (true | false)
    def no_context=: (boolish) -> boolish

    # If true, does not use past transcription (if any) as initial prompt for the decoder.
    #
    def no_context: () -> (true | false)

    def single_segment=: (boolish) -> boolish

    # If true, forces single segment output (useful for streaming).
    #
    def single_segment: () -> (true | false)

    def print_special=: (boolish) -> boolish

    # If true, prints special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.).
    #
    def print_special: () -> (true | false)

    def print_progress=: (boolish) -> boolish

    # If true, prints progress information.
    #
    def print_progress: () -> (true | false)

    def print_realtime=: (boolish) -> boolish

    # If true, prints results from within whisper.cpp. (avoid it, use callback instead)
    #
    def print_realtime: () -> (true | false)

    # If true, prints timestamps for each text segment when printing realtime.
    #
    def print_timestamps=: (boolish) -> boolish

    def print_timestamps: () -> (true | false)

    def suppress_blank=: (boolish) -> boolish

    # If true, suppresses blank outputs.
    #
    def suppress_blank: () -> (true | false)

    def suppress_nst=: (boolish) -> boolish

    # If true, suppresses non-speech-tokens.
    #
    def suppress_nst: () -> (true | false)

    def token_timestamps=: (boolish) -> boolish

    # If true, enables token-level timestamps.
    #
    def token_timestamps: () -> (true | false)

    def split_on_word=: (boolish) -> boolish

    # If true, split on word rather than on token (when used with max_len).
    #
    def split_on_word: () -> (true | false)

    def initial_prompt=: (_ToS) -> _ToS

    # Tokens to provide to the whisper decoder as initial prompt
    # these are prepended to any existing text context from a previous call
    # use whisper_tokenize() to convert text to tokens.
    # Maximum of whisper_n_text_ctx()/2 tokens are used (typically 224).
    #
    def initial_prompt: () -> (String | nil)

    def diarize=: (boolish) -> boolish

    # If true, enables diarization.
    #
    def diarize: () -> (true | false)

    def offset=: (Integer) -> Integer

    # Start offset in ms.
    #
    def offset: () -> Integer

    def duration=: (Integer) -> Integer

    # Audio duration to process in ms.
    #
    def duration: () -> Integer

    def max_text_tokens=: (Integer) -> Integer

    # Max tokens to use from past text as prompt for the decoder.
    #
    def max_text_tokens: () -> Integer

    def temperature=: (Float) -> Float
    def temperature: () -> Float
    def max_initial_ts=: (Float) -> Float

    # See https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
    #
    def max_initial_ts: () -> Float

    def length_penalty=: (Float) -> Float
    def length_penalty: () -> Float
    def temperature_inc=: (Float) -> Float
    def temperature_inc: () -> Float
    def entropy_thold=: (Float) -> Float

    # Similar to OpenAI's "compression_ratio_threshold"
    #
    def entropy_thold: () -> Float

    def logprob_thold=: (Float) -> Float
    def logprob_thold: () -> Float
    def no_speech_thold=: (Float) -> Float
    def no_speech_thold: () -> Float

    # Sets new segment callback, called for every newly generated text segment.
    #
    #   params.new_segment_callback = ->(context, _, n_new, user_data) {
    #     # ...
    #   }
    #
    def new_segment_callback=: (new_segment_callback) -> new_segment_callback
    def new_segment_callback: () -> (new_segment_callback | nil)

    # Sets user data passed to the last argument of new segment callback.
    #
    def new_segment_callback_user_data=: (Object) -> Object

    def new_segment_callback_user_data: () -> Object

    # Sets progress callback, called on each progress update.
    #
    #   params.progress_callback = ->(context, _, progress, user_data) {
    #     # ...
    #   }
    #
    # +progress+ is an Integer between 0 and 100.
    #
    def progress_callback=: (progress_callback) -> progress_callback

    def progress_callback: () -> (progress_callback | nil)

    # Sets user data passed to the last argument of progress callback.
    #
    def progress_callback_user_data=: (Object) -> Object

    def progress_callback_user_data: () -> Object

    # Sets encoder begin callback, called when the encoder starts.
    #
    def encoder_begin_callback=: (encoder_begin_callback) -> encoder_begin_callback

    def encoder_begin_callback: () -> (encoder_begin_callback | nil)

    # Sets user data passed to the last argument of encoder begin callback.
    #
    def encoder_begin_callback_user_data=: (Object) -> Object

    def encoder_begin_callback_user_data: () -> Object

    # Sets abort callback, called to check if the process should be aborted.
    #
    #   params.abort_callback = ->(user_data) {
    #     # ...
    #   }
    #
    # NOTE(review): the example lambda above takes a single +user_data+
    # argument, while the +abort_callback+ type alias declares three
    # parameters (context, void, user_data). Confirm which arity the native
    # implementation actually calls with and align the two.
    #
    def abort_callback=: (abort_callback) -> abort_callback

    def abort_callback: () -> (abort_callback | nil)

    # Sets user data passed to the last argument of abort callback.
    #
    def abort_callback_user_data=: (Object) -> Object

    def abort_callback_user_data: () -> Object

    # Enable VAD
    #
    def vad=: (boolish) -> boolish

    def vad: () -> (true | false)

    # Path to the VAD model
    def vad_model_path=: (path | URI | nil) -> (path | URI | nil)

    def vad_model_path: () -> (String | nil)

    def vad_params=: (Whisper::VAD::Params) -> Whisper::VAD::Params
    def vad_params: () -> (Whisper::VAD::Params)

    # Hook called on new segment. Yields each Whisper::Segment.
    #
    #   whisper.on_new_segment do |segment|
    #     # ...
    #   end
    #
    def on_new_segment: { (Segment) -> void } -> void

    # Hook called on progress update. Yields each progress Integer between 0 and 100.
    #
    def on_progress: { (Integer progress) -> void } -> void

    # Hook called when the encoder starts.
    #
    def on_encoder_begin: { () -> void } -> void

    # Calls the block to determine whether to abort. Return +true+ when you want to abort.
    #
    #   params.abort_on do
    #     if some_condition
    #       true # abort
    #     else
    #       false # continue
    #     end
    #   end
    #
    def abort_on: { (Object user_data) -> boolish } -> void
  end

  class Model
    def self.pre_converted_models: () -> Hash[String, Model::URI]
    def self.coreml_compiled_models: () -> Hash[Model::URI, Model::ZipURI]
    def self.new: () -> instance
    def n_vocab: () -> Integer
    def n_audio_ctx: () -> Integer
    def n_audio_state: () -> Integer
    def n_audio_head: () -> Integer
    def n_audio_layer: () -> Integer
    def n_text_ctx: () -> Integer
    def n_text_state: () -> Integer
    def n_text_head: () -> Integer
    def n_text_layer: () -> Integer
    def n_mels: () -> Integer
    def ftype: () -> Integer
    def type: () -> String

    class URI
      def self.new: (string | ::URI::HTTP) -> instance
      def to_path: () -> String
      def clear_cache: () -> void
    end

    class ZipURI < URI
      def cache: () -> String
      def clear_cache: () -> void
    end
  end

  class Segment
    # Shape of the Hash returned by #deconstruct_keys; every value may be nil.
    type deconstructed_keys = {
      start_time: (Integer | nil),
      end_time: (Integer | nil),
      text: (String | nil),
      no_speech_prob: (Float | nil),
      speaker_turn_next: (true | false | nil)
    }

    # Start time in milliseconds.
    #
    def start_time: () -> Integer

    # End time in milliseconds.
    #
    def end_time: () -> Integer

    # Whether the next segment is predicted as a speaker turn.
    def speaker_turn_next?: () -> (true | false)

    def text: () -> String
    def no_speech_prob: () -> Float
    def to_srt_cue: () -> String
    def to_webvtt_cue: () -> String

    # Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
    #
    #   whisper.each_segment do |segment|
    #     segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
    #
    #     puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
    #   end
    def deconstruct_keys: (Array[:start_time | :end_time | :text | :no_speech_prob | :speaker_turn_next] | nil) -> deconstructed_keys
  end

  module VAD
    class Params
      def self.new: (
        ?threshold: Float,
        ?min_speech_duration_ms: Integer,
        ?min_silence_duration_ms: Integer,
        ?max_speech_duration_s: Float,
        ?speech_pad_ms: Integer,
        ?samples_overlap: Float
      ) -> instance

      # Probability threshold to consider as speech.
      #
      def threshold=: (Float) -> Float

      def threshold: () -> Float

      # Min duration for a valid speech segment.
      #
      def min_speech_duration_ms=: (Integer) -> Integer

      def min_speech_duration_ms: () -> Integer

      # Min silence duration to consider speech as ended.
      #
      def min_silence_duration_ms=: (Integer) -> Integer

      def min_silence_duration_ms: () -> Integer

      # Max duration of a speech segment before forcing a new segment.
      def max_speech_duration_s=: (Float) -> Float

      def max_speech_duration_s: () -> Float

      # Padding added before and after speech segments.
      #
      def speech_pad_ms=: (Integer) -> Integer

      def speech_pad_ms: () -> Integer

      # Overlap in seconds when copying audio samples from speech segment.
      #
      def samples_overlap=: (Float) -> Float

      def samples_overlap: () -> Float
      def ==: (Params) -> (true | false)
    end
  end

  class Error < StandardError
    attr_reader code: Integer

    def self.new: (Integer code) -> instance
  end
end