2022-12-20 06:54:33 +00:00
|
|
|
package whisper
|
|
|
|
|
|
|
|
import (
|
2023-01-28 16:44:56 +00:00
|
|
|
"fmt"
|
2022-12-20 06:54:33 +00:00
|
|
|
"io"
|
2023-01-28 16:44:56 +00:00
|
|
|
"runtime"
|
2022-12-20 06:54:33 +00:00
|
|
|
"strings"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
// Bindings
|
|
|
|
whisper "github.com/ggerganov/whisper.cpp/bindings/go"
|
|
|
|
)
|
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
// TYPES
|
|
|
|
|
|
|
|
type context struct {
|
|
|
|
n int
|
|
|
|
model *model
|
|
|
|
params whisper.Params
|
|
|
|
}
|
|
|
|
|
|
|
|
// Make sure context adheres to the interface
|
|
|
|
var _ Context = (*context)(nil)
|
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
// LIFECYCLE
|
|
|
|
|
2023-01-07 19:21:43 +00:00
|
|
|
func newContext(model *model, params whisper.Params) (Context, error) {
|
2022-12-20 06:54:33 +00:00
|
|
|
context := new(context)
|
|
|
|
context.model = model
|
|
|
|
context.params = params
|
|
|
|
|
|
|
|
// Return success
|
|
|
|
return context, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
// PUBLIC METHODS
|
|
|
|
|
|
|
|
// Set the language to use for speech recognition.
|
|
|
|
func (context *context) SetLanguage(lang string) error {
|
|
|
|
if context.model.ctx == nil {
|
|
|
|
return ErrInternalAppError
|
|
|
|
}
|
2023-01-07 19:21:43 +00:00
|
|
|
if !context.model.IsMultilingual() {
|
|
|
|
return ErrModelNotMultilingual
|
|
|
|
}
|
2023-02-04 07:09:27 +00:00
|
|
|
|
|
|
|
if lang == "auto" {
|
|
|
|
context.params.SetLanguage(-1)
|
|
|
|
} else if id := context.model.ctx.Whisper_lang_id(lang); id < 0 {
|
2022-12-20 06:54:33 +00:00
|
|
|
return ErrUnsupportedLanguage
|
|
|
|
} else if err := context.params.SetLanguage(id); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
// Return success
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2023-01-07 19:21:43 +00:00
|
|
|
func (context *context) IsMultilingual() bool {
|
|
|
|
return context.model.IsMultilingual()
|
|
|
|
}
|
|
|
|
|
2022-12-20 06:54:33 +00:00
|
|
|
// Get language
|
|
|
|
func (context *context) Language() string {
|
2023-02-04 07:09:27 +00:00
|
|
|
id := context.params.Language()
|
|
|
|
if id == -1 {
|
|
|
|
return "auto"
|
|
|
|
}
|
2022-12-20 06:54:33 +00:00
|
|
|
return whisper.Whisper_lang_str(context.params.Language())
|
|
|
|
}
|
|
|
|
|
2023-01-07 19:21:43 +00:00
|
|
|
// Set translate flag
|
|
|
|
func (context *context) SetTranslate(v bool) {
|
|
|
|
context.params.SetTranslate(v)
|
|
|
|
}
|
|
|
|
|
2022-12-20 06:54:33 +00:00
|
|
|
// Set speedup flag
|
|
|
|
func (context *context) SetSpeedup(v bool) {
|
|
|
|
context.params.SetSpeedup(v)
|
|
|
|
}
|
|
|
|
|
2023-07-25 16:10:12 +00:00
|
|
|
func (context *context) SetSplitOnWord(v bool) {
|
2023-10-15 12:35:06 +00:00
|
|
|
context.params.SetSplitOnWord(v)
|
2023-07-25 16:10:12 +00:00
|
|
|
}
|
|
|
|
|
2023-01-07 19:21:43 +00:00
|
|
|
// Set number of threads to use
|
|
|
|
func (context *context) SetThreads(v uint) {
|
|
|
|
context.params.SetThreads(int(v))
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set time offset
|
|
|
|
func (context *context) SetOffset(v time.Duration) {
|
|
|
|
context.params.SetOffset(int(v.Milliseconds()))
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set duration of audio to process
|
|
|
|
func (context *context) SetDuration(v time.Duration) {
|
2023-07-04 13:13:25 +00:00
|
|
|
context.params.SetDuration(int(v.Milliseconds()))
|
2023-01-07 19:21:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Set timestamp token probability threshold (~0.01)
|
|
|
|
func (context *context) SetTokenThreshold(t float32) {
|
|
|
|
context.params.SetTokenThreshold(t)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set timestamp token sum probability threshold (~0.01)
|
|
|
|
func (context *context) SetTokenSumThreshold(t float32) {
|
|
|
|
context.params.SetTokenSumThreshold(t)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set max segment length in characters
|
|
|
|
func (context *context) SetMaxSegmentLength(n uint) {
|
|
|
|
context.params.SetMaxSegmentLength(int(n))
|
|
|
|
}
|
|
|
|
|
2023-04-14 15:52:10 +00:00
|
|
|
// Set token timestamps flag
|
|
|
|
func (context *context) SetTokenTimestamps(b bool) {
|
|
|
|
context.params.SetTokenTimestamps(b)
|
|
|
|
}
|
|
|
|
|
2023-01-07 19:21:43 +00:00
|
|
|
// Set max tokens per segment (0 = no limit)
|
|
|
|
func (context *context) SetMaxTokensPerSegment(n uint) {
|
|
|
|
context.params.SetMaxTokensPerSegment(int(n))
|
|
|
|
}
|
|
|
|
|
2023-10-15 12:35:06 +00:00
|
|
|
// Set audio encoder context
|
|
|
|
func (context *context) SetAudioCtx(n uint) {
|
|
|
|
context.params.SetAudioCtx(int(n))
|
|
|
|
}
|
|
|
|
|
2024-01-12 11:44:50 +00:00
|
|
|
// Set initial prompt
|
|
|
|
func (context *context) SetInitialPrompt(prompt string) {
|
|
|
|
context.params.SetInitialPrompt(prompt)
|
|
|
|
}
|
|
|
|
|
2023-01-25 16:57:30 +00:00
|
|
|
// ResetTimings resets the mode timings. Should be called before processing
|
|
|
|
func (context *context) ResetTimings() {
|
|
|
|
context.model.ctx.Whisper_reset_timings()
|
|
|
|
}
|
|
|
|
|
|
|
|
// PrintTimings prints the model timings to stdout.
|
|
|
|
func (context *context) PrintTimings() {
|
|
|
|
context.model.ctx.Whisper_print_timings()
|
|
|
|
}
|
|
|
|
|
2023-01-28 16:44:56 +00:00
|
|
|
// SystemInfo returns the system information
|
|
|
|
func (context *context) SystemInfo() string {
|
|
|
|
return fmt.Sprintf("system_info: n_threads = %d / %d | %s\n",
|
|
|
|
context.params.Threads(),
|
|
|
|
runtime.NumCPU(),
|
|
|
|
whisper.Whisper_print_system_info(),
|
|
|
|
)
|
|
|
|
}
|
|
|
|
|
2023-01-26 23:14:20 +00:00
|
|
|
// Use mel data at offset_ms to try and auto-detect the spoken language
|
|
|
|
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
|
|
|
|
// Returns the probabilities of all languages.
|
|
|
|
func (context *context) WhisperLangAutoDetect(offset_ms int, n_threads int) ([]float32, error) {
|
|
|
|
langProbs, err := context.model.ctx.Whisper_lang_auto_detect(offset_ms, n_threads)
|
|
|
|
if err != nil {
|
2023-01-28 16:44:56 +00:00
|
|
|
return nil, err
|
2023-01-26 23:14:20 +00:00
|
|
|
}
|
|
|
|
return langProbs, nil
|
|
|
|
}
|
|
|
|
|
2022-12-20 06:54:33 +00:00
|
|
|
// Process new sample data and return any errors
|
2023-06-25 11:07:55 +00:00
|
|
|
func (context *context) Process(
|
|
|
|
data []float32,
|
|
|
|
callNewSegment SegmentCallback,
|
|
|
|
callProgress ProgressCallback,
|
|
|
|
) error {
|
2022-12-20 06:54:33 +00:00
|
|
|
if context.model.ctx == nil {
|
|
|
|
return ErrInternalAppError
|
|
|
|
}
|
|
|
|
// If the callback is defined then we force on single_segment mode
|
2023-06-25 11:34:10 +00:00
|
|
|
if callNewSegment != nil {
|
2022-12-20 06:54:33 +00:00
|
|
|
context.params.SetSingleSegment(true)
|
|
|
|
}
|
|
|
|
|
|
|
|
// We don't do parallel processing at the moment
|
|
|
|
processors := 0
|
|
|
|
if processors > 1 {
|
|
|
|
if err := context.model.ctx.Whisper_full_parallel(context.params, data, processors, nil, func(new int) {
|
2023-06-25 11:07:55 +00:00
|
|
|
if callNewSegment != nil {
|
2022-12-20 06:54:33 +00:00
|
|
|
num_segments := context.model.ctx.Whisper_full_n_segments()
|
|
|
|
s0 := num_segments - new
|
|
|
|
for i := s0; i < num_segments; i++ {
|
2023-06-25 11:07:55 +00:00
|
|
|
callNewSegment(toSegment(context.model.ctx, i))
|
2022-12-20 06:54:33 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
} else if err := context.model.ctx.Whisper_full(context.params, data, nil, func(new int) {
|
2023-06-25 11:07:55 +00:00
|
|
|
if callNewSegment != nil {
|
2022-12-20 06:54:33 +00:00
|
|
|
num_segments := context.model.ctx.Whisper_full_n_segments()
|
|
|
|
s0 := num_segments - new
|
|
|
|
for i := s0; i < num_segments; i++ {
|
2023-06-25 11:07:55 +00:00
|
|
|
callNewSegment(toSegment(context.model.ctx, i))
|
2022-12-20 06:54:33 +00:00
|
|
|
}
|
|
|
|
}
|
2023-06-25 11:07:55 +00:00
|
|
|
}, func(progress int) {
|
|
|
|
if callProgress != nil {
|
|
|
|
callProgress(progress)
|
|
|
|
}
|
2022-12-20 06:54:33 +00:00
|
|
|
}); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return success
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the next segment of tokens
|
|
|
|
func (context *context) NextSegment() (Segment, error) {
|
|
|
|
if context.model.ctx == nil {
|
|
|
|
return Segment{}, ErrInternalAppError
|
|
|
|
}
|
|
|
|
if context.n >= context.model.ctx.Whisper_full_n_segments() {
|
|
|
|
return Segment{}, io.EOF
|
|
|
|
}
|
|
|
|
|
|
|
|
// Populate result
|
|
|
|
result := toSegment(context.model.ctx, context.n)
|
|
|
|
|
|
|
|
// Increment the cursor
|
|
|
|
context.n++
|
|
|
|
|
|
|
|
// Return success
|
|
|
|
return result, nil
|
|
|
|
}
|
|
|
|
|
2023-01-07 19:21:43 +00:00
|
|
|
// Test for text tokens
|
|
|
|
func (context *context) IsText(t Token) bool {
|
|
|
|
switch {
|
|
|
|
case context.IsBEG(t):
|
|
|
|
return false
|
|
|
|
case context.IsSOT(t):
|
|
|
|
return false
|
|
|
|
case whisper.Token(t.Id) >= context.model.ctx.Whisper_token_eot():
|
|
|
|
return false
|
|
|
|
case context.IsPREV(t):
|
|
|
|
return false
|
|
|
|
case context.IsSOLM(t):
|
|
|
|
return false
|
|
|
|
case context.IsNOT(t):
|
|
|
|
return false
|
|
|
|
default:
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Test for "begin" token
|
|
|
|
func (context *context) IsBEG(t Token) bool {
|
|
|
|
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_beg()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Test for "start of transcription" token
|
|
|
|
func (context *context) IsSOT(t Token) bool {
|
|
|
|
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_sot()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Test for "end of transcription" token
|
|
|
|
func (context *context) IsEOT(t Token) bool {
|
|
|
|
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_eot()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Test for "start of prev" token
|
|
|
|
func (context *context) IsPREV(t Token) bool {
|
|
|
|
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_prev()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Test for "start of lm" token
|
|
|
|
func (context *context) IsSOLM(t Token) bool {
|
|
|
|
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_solm()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Test for "No timestamps" token
|
|
|
|
func (context *context) IsNOT(t Token) bool {
|
|
|
|
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_not()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Test for token associated with a specific language
|
|
|
|
func (context *context) IsLANG(t Token, lang string) bool {
|
|
|
|
if id := context.model.ctx.Whisper_lang_id(lang); id >= 0 {
|
|
|
|
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_lang(id)
|
|
|
|
} else {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-12-20 06:54:33 +00:00
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
// PRIVATE METHODS
|
|
|
|
|
|
|
|
func toSegment(ctx *whisper.Context, n int) Segment {
|
|
|
|
return Segment{
|
|
|
|
Num: n,
|
|
|
|
Text: strings.TrimSpace(ctx.Whisper_full_get_segment_text(n)),
|
|
|
|
Start: time.Duration(ctx.Whisper_full_get_segment_t0(n)) * time.Millisecond * 10,
|
|
|
|
End: time.Duration(ctx.Whisper_full_get_segment_t1(n)) * time.Millisecond * 10,
|
|
|
|
Tokens: toTokens(ctx, n),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func toTokens(ctx *whisper.Context, n int) []Token {
|
|
|
|
result := make([]Token, ctx.Whisper_full_n_tokens(n))
|
|
|
|
for i := 0; i < len(result); i++ {
|
2023-04-14 15:52:10 +00:00
|
|
|
data := ctx.Whisper_full_get_token_data(n, i)
|
|
|
|
|
2022-12-20 06:54:33 +00:00
|
|
|
result[i] = Token{
|
2023-04-14 15:52:10 +00:00
|
|
|
Id: int(ctx.Whisper_full_get_token_id(n, i)),
|
|
|
|
Text: ctx.Whisper_full_get_token_text(n, i),
|
|
|
|
P: ctx.Whisper_full_get_token_p(n, i),
|
|
|
|
Start: time.Duration(data.T0()) * time.Millisecond * 10,
|
|
|
|
End: time.Duration(data.T1()) * time.Millisecond * 10,
|
2022-12-20 06:54:33 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return result
|
|
|
|
}
|