go : adding features to the go-whisper example, go ci, etc (#384)

* Updated bindings so they can be used in third pary packages. * Updated makefiles to set FMA flag on optionally, for xeon E5 on Darwin * Added test script * Changes for examples * Reverted * Made the NewContext method private
2025-06-18 14:58:09 +00:00 · 2023-01-07 19:21:43 +00:00
parent f30b5d322c
commit f078a6f20e
10 changed files with 370 additions and 31 deletions
--- a/bindings/go/pkg/whisper/interface.go
+++ b/bindings/go/pkg/whisper/interface.go
@ -20,6 +20,9 @@ type Model interface {
 	// Return a new speech-to-text context.
 	NewContext() (Context, error)

+	// Return true if the model is multilingual.
+	IsMultilingual() bool
+
 	// Return all languages supported.
 	Languages() []string
 }
@ -27,8 +30,18 @@ type Model interface {
 // Context is the speach recognition context.
 type Context interface {
 	SetLanguage(string) error // Set the language to use for speech recognition.
+	SetTranslate(bool)        // Set translate flag
+	IsMultilingual() bool     // Return true if the model is multilingual.
 	Language() string         // Get language
-	SetSpeedup(bool)          // Set speedup flag
+
+	SetOffset(time.Duration)      // Set offset
+	SetDuration(time.Duration)    // Set duration
+	SetThreads(uint)              // Set number of threads to use
+	SetSpeedup(bool)              // Set speedup flag
+	SetTokenThreshold(float32)    // Set timestamp token probability threshold
+	SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
+	SetMaxSegmentLength(uint)     // Set max segment length in characters
+	SetMaxTokensPerSegment(uint)  // Set max tokens per segment (0 = no limit)

 	// Process mono audio data and return any errors.
 	// If defined, newly generated segments are passed to the
@ -38,6 +51,15 @@ type Context interface {
 	// After process is called, return segments until the end of the stream
 	// is reached, when io.EOF is returned.
 	NextSegment() (Segment, error)
+
+	IsBEG(Token) bool          // Test for "begin" token
+	IsSOT(Token) bool          // Test for "start of transcription" token
+	IsEOT(Token) bool          // Test for "end of transcription" token
+	IsPREV(Token) bool         // Test for "start of prev" token
+	IsSOLM(Token) bool         // Test for "start of lm" token
+	IsNOT(Token) bool          // Test for "No timestamps" token
+	IsLANG(Token, string) bool // Test for token associated with a specific language
+	IsText(Token) bool         // Test for text token
 }

 // Segment is the text result of a speech recognition.