mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-04-28 14:59:43 +00:00
Some checks failed
Bindings Tests (Ruby) / ubuntu-22 (push) Waiting to run
CI / determine-tag (push) Waiting to run
CI / ubuntu-22 (linux/amd64) (push) Waiting to run
CI / ubuntu-22 (linux/ppc64le) (push) Waiting to run
CI / ubuntu-22-arm64 (linux/arm64) (push) Waiting to run
CI / ubuntu-22-arm-v7 (linux/arm/v7) (push) Waiting to run
CI / macOS-latest (generic/platform=iOS) (push) Waiting to run
CI / macOS-latest (generic/platform=macOS) (push) Waiting to run
CI / macOS-latest (generic/platform=tvOS) (push) Waiting to run
CI / ubuntu-22-gcc (linux/amd64, Debug) (push) Waiting to run
CI / ubuntu-22-gcc (linux/amd64, Release) (push) Waiting to run
CI / ubuntu-22-gcc (linux/ppc64le, Debug) (push) Waiting to run
CI / ubuntu-22-gcc (linux/ppc64le, Release) (push) Waiting to run
CI / ubuntu-22-gcc-arm64 (linux/arm64, Debug) (push) Waiting to run
CI / ubuntu-22-gcc-arm64 (linux/arm64, Release) (push) Waiting to run
CI / ubuntu-22-gcc-arm-v7 (linux/arm/v7, Debug) (push) Waiting to run
CI / ubuntu-22-gcc-arm-v7 (linux/arm/v7, Release) (push) Waiting to run
CI / ubuntu-22-clang (linux/amd64, Debug) (push) Waiting to run
CI / ubuntu-22-clang (linux/amd64, Release) (push) Waiting to run
CI / ubuntu-22-clang (linux/arm64, Debug) (push) Waiting to run
CI / ubuntu-22-clang (linux/arm64, Release) (push) Waiting to run
CI / ubuntu-22-clang (linux/ppc64le, Debug) (push) Waiting to run
CI / ubuntu-22-clang (linux/ppc64le, Release) (push) Waiting to run
CI / ubuntu-22-gcc-sanitized (linux/amd64, ADDRESS) (push) Waiting to run
CI / ubuntu-22-gcc-sanitized (linux/amd64, THREAD) (push) Waiting to run
CI / ubuntu-22-gcc-sanitized (linux/amd64, UNDEFINED) (push) Waiting to run
CI / ubuntu-22-cmake-sycl (linux/amd64, icx, icpx, ON) (push) Waiting to run
CI / ubuntu-22-cmake-sycl (linux/arm/v7, icx, icpx, ON) (push) Waiting to run
CI / ubuntu-22-cmake-sycl (linux/arm64, icx, icpx, ON) (push) Waiting to run
CI / ubuntu-22-cmake-sycl (linux/ppc64le, icx, icpx, ON) (push) Waiting to run
CI / ubuntu-22-cmake-sycl-fp16 (linux/amd64, icx, icpx, ON) (push) Waiting to run
CI / ubuntu-22-cmake-sycl-fp16 (linux/arm/v7, icx, icpx, ON) (push) Waiting to run
CI / ubuntu-22-cmake-sycl-fp16 (linux/arm64, icx, icpx, ON) (push) Waiting to run
CI / ubuntu-22-cmake-sycl-fp16 (linux/ppc64le, icx, icpx, ON) (push) Waiting to run
CI / windows-msys2 (Release, clang-x86_64, CLANG64) (push) Waiting to run
CI / windows-msys2 (Release, ucrt-x86_64, UCRT64) (push) Waiting to run
CI / windows (Win32, Release, win32-x86, x86, 2.28.5, ON) (push) Waiting to run
CI / windows (x64, Release, win32-x86-64, x64, 2.28.5, ON) (push) Waiting to run
CI / windows-blas (Win32, ON, Release, x86, 2.28.5, ON) (push) Waiting to run
CI / windows-blas (x64, ON, Release, x64, 2.28.5, ON) (push) Waiting to run
CI / windows-cublas (x64, Release, ON, 11.8.0, ON, 2.28.5) (push) Waiting to run
CI / windows-cublas (x64, Release, ON, 12.2.0, ON, 2.28.5) (push) Waiting to run
CI / emscripten (Release) (push) Waiting to run
CI / ios-xcode-build (Release) (push) Blocked by required conditions
CI / android (push) Waiting to run
CI / quantize (push) Waiting to run
CI / release (push) Blocked by required conditions
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/main.Dockerfile platform:linux/amd64 tag:main]) (push) Waiting to run
Bindings Tests (Go) / ubuntu-22 (push) Has been cancelled
Adding in DetectedLanguage(), a function to retrieve the detected language that's populated by processing audio. Also adding in a unit test to test the success. Co-authored-by: Amanda Der Bedrosian <aderbedrosian@sdl.com>
110 lines
3.7 KiB
Go
110 lines
3.7 KiB
Go
package whisper
|
|
|
|
import (
|
|
"io"
|
|
"time"
|
|
)
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// TYPES
|
|
|
|
// SegmentCallback is the callback function for processing segments in real
|
|
// time. It is called during the Process function
|
|
type SegmentCallback func(Segment)
|
|
|
|
// ProgressCallback is the callback function for reporting progress during
// processing. It is called during the Process function.
//
// The int argument carries the progress value. Its scale is not defined in
// this file (presumably a percentage; confirm against the implementation).
type ProgressCallback func(int)
|
|
|
|
// EncoderBeginCallback is the callback function for checking if we want to
// continue processing. It is called during the Process function.
//
// NOTE(review): presumably returning true continues and false aborts; this
// file does not state the convention, so confirm against the implementation.
type EncoderBeginCallback func() bool
|
|
|
|
// Model is the interface to a whisper model. Create a new model with the
|
|
// function whisper.New(string)
|
|
type Model interface {
|
|
io.Closer
|
|
|
|
// Return a new speech-to-text context.
|
|
NewContext() (Context, error)
|
|
|
|
// Return true if the model is multilingual.
|
|
IsMultilingual() bool
|
|
|
|
// Return all languages supported.
|
|
Languages() []string
|
|
}
|
|
|
|
// Context is the speech recognition context.
|
|
type Context interface {
|
|
SetLanguage(string) error // Set the language to use for speech recognition, use "auto" for auto detect language.
|
|
SetTranslate(bool) // Set translate flag
|
|
IsMultilingual() bool // Return true if the model is multilingual.
|
|
Language() string // Get language
|
|
DetectedLanguage() string // Get detected language
|
|
|
|
SetOffset(time.Duration) // Set offset
|
|
SetDuration(time.Duration) // Set duration
|
|
SetThreads(uint) // Set number of threads to use
|
|
SetSplitOnWord(bool) // Set split on word flag
|
|
SetTokenThreshold(float32) // Set timestamp token probability threshold
|
|
SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
|
|
SetMaxSegmentLength(uint) // Set max segment length in characters
|
|
SetTokenTimestamps(bool) // Set token timestamps flag
|
|
SetMaxTokensPerSegment(uint) // Set max tokens per segment (0 = no limit)
|
|
SetAudioCtx(uint) // Set audio encoder context
|
|
SetMaxContext(n int) // Set maximum number of text context tokens to store
|
|
SetBeamSize(n int) // Set Beam Size
|
|
SetEntropyThold(t float32) // Set Entropy threshold
|
|
SetInitialPrompt(prompt string) // Set initial prompt
|
|
SetTemperature(t float32) // Set temperature
|
|
SetTemperatureFallback(t float32) // Set temperature incrementation
|
|
|
|
// Process mono audio data and return any errors.
|
|
// If defined, newly generated segments are passed to the
|
|
// callback function during processing.
|
|
Process([]float32, EncoderBeginCallback, SegmentCallback, ProgressCallback) error
|
|
|
|
// After process is called, return segments until the end of the stream
|
|
// is reached, when io.EOF is returned.
|
|
NextSegment() (Segment, error)
|
|
|
|
IsBEG(Token) bool // Test for "begin" token
|
|
IsSOT(Token) bool // Test for "start of transcription" token
|
|
IsEOT(Token) bool // Test for "end of transcription" token
|
|
IsPREV(Token) bool // Test for "start of prev" token
|
|
IsSOLM(Token) bool // Test for "start of lm" token
|
|
IsNOT(Token) bool // Test for "No timestamps" token
|
|
IsLANG(Token, string) bool // Test for token associated with a specific language
|
|
IsText(Token) bool // Test for text token
|
|
|
|
// Timings
|
|
PrintTimings()
|
|
ResetTimings()
|
|
|
|
SystemInfo() string
|
|
}
|
|
|
|
// Segment is the text result of a speech recognition.
|
|
type Segment struct {
|
|
// Segment Number
|
|
Num int
|
|
|
|
// Time beginning and end timestamps for the segment.
|
|
Start, End time.Duration
|
|
|
|
// The text of the segment.
|
|
Text string
|
|
|
|
// The tokens of the segment.
|
|
Tokens []Token
|
|
}
|
|
|
|
// Token is a text or special token.
type Token struct {
	// Id is the token identifier.
	Id int
	// Text is the token text.
	Text string
	// P is a float32 score attached to the token.
	// NOTE(review): presumably the token probability; confirm against the
	// code that populates it.
	P float32
	// Start and End are the timestamps for the token.
	Start, End time.Duration
}
|