more robust approach
Some checks failed
Security Scan / tests (push) Has been cancelled

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2025-01-10 16:22:50 +01:00
parent 72b2883757
commit 4565b87e5c

View File

@ -1,14 +1,18 @@
package openai package openai
import ( import (
"bytes"
"context" "context"
"encoding/base64" "encoding/base64"
"encoding/json" "encoding/json"
"fmt" "fmt"
"os"
"strings" "strings"
"sync" "sync"
"time" "time"
"github.com/go-audio/wav"
"github.com/go-audio/audio" "github.com/go-audio/audio"
"github.com/gofiber/fiber/v2" "github.com/gofiber/fiber/v2"
"github.com/gofiber/websocket/v2" "github.com/gofiber/websocket/v2"
@ -488,21 +492,8 @@ func updateSession(session *Session, update *Session, cl *config.BackendConfigLo
} }
const ( const (
minMicVolume = 450 sendToVADDelay = 2 * time.Second
sendToVADDelay = time.Second silenceThreshold = 2 * time.Second
)
type VADState int
const (
StateSilence VADState = iota
StateSpeaking
)
const (
// tune these thresholds to taste
SpeechFramesThreshold = 3 // must see X consecutive speech results to confirm "start"
SilenceFramesThreshold = 5 // must see X consecutive silence results to confirm "end"
) )
// handleVAD is a goroutine that listens for audio data from the client, // handleVAD is a goroutine that listens for audio data from the client,
@ -534,14 +525,18 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
copy(allAudio, session.InputAudioBuffer) copy(allAudio, session.InputAudioBuffer)
session.AudioBufferLock.Unlock() session.AudioBufferLock.Unlock()
// 2) If there's no audio at all, just continue // 2) If there's no audio at all, or just too small samples, just continue
if len(allAudio) == 0 { if len(allAudio) == 0 || len(allAudio) < 32000 {
continue continue
} }
// 3) Run VAD on the entire audio so far // 3) Run VAD on the entire audio so far
segments, err := runVAD(vadContext, session, allAudio) segments, err := runVAD(vadContext, session, allAudio)
if err != nil { if err != nil {
if err.Error() == "unexpected speech end" {
log.Debug().Msg("VAD cancelled")
continue
}
log.Error().Msgf("failed to process audio: %s", err.Error()) log.Error().Msgf("failed to process audio: %s", err.Error())
sendError(c, "processing_error", "Failed to process audio: "+err.Error(), "", "") sendError(c, "processing_error", "Failed to process audio: "+err.Error(), "", "")
// handle or log error, continue // handle or log error, continue
@ -550,7 +545,7 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
segCount := len(segments) segCount := len(segments)
if len(segments) == 0 && !speaking && time.Since(timeOfLastNewSeg) > 1*time.Second { if len(segments) == 0 && !speaking && time.Since(timeOfLastNewSeg) > silenceThreshold {
// no speech detected, and we haven't seen a new segment in > 1s // no speech detected, and we haven't seen a new segment in > 1s
// clean up input // clean up input
session.AudioBufferLock.Lock() session.AudioBufferLock.Lock()
@ -569,8 +564,11 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
} }
// 5) If speaking, but we haven't seen a new segment in > 1s => finalize // 5) If speaking, but we haven't seen a new segment in > 1s => finalize
if speaking && time.Since(timeOfLastNewSeg) > 1*time.Second { if speaking && time.Since(timeOfLastNewSeg) > sendToVADDelay {
log.Debug().Msgf("Detected end of speech segment") log.Debug().Msgf("Detected end of speech segment")
session.AudioBufferLock.Lock()
session.InputAudioBuffer = nil
session.AudioBufferLock.Unlock()
// user has presumably stopped talking // user has presumably stopped talking
commitUtterance(allAudio, cfg, evaluator, session, conv, c) commitUtterance(allAudio, cfg, evaluator, session, conv, c)
// reset state // reset state
@ -608,18 +606,38 @@ func commitUtterance(utt []byte, cfg *config.BackendConfig, evaluator *templates
Item: item, Item: item,
}) })
// Optionally trigger the response generation // save chunk to disk
f, err := os.CreateTemp("", "audio-*.wav")
if err != nil {
log.Error().Msgf("failed to create temp file: %s", err.Error())
return
}
defer f.Close()
//defer os.Remove(f.Name())
log.Debug().Msgf("Writing to %s\n", f.Name())
f.Write(utt)
f.Sync()
// trigger the response generation
generateResponse(cfg, evaluator, session, conv, ResponseCreate{}, c, websocket.TextMessage) generateResponse(cfg, evaluator, session, conv, ResponseCreate{}, c, websocket.TextMessage)
} }
// runVAD is a helper that calls your model's VAD method, returning // runVAD is a helper that calls the model's VAD method, returning
// true if it detects speech, false if it detects silence // true if it detects speech, false if it detects silence
func runVAD(ctx context.Context, session *Session, chunk []byte) ([]*proto.VADSegment, error) { func runVAD(ctx context.Context, session *Session, chunk []byte) ([]*proto.VADSegment, error) {
adata := sound.BytesToInt16sLE(chunk) adata := sound.BytesToInt16sLE(chunk)
// Resample from 24kHz to 16kHz // Resample from 24kHz to 16kHz
// adata = sound.ResampleInt16(adata, 24000, 16000) adata = sound.ResampleInt16(adata, 24000, 16000)
dec := wav.NewDecoder(bytes.NewReader(chunk))
dur, err := dec.Duration()
if err != nil {
fmt.Printf("failed to get duration: %s\n", err)
}
fmt.Printf("duration: %s\n", dur)
soundIntBuffer := &audio.IntBuffer{ soundIntBuffer := &audio.IntBuffer{
Format: &audio.Format{SampleRate: 16000, NumChannels: 1}, Format: &audio.Format{SampleRate: 16000, NumChannels: 1},