more robust approach

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-09 20:31:09 +00:00 · 2025-01-10 16:22:50 +01:00 · 2025-01-10 16:22:50 +01:00 · 4565b87e5c
commit 4565b87e5c
parent 72b2883757
1 changed files with 40 additions and 22 deletions
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@ -1,14 +1,18 @@
 package openai
 import (
 	"bytes"
 	"context"
 	"encoding/base64"
 	"encoding/json"
 	"fmt"
 	"os"
 	"strings"
 	"sync"
 	"time"
 	"github.com/go-audio/wav"
 	"github.com/go-audio/audio"
 	"github.com/gofiber/fiber/v2"
 	"github.com/gofiber/websocket/v2"
@ -488,21 +492,8 @@ func updateSession(session *Session, update *Session, cl *config.BackendConfigLo
 }
 const (
-	minMicVolume   = 450
+	sendToVADDelay   = 2 * time.Second
-	sendToVADDelay = time.Second
+	silenceThreshold = 2 * time.Second
 )
 type VADState int
 const (
 	StateSilence VADState = iota
 	StateSpeaking
 )
 const (
 	// tune these thresholds to taste
 	SpeechFramesThreshold  = 3 // must see X consecutive speech results to confirm "start"
 	SilenceFramesThreshold = 5 // must see X consecutive silence results to confirm "end"
 )
 // handleVAD is a goroutine that listens for audio data from the client,
@ -534,14 +525,18 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
 			copy(allAudio, session.InputAudioBuffer)
 			session.AudioBufferLock.Unlock()
-			// 2) If there's no audio at all, just continue
+			// 2) If there's no audio at all, or just too small samples, just continue
-			if len(allAudio) == 0 {
+			if len(allAudio) == 0 || len(allAudio) < 32000 {
 				continue
 			}
 			// 3) Run VAD on the entire audio so far
 			segments, err := runVAD(vadContext, session, allAudio)
 			if err != nil {
 				if err.Error() == "unexpected speech end" {
 					log.Debug().Msg("VAD cancelled")
 					continue
 				}
 				log.Error().Msgf("failed to process audio: %s", err.Error())
 				sendError(c, "processing_error", "Failed to process audio: "+err.Error(), "", "")
 				// handle or log error, continue
@ -550,7 +545,7 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
 			segCount := len(segments)
-			if len(segments) == 0 && !speaking && time.Since(timeOfLastNewSeg) > 1*time.Second {
+			if len(segments) == 0 && !speaking && time.Since(timeOfLastNewSeg) > silenceThreshold {
 				// no speech detected, and we haven't seen a new segment in > 1s
 				// clean up input
 				session.AudioBufferLock.Lock()
@ -569,8 +564,11 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
 			}
 			// 5) If speaking, but we haven't seen a new segment in > 1s => finalize
-			if speaking && time.Since(timeOfLastNewSeg) > 1*time.Second {
+			if speaking && time.Since(timeOfLastNewSeg) > sendToVADDelay {
 				log.Debug().Msgf("Detected end of speech segment")
 				session.AudioBufferLock.Lock()
 				session.InputAudioBuffer = nil
 				session.AudioBufferLock.Unlock()
 				// user has presumably stopped talking
 				commitUtterance(allAudio, cfg, evaluator, session, conv, c)
 				// reset state
@ -608,18 +606,38 @@ func commitUtterance(utt []byte, cfg *config.BackendConfig, evaluator *templates
 		Item: item,
 	})
-	// Optionally trigger the response generation
+	// save chunk to disk
 	f, err := os.CreateTemp("", "audio-*.wav")
 	if err != nil {
 		log.Error().Msgf("failed to create temp file: %s", err.Error())
 		return
 	}
 	defer f.Close()
 	//defer os.Remove(f.Name())
 	log.Debug().Msgf("Writing to %s\n", f.Name())
 	f.Write(utt)
 	f.Sync()
 	// trigger the response generation
 	generateResponse(cfg, evaluator, session, conv, ResponseCreate{}, c, websocket.TextMessage)
 }
-// runVAD is a helper that calls your model's VAD method, returning
+// runVAD is a helper that calls the model's VAD method, returning
 // true if it detects speech, false if it detects silence
 func runVAD(ctx context.Context, session *Session, chunk []byte) ([]*proto.VADSegment, error) {
 	adata := sound.BytesToInt16sLE(chunk)
 	// Resample from 24kHz to 16kHz
-	//	adata = sound.ResampleInt16(adata, 24000, 16000)
+	adata = sound.ResampleInt16(adata, 24000, 16000)
 	dec := wav.NewDecoder(bytes.NewReader(chunk))
 	dur, err := dec.Duration()
 	if err != nil {
 		fmt.Printf("failed to get duration: %s\n", err)
 	}
 	fmt.Printf("duration: %s\n", dur)
 	soundIntBuffer := &audio.IntBuffer{
 		Format: &audio.Format{SampleRate: 16000, NumChannels: 1},