mirror of https://github.com/mudler/LocalAI.git (synced 2024-12-19 20:57:54 +00:00)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
parent db14dce90f
commit e30925c555
@@ -462,12 +462,10 @@ func updateSession(session *Session, update *Session, cl *config.BackendConfigLo
const (
    minMicVolume = 450
    sendToVADDelay = time.Second
    maxWhisperSegmentDuration = time.Second * 25
    maxWhisperSegmentDuration = time.Second * 15
)

// Placeholder function to handle VAD (Voice Activity Detection)
// https://github.com/snakers4/silero-vad/tree/master/examples/go
// XXX: use session.ModelInterface for VAD or hook directly VAD runtime here?
// handle VAD (Voice Activity Detection)
func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn, done chan struct{}) {

    vadContext, cancel := context.WithCancel(context.Background())
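
For context: a handler with this signature is typically started once per realtime session in its own goroutine and torn down through the done channel. A minimal usage sketch follows (assumed wiring for illustration, not code from this commit; session, conversation and the websocket connection come from the surrounding server code):

// Hypothetical wiring of the handler shown above.
done := make(chan struct{})
go handleVAD(session, conversation, c, done)

// ... later, when the realtime session or websocket connection ends:
close(done)
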
@@ -480,6 +478,7 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn,

    audioDetected := false
    timeListening := time.Now()

    // Implement VAD logic here
    // For brevity, this is a placeholder
    // When VAD detects end of speech, generate a response
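
The two variables above drive the buffering policy implemented in the rest of the diff: audioDetected is set once VAD reports the start of speech, and timeListening records when listening for the current utterance began. A condensed restatement of that rule, using the names from the diff with the surrounding loop elided:

// Keep buffering while speech has started and the (now 15s) window is still open;
// otherwise the accumulated audio is processed or the buffer is reset.
if audioDetected && time.Since(timeListening) < maxWhisperSegmentDuration {
    // speech is ongoing: keep filling session.InputAudioBuffer
} else {
    // run VAD on the buffer and, once an end of speech is found,
    // commit it as a conversation item and generate a response
}
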
@@ -492,103 +491,18 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn,
    // Check if there's audio data to process
    session.AudioBufferLock.Lock()

    if len(session.InputAudioBuffer) > 16000 {
    if len(session.InputAudioBuffer) > 0 {

        adata := sound.BytesToInt16sLE(session.InputAudioBuffer)

        // Resample from 24kHz to 16kHz
        adata = sound.ResampleInt16(adata, 24000, 16000)

        soundIntBuffer := &audio.IntBuffer{
            Format: &audio.Format{SampleRate: 16000, NumChannels: 1},
        }
        soundIntBuffer.Data = sound.ConvertInt16ToInt(adata)

        /* if len(adata) < 16000 {
            log.Debug().Msgf("audio length too small %d", len(session.InputAudioBuffer))
            session.AudioBufferLock.Unlock()
            continue
        } */

        float32Data := soundIntBuffer.AsFloat32Buffer().Data

        resp, err := session.ModelInterface.VAD(vadContext, &proto.VADRequest{
            Audio: float32Data,
        })
        if err != nil {
            log.Error().Msgf("failed to process audio: %s", err.Error())
            sendError(c, "processing_error", "Failed to process audio: "+err.Error(), "", "")
        if audioDetected && time.Since(timeListening) < maxWhisperSegmentDuration {
            log.Debug().Msgf("VAD detected speech, but still listening")
            // audioDetected = false
            // keep listening
            session.AudioBufferLock.Unlock()
            continue
        }

        speechStart, speechEnd := float32(0), float32(0)

        /*
            volume := sound.CalculateRMS16(adata)
            if volume > minMicVolume {
                startListening = time.Now()
            }

            if time.Since(startListening) < sendToVADDelay && time.Since(startListening) < maxWhisperSegmentDuration {
                log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer))

                session.AudioBufferLock.Unlock()
                log.Debug().Msg("speech is ongoing")

                continue
            }
        */

        if len(resp.Segments) == 0 {
            log.Debug().Msg("VAD detected no speech activity")
            log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer))

            if !audioDetected {
                session.InputAudioBuffer = nil
            }
            log.Debug().Msgf("audio length(after) %d", len(session.InputAudioBuffer))

            session.AudioBufferLock.Unlock()
            continue
        }

        timeListening = time.Now()

        log.Debug().Msgf("VAD detected %d segments", len(resp.Segments))
        log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer))

        speechStart = resp.Segments[0].Start
        log.Debug().Msgf("speech starts at %0.2fs", speechStart)

        audioDetected = true

        for _, s := range resp.Segments {
            if s.End > 0 {
                log.Debug().Msgf("speech ends at %0.2fs", s.End)
                speechEnd = s.End
                audioDetected = false
            }
        }

        if speechEnd == 0 {
            log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer))

            session.AudioBufferLock.Unlock()
            log.Debug().Msg("speech is ongoing, no end found ?")
            continue
        }

        // Handle when input is too long without a voice activity (reset the buffer)
        if speechStart == 0 && speechEnd == 0 {
            // log.Debug().Msg("VAD detected no speech activity")
            session.InputAudioBuffer = nil
            session.AudioBufferLock.Unlock()
            continue
        }

        // TODO: Shall we cut the audio from speechStart and SpeechEnd?
        log.Debug().Msgf("VAD detected Start speech at: %0.2fs, End speech at: %0.2fs", speechStart, speechEnd)
        if audioDetected {
            log.Debug().Msgf("VAD detected speech that we can process")

            // Commit the audio buffer as a conversation item
            item := &Item{
@@ -620,8 +534,59 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn,
                Item: item,
            })

            audioDetected = false
            // Generate a response
            generateResponse(session, conversation, ResponseCreate{}, c, websocket.TextMessage)
            continue
        }

        adata := sound.BytesToInt16sLE(session.InputAudioBuffer)

        // Resample from 24kHz to 16kHz
        adata = sound.ResampleInt16(adata, 24000, 16000)

        soundIntBuffer := &audio.IntBuffer{
            Format: &audio.Format{SampleRate: 16000, NumChannels: 1},
        }
        soundIntBuffer.Data = sound.ConvertInt16ToInt(adata)

        /* if len(adata) < 16000 {
            log.Debug().Msgf("audio length too small %d", len(session.InputAudioBuffer))
            session.AudioBufferLock.Unlock()
            continue
        } */

        float32Data := soundIntBuffer.AsFloat32Buffer().Data

        resp, err := session.ModelInterface.VAD(vadContext, &proto.VADRequest{
            Audio: float32Data,
        })
        if err != nil {
            log.Error().Msgf("failed to process audio: %s", err.Error())
            sendError(c, "processing_error", "Failed to process audio: "+err.Error(), "", "")
            session.AudioBufferLock.Unlock()
            continue
        }

        if len(resp.Segments) == 0 {
            log.Debug().Msg("VAD detected no speech activity")
            log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer))

            if !audioDetected {
                session.InputAudioBuffer = nil
            }
            log.Debug().Msgf("audio length(after) %d", len(session.InputAudioBuffer))

            session.AudioBufferLock.Unlock()
            continue
        }

        if !audioDetected {
            timeListening = time.Now()
        }
        audioDetected = true

        session.AudioBufferLock.Unlock()
    } else {
        session.AudioBufferLock.Unlock()
    }
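
For readers unfamiliar with the audio path above: the input buffer holds 24kHz 16-bit little-endian PCM, which is decoded to int16 samples, resampled to 16kHz, and converted to float32 before being sent to the VAD backend via session.ModelInterface.VAD. The helpers below are an illustrative sketch of what functions such as sound.BytesToInt16sLE and sound.ResampleInt16 are assumed to do; they are not the actual implementations from LocalAI's sound package:

// Illustrative sketch only: assumed behaviour of the sound helpers used above.
package sound

import "encoding/binary"

// BytesToInt16sLE interprets raw bytes as little-endian 16-bit PCM samples.
func BytesToInt16sLE(b []byte) []int16 {
    out := make([]int16, len(b)/2)
    for i := range out {
        out[i] = int16(binary.LittleEndian.Uint16(b[2*i:]))
    }
    return out
}

// ResampleInt16 is a naive nearest-neighbour resampler (e.g. 24000 -> 16000 Hz).
// A production resampler would low-pass filter and interpolate instead.
func ResampleInt16(in []int16, fromRate, toRate int) []int16 {
    n := len(in) * toRate / fromRate
    out := make([]int16, n)
    for i := 0; i < n; i++ {
        out[i] = in[i*fromRate/toRate]
    }
    return out
}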