WIP
Some checks failed
Security Scan / tests (push) Has been cancelled

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2024-11-20 10:25:29 +01:00
parent db14dce90f
commit e30925c555

View File

@@ -462,12 +462,10 @@ func updateSession(session *Session, update *Session, cl *config.BackendConfigLo
const ( const (
minMicVolume = 450 minMicVolume = 450
sendToVADDelay = time.Second sendToVADDelay = time.Second
maxWhisperSegmentDuration = time.Second * 25 maxWhisperSegmentDuration = time.Second * 15
) )
// Placeholder function to handle VAD (Voice Activity Detection) // handle VAD (Voice Activity Detection)
// https://github.com/snakers4/silero-vad/tree/master/examples/go
// XXX: use session.ModelInterface for VAD or hook directly VAD runtime here?
func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn, done chan struct{}) { func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn, done chan struct{}) {
vadContext, cancel := context.WithCancel(context.Background()) vadContext, cancel := context.WithCancel(context.Background())
@@ -480,6 +478,7 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn,
audioDetected := false audioDetected := false
timeListening := time.Now() timeListening := time.Now()
// Implement VAD logic here // Implement VAD logic here
// For brevity, this is a placeholder // For brevity, this is a placeholder
// When VAD detects end of speech, generate a response // When VAD detects end of speech, generate a response
@@ -492,103 +491,18 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn,
// Check if there's audio data to process // Check if there's audio data to process
session.AudioBufferLock.Lock() session.AudioBufferLock.Lock()
if len(session.InputAudioBuffer) > 16000 { if len(session.InputAudioBuffer) > 0 {
adata := sound.BytesToInt16sLE(session.InputAudioBuffer) if audioDetected && time.Since(timeListening) < maxWhisperSegmentDuration {
log.Debug().Msgf("VAD detected speech, but still listening")
// Resample from 24kHz to 16kHz // audioDetected = false
adata = sound.ResampleInt16(adata, 24000, 16000) // keep listening
soundIntBuffer := &audio.IntBuffer{
Format: &audio.Format{SampleRate: 16000, NumChannels: 1},
}
soundIntBuffer.Data = sound.ConvertInt16ToInt(adata)
/* if len(adata) < 16000 {
log.Debug().Msgf("audio length too small %d", len(session.InputAudioBuffer))
session.AudioBufferLock.Unlock()
continue
} */
float32Data := soundIntBuffer.AsFloat32Buffer().Data
resp, err := session.ModelInterface.VAD(vadContext, &proto.VADRequest{
Audio: float32Data,
})
if err != nil {
log.Error().Msgf("failed to process audio: %s", err.Error())
sendError(c, "processing_error", "Failed to process audio: "+err.Error(), "", "")
session.AudioBufferLock.Unlock() session.AudioBufferLock.Unlock()
continue continue
} }
speechStart, speechEnd := float32(0), float32(0) if audioDetected {
log.Debug().Msgf("VAD detected speech that we can process")
/*
volume := sound.CalculateRMS16(adata)
if volume > minMicVolume {
startListening = time.Now()
}
if time.Since(startListening) < sendToVADDelay && time.Since(startListening) < maxWhisperSegmentDuration {
log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer))
session.AudioBufferLock.Unlock()
log.Debug().Msg("speech is ongoing")
continue
}
*/
if len(resp.Segments) == 0 {
log.Debug().Msg("VAD detected no speech activity")
log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer))
if !audioDetected {
session.InputAudioBuffer = nil
}
log.Debug().Msgf("audio length(after) %d", len(session.InputAudioBuffer))
session.AudioBufferLock.Unlock()
continue
}
timeListening = time.Now()
log.Debug().Msgf("VAD detected %d segments", len(resp.Segments))
log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer))
speechStart = resp.Segments[0].Start
log.Debug().Msgf("speech starts at %0.2fs", speechStart)
audioDetected = true
for _, s := range resp.Segments {
if s.End > 0 {
log.Debug().Msgf("speech ends at %0.2fs", s.End)
speechEnd = s.End
audioDetected = false
}
}
if speechEnd == 0 {
log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer))
session.AudioBufferLock.Unlock()
log.Debug().Msg("speech is ongoing, no end found ?")
continue
}
// Handle when input is too long without a voice activity (reset the buffer)
if speechStart == 0 && speechEnd == 0 {
// log.Debug().Msg("VAD detected no speech activity")
session.InputAudioBuffer = nil
session.AudioBufferLock.Unlock()
continue
}
// TODO: Shall we cut the audio from speechStart and SpeechEnd?
log.Debug().Msgf("VAD detected Start speech at: %0.2fs, End speech at: %0.2fs", speechStart, speechEnd)
// Commit the audio buffer as a conversation item // Commit the audio buffer as a conversation item
item := &Item{ item := &Item{
@@ -620,8 +534,59 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn,
Item: item, Item: item,
}) })
audioDetected = false
// Generate a response // Generate a response
generateResponse(session, conversation, ResponseCreate{}, c, websocket.TextMessage) generateResponse(session, conversation, ResponseCreate{}, c, websocket.TextMessage)
continue
}
adata := sound.BytesToInt16sLE(session.InputAudioBuffer)
// Resample from 24kHz to 16kHz
adata = sound.ResampleInt16(adata, 24000, 16000)
soundIntBuffer := &audio.IntBuffer{
Format: &audio.Format{SampleRate: 16000, NumChannels: 1},
}
soundIntBuffer.Data = sound.ConvertInt16ToInt(adata)
/* if len(adata) < 16000 {
log.Debug().Msgf("audio length too small %d", len(session.InputAudioBuffer))
session.AudioBufferLock.Unlock()
continue
} */
float32Data := soundIntBuffer.AsFloat32Buffer().Data
resp, err := session.ModelInterface.VAD(vadContext, &proto.VADRequest{
Audio: float32Data,
})
if err != nil {
log.Error().Msgf("failed to process audio: %s", err.Error())
sendError(c, "processing_error", "Failed to process audio: "+err.Error(), "", "")
session.AudioBufferLock.Unlock()
continue
}
if len(resp.Segments) == 0 {
log.Debug().Msg("VAD detected no speech activity")
log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer))
if !audioDetected {
session.InputAudioBuffer = nil
}
log.Debug().Msgf("audio length(after) %d", len(session.InputAudioBuffer))
session.AudioBufferLock.Unlock()
continue
}
if !audioDetected {
timeListening = time.Now()
}
audioDetected = true
session.AudioBufferLock.Unlock()
} else { } else {
session.AudioBufferLock.Unlock() session.AudioBufferLock.Unlock()
} }