Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2024-11-19 19:08:04 +01:00
parent b1f57a90fe
commit 78d45e5f69

View File

@@ -478,6 +478,8 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn,
cancel() cancel()
}() }()
audioDetected := false
timeListening := time.Now()
// Implement VAD logic here // Implement VAD logic here
// For brevity, this is a placeholder // For brevity, this is a placeholder
// When VAD detects end of speech, generate a response // When VAD detects end of speech, generate a response
@@ -489,10 +491,14 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn,
default: default:
// Check if there's audio data to process // Check if there's audio data to process
session.AudioBufferLock.Lock() session.AudioBufferLock.Lock()
if len(session.InputAudioBuffer) > 16000 { if len(session.InputAudioBuffer) > 16000 {
adata := sound.BytesToInt16sLE(session.InputAudioBuffer) adata := sound.BytesToInt16sLE(session.InputAudioBuffer)
// Resample from 24kHz to 16kHz
adata = sound.ResampleInt16(adata, 24000, 16000)
soundIntBuffer := &audio.IntBuffer{ soundIntBuffer := &audio.IntBuffer{
Format: &audio.Format{SampleRate: 16000, NumChannels: 1}, Format: &audio.Format{SampleRate: 16000, NumChannels: 1},
} }
@@ -538,23 +544,30 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn,
log.Debug().Msg("VAD detected no speech activity") log.Debug().Msg("VAD detected no speech activity")
log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer)) log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer))
if !audioDetected {
session.InputAudioBuffer = nil session.InputAudioBuffer = nil
}
log.Debug().Msgf("audio length(after) %d", len(session.InputAudioBuffer)) log.Debug().Msgf("audio length(after) %d", len(session.InputAudioBuffer))
session.AudioBufferLock.Unlock() session.AudioBufferLock.Unlock()
continue continue
} }
timeListening = time.Now()
log.Debug().Msgf("VAD detected %d segments", len(resp.Segments)) log.Debug().Msgf("VAD detected %d segments", len(resp.Segments))
log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer)) log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer))
speechStart = resp.Segments[0].Start speechStart = resp.Segments[0].Start
log.Debug().Msgf("speech starts at %0.2fs", speechStart) log.Debug().Msgf("speech starts at %0.2fs", speechStart)
audioDetected = true
for _, s := range resp.Segments { for _, s := range resp.Segments {
if s.End > 0 { if s.End > 0 {
log.Debug().Msgf("speech ends at %0.2fs", s.End) log.Debug().Msgf("speech ends at %0.2fs", s.End)
speechEnd = s.End speechEnd = s.End
audioDetected = false
} }
} }
@@ -599,6 +612,7 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn,
// Reset InputAudioBuffer // Reset InputAudioBuffer
session.InputAudioBuffer = nil session.InputAudioBuffer = nil
session.AudioBufferLock.Unlock()
// Send item.created event // Send item.created event
sendEvent(c, OutgoingMessage{ sendEvent(c, OutgoingMessage{
@@ -608,9 +622,10 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn,
// Generate a response // Generate a response
generateResponse(session, conversation, ResponseCreate{}, c, websocket.TextMessage) generateResponse(session, conversation, ResponseCreate{}, c, websocket.TextMessage)
} else {
session.AudioBufferLock.Unlock()
} }
session.AudioBufferLock.Unlock()
} }
} }
} }