mirror of https://github.com/mudler/LocalAI.git (synced 2024-12-18 20:27:57 +00:00)

commit e30925c555
parent db14dce90f
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
@@ -462,12 +462,10 @@ func updateSession(session *Session, update *Session, cl *config.BackendConfigLo
 const (
 	minMicVolume              = 450
 	sendToVADDelay            = time.Second
-	maxWhisperSegmentDuration = time.Second * 25
+	maxWhisperSegmentDuration = time.Second * 15
 )
 
-// Placeholder function to handle VAD (Voice Activity Detection)
-// https://github.com/snakers4/silero-vad/tree/master/examples/go
-// XXX: use session.ModelInterface for VAD or hook directly VAD runtime here?
+// handle VAD (Voice Activity Detection)
 func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn, done chan struct{}) {
 
 	vadContext, cancel := context.WithCancel(context.Background())
@@ -480,6 +478,7 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn,
 
 	audioDetected := false
 	timeListening := time.Now()
+
 	// Implement VAD logic here
 	// For brevity, this is a placeholder
 	// When VAD detects end of speech, generate a response
@@ -492,7 +491,54 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn,
 			// Check if there's audio data to process
 			session.AudioBufferLock.Lock()
 
-			if len(session.InputAudioBuffer) > 16000 {
+			if len(session.InputAudioBuffer) > 0 {
 
+				if audioDetected && time.Since(timeListening) < maxWhisperSegmentDuration {
+					log.Debug().Msgf("VAD detected speech, but still listening")
+					// audioDetected = false
+					// keep listening
+					session.AudioBufferLock.Unlock()
+					continue
+				}
+
+				if audioDetected {
+					log.Debug().Msgf("VAD detected speech that we can process")
+
+					// Commit the audio buffer as a conversation item
+					item := &Item{
+						ID:     generateItemID(),
+						Object: "realtime.item",
+						Type:   "message",
+						Status: "completed",
+						Role:   "user",
+						Content: []ConversationContent{
+							{
+								Type:  "input_audio",
+								Audio: base64.StdEncoding.EncodeToString(session.InputAudioBuffer),
+							},
+						},
+					}
+
+					// Add item to conversation
+					conversation.Lock.Lock()
+					conversation.Items = append(conversation.Items, item)
+					conversation.Lock.Unlock()
+
+					// Reset InputAudioBuffer
+					session.InputAudioBuffer = nil
+					session.AudioBufferLock.Unlock()
+
+					// Send item.created event
+					sendEvent(c, OutgoingMessage{
+						Type: "conversation.item.created",
+						Item: item,
+					})
+
+					audioDetected = false
+					// Generate a response
+					generateResponse(session, conversation, ResponseCreate{}, c, websocket.TextMessage)
+					continue
+				}
+
 				adata := sound.BytesToInt16sLE(session.InputAudioBuffer)
 
@@ -522,24 +568,6 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn,
 					continue
 				}
 
-				speechStart, speechEnd := float32(0), float32(0)
-
-				/*
-					volume := sound.CalculateRMS16(adata)
-					if volume > minMicVolume {
-						startListening = time.Now()
-					}
-
-					if time.Since(startListening) < sendToVADDelay && time.Since(startListening) < maxWhisperSegmentDuration {
-						log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer))
-
-						session.AudioBufferLock.Unlock()
-						log.Debug().Msg("speech is ongoing")
-
-						continue
-					}
-				*/
-
 				if len(resp.Segments) == 0 {
 					log.Debug().Msg("VAD detected no speech activity")
 					log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer))
@@ -553,75 +581,12 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn,
 					continue
 				}
 
-				timeListening = time.Now()
-
-				log.Debug().Msgf("VAD detected %d segments", len(resp.Segments))
-				log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer))
-
-				speechStart = resp.Segments[0].Start
-				log.Debug().Msgf("speech starts at %0.2fs", speechStart)
-
+				if !audioDetected {
+					timeListening = time.Now()
+				}
 				audioDetected = true
 
-				for _, s := range resp.Segments {
-					if s.End > 0 {
-						log.Debug().Msgf("speech ends at %0.2fs", s.End)
-						speechEnd = s.End
-						audioDetected = false
-					}
-				}
-
-				if speechEnd == 0 {
-					log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer))
-
-					session.AudioBufferLock.Unlock()
-					log.Debug().Msg("speech is ongoing, no end found ?")
-					continue
-				}
-
-				// Handle when input is too long without a voice activity (reset the buffer)
-				if speechStart == 0 && speechEnd == 0 {
-					// log.Debug().Msg("VAD detected no speech activity")
-					session.InputAudioBuffer = nil
-					session.AudioBufferLock.Unlock()
-					continue
-				}
-
-				// TODO: Shall we cut the audio from speechStart and SpeechEnd?
-				log.Debug().Msgf("VAD detected Start speech at: %0.2fs, End speech at: %0.2fs", speechStart, speechEnd)
-
-				// Commit the audio buffer as a conversation item
-				item := &Item{
-					ID:     generateItemID(),
-					Object: "realtime.item",
-					Type:   "message",
-					Status: "completed",
-					Role:   "user",
-					Content: []ConversationContent{
-						{
-							Type:  "input_audio",
-							Audio: base64.StdEncoding.EncodeToString(session.InputAudioBuffer),
-						},
-					},
-				}
-
-				// Add item to conversation
-				conversation.Lock.Lock()
-				conversation.Items = append(conversation.Items, item)
-				conversation.Lock.Unlock()
-
-				// Reset InputAudioBuffer
-				session.InputAudioBuffer = nil
 				session.AudioBufferLock.Unlock()
-
-				// Send item.created event
-				sendEvent(c, OutgoingMessage{
-					Type: "conversation.item.created",
-					Item: item,
-				})
-
-				// Generate a response
-				generateResponse(session, conversation, ResponseCreate{}, c, websocket.TextMessage)
 			} else {
 				session.AudioBufferLock.Unlock()
 			}