mirror of
https://github.com/mudler/LocalAI.git
synced 2025-02-10 12:51:14 +00:00
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
parent
72b2883757
commit
4565b87e5c
@ -1,14 +1,18 @@
|
|||||||
package openai
|
package openai
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
"encoding/base64"
|
"encoding/base64"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/go-audio/wav"
|
||||||
|
|
||||||
"github.com/go-audio/audio"
|
"github.com/go-audio/audio"
|
||||||
"github.com/gofiber/fiber/v2"
|
"github.com/gofiber/fiber/v2"
|
||||||
"github.com/gofiber/websocket/v2"
|
"github.com/gofiber/websocket/v2"
|
||||||
@ -488,21 +492,8 @@ func updateSession(session *Session, update *Session, cl *config.BackendConfigLo
|
|||||||
}
|
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
minMicVolume = 450
|
sendToVADDelay = 2 * time.Second
|
||||||
sendToVADDelay = time.Second
|
silenceThreshold = 2 * time.Second
|
||||||
)
|
|
||||||
|
|
||||||
type VADState int
|
|
||||||
|
|
||||||
const (
|
|
||||||
StateSilence VADState = iota
|
|
||||||
StateSpeaking
|
|
||||||
)
|
|
||||||
|
|
||||||
const (
|
|
||||||
// tune these thresholds to taste
|
|
||||||
SpeechFramesThreshold = 3 // must see X consecutive speech results to confirm "start"
|
|
||||||
SilenceFramesThreshold = 5 // must see X consecutive silence results to confirm "end"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// handleVAD is a goroutine that listens for audio data from the client,
|
// handleVAD is a goroutine that listens for audio data from the client,
|
||||||
@ -534,14 +525,18 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
|
|||||||
copy(allAudio, session.InputAudioBuffer)
|
copy(allAudio, session.InputAudioBuffer)
|
||||||
session.AudioBufferLock.Unlock()
|
session.AudioBufferLock.Unlock()
|
||||||
|
|
||||||
// 2) If there's no audio at all, just continue
|
// 2) If there's no audio at all, or just too small samples, just continue
|
||||||
if len(allAudio) == 0 {
|
if len(allAudio) == 0 || len(allAudio) < 32000 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3) Run VAD on the entire audio so far
|
// 3) Run VAD on the entire audio so far
|
||||||
segments, err := runVAD(vadContext, session, allAudio)
|
segments, err := runVAD(vadContext, session, allAudio)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
if err.Error() == "unexpected speech end" {
|
||||||
|
log.Debug().Msg("VAD cancelled")
|
||||||
|
continue
|
||||||
|
}
|
||||||
log.Error().Msgf("failed to process audio: %s", err.Error())
|
log.Error().Msgf("failed to process audio: %s", err.Error())
|
||||||
sendError(c, "processing_error", "Failed to process audio: "+err.Error(), "", "")
|
sendError(c, "processing_error", "Failed to process audio: "+err.Error(), "", "")
|
||||||
// handle or log error, continue
|
// handle or log error, continue
|
||||||
@ -550,7 +545,7 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
|
|||||||
|
|
||||||
segCount := len(segments)
|
segCount := len(segments)
|
||||||
|
|
||||||
if len(segments) == 0 && !speaking && time.Since(timeOfLastNewSeg) > 1*time.Second {
|
if len(segments) == 0 && !speaking && time.Since(timeOfLastNewSeg) > silenceThreshold {
|
||||||
// no speech detected, and we haven't seen a new segment in > 1s
|
// no speech detected, and we haven't seen a new segment in > 1s
|
||||||
// clean up input
|
// clean up input
|
||||||
session.AudioBufferLock.Lock()
|
session.AudioBufferLock.Lock()
|
||||||
@ -569,8 +564,11 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 5) If speaking, but we haven't seen a new segment in > 1s => finalize
|
// 5) If speaking, but we haven't seen a new segment in > 1s => finalize
|
||||||
if speaking && time.Since(timeOfLastNewSeg) > 1*time.Second {
|
if speaking && time.Since(timeOfLastNewSeg) > sendToVADDelay {
|
||||||
log.Debug().Msgf("Detected end of speech segment")
|
log.Debug().Msgf("Detected end of speech segment")
|
||||||
|
session.AudioBufferLock.Lock()
|
||||||
|
session.InputAudioBuffer = nil
|
||||||
|
session.AudioBufferLock.Unlock()
|
||||||
// user has presumably stopped talking
|
// user has presumably stopped talking
|
||||||
commitUtterance(allAudio, cfg, evaluator, session, conv, c)
|
commitUtterance(allAudio, cfg, evaluator, session, conv, c)
|
||||||
// reset state
|
// reset state
|
||||||
@ -608,18 +606,38 @@ func commitUtterance(utt []byte, cfg *config.BackendConfig, evaluator *templates
|
|||||||
Item: item,
|
Item: item,
|
||||||
})
|
})
|
||||||
|
|
||||||
// Optionally trigger the response generation
|
// save chunk to disk
|
||||||
|
f, err := os.CreateTemp("", "audio-*.wav")
|
||||||
|
if err != nil {
|
||||||
|
log.Error().Msgf("failed to create temp file: %s", err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
//defer os.Remove(f.Name())
|
||||||
|
log.Debug().Msgf("Writing to %s\n", f.Name())
|
||||||
|
|
||||||
|
f.Write(utt)
|
||||||
|
f.Sync()
|
||||||
|
|
||||||
|
// trigger the response generation
|
||||||
generateResponse(cfg, evaluator, session, conv, ResponseCreate{}, c, websocket.TextMessage)
|
generateResponse(cfg, evaluator, session, conv, ResponseCreate{}, c, websocket.TextMessage)
|
||||||
}
|
}
|
||||||
|
|
||||||
// runVAD is a helper that calls your model's VAD method, returning
|
// runVAD is a helper that calls the model's VAD method, returning
|
||||||
// true if it detects speech, false if it detects silence
|
// true if it detects speech, false if it detects silence
|
||||||
func runVAD(ctx context.Context, session *Session, chunk []byte) ([]*proto.VADSegment, error) {
|
func runVAD(ctx context.Context, session *Session, chunk []byte) ([]*proto.VADSegment, error) {
|
||||||
|
|
||||||
adata := sound.BytesToInt16sLE(chunk)
|
adata := sound.BytesToInt16sLE(chunk)
|
||||||
|
|
||||||
// Resample from 24kHz to 16kHz
|
// Resample from 24kHz to 16kHz
|
||||||
// adata = sound.ResampleInt16(adata, 24000, 16000)
|
adata = sound.ResampleInt16(adata, 24000, 16000)
|
||||||
|
|
||||||
|
dec := wav.NewDecoder(bytes.NewReader(chunk))
|
||||||
|
dur, err := dec.Duration()
|
||||||
|
if err != nil {
|
||||||
|
fmt.Printf("failed to get duration: %s\n", err)
|
||||||
|
}
|
||||||
|
fmt.Printf("duration: %s\n", dur)
|
||||||
|
|
||||||
soundIntBuffer := &audio.IntBuffer{
|
soundIntBuffer := &audio.IntBuffer{
|
||||||
Format: &audio.Format{SampleRate: 16000, NumChannels: 1},
|
Format: &audio.Format{SampleRate: 16000, NumChannels: 1},
|
||||||
|
Loading…
x
Reference in New Issue
Block a user