mirror of
https://github.com/mudler/LocalAI.git
synced 2025-01-13 08:19:57 +00:00
chore(vad): try to hook vad to received data from the API (WIP)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
parent
d76b654c51
commit
5c2a00f7fb
@ -8,10 +8,13 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
|
"github.com/go-audio/audio"
|
||||||
"github.com/gofiber/websocket/v2"
|
"github.com/gofiber/websocket/v2"
|
||||||
"github.com/mudler/LocalAI/core/config"
|
"github.com/mudler/LocalAI/core/config"
|
||||||
"github.com/mudler/LocalAI/pkg/grpc/proto"
|
"github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||||
model "github.com/mudler/LocalAI/pkg/model"
|
model "github.com/mudler/LocalAI/pkg/model"
|
||||||
|
"github.com/mudler/LocalAI/pkg/sound"
|
||||||
|
|
||||||
"google.golang.org/grpc"
|
"google.golang.org/grpc"
|
||||||
|
|
||||||
"github.com/rs/zerolog/log"
|
"github.com/rs/zerolog/log"
|
||||||
@ -456,9 +459,17 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn,
|
|||||||
// Check if there's audio data to process
|
// Check if there's audio data to process
|
||||||
session.AudioBufferLock.Lock()
|
session.AudioBufferLock.Lock()
|
||||||
if len(session.InputAudioBuffer) > 0 {
|
if len(session.InputAudioBuffer) > 0 {
|
||||||
// TODO: what to put in the VADRequest request?
|
|
||||||
// Data is received as buffer, but we want PCM as float32 here...
|
adata := sound.BytesToInt16sLE(session.InputAudioBuffer)
|
||||||
resp, err := session.ModelInterface.VAD(context.Background(), &proto.VADRequest{})
|
|
||||||
|
soundIntBuffer := &audio.IntBuffer{
|
||||||
|
Format: &audio.Format{SampleRate: 16000, NumChannels: 1},
|
||||||
|
}
|
||||||
|
soundIntBuffer.Data = sound.ConvertInt16ToInt(adata)
|
||||||
|
|
||||||
|
resp, err := session.ModelInterface.VAD(context.Background(), &proto.VADRequest{
|
||||||
|
Audio: soundIntBuffer.AsFloat32Buffer().Data,
|
||||||
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error().Msgf("failed to process audio: %s", err.Error())
|
log.Error().Msgf("failed to process audio: %s", err.Error())
|
||||||
sendError(c, "processing_error", "Failed to process audio", "", "")
|
sendError(c, "processing_error", "Failed to process audio", "", "")
|
||||||
|
20
pkg/sound/float32.go
Normal file
20
pkg/sound/float32.go
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
package sound
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/binary"
|
||||||
|
"math"
|
||||||
|
)
|
||||||
|
|
||||||
|
// BytesToFloat32Array decodes aBytes as a packed sequence of little-endian
// IEEE-754 float32 values and returns them in order.
//
// Every complete 4-byte group produces one element. Trailing bytes that do
// not form a complete group are ignored, and nil or empty input yields an
// empty slice.
func BytesToFloat32Array(aBytes []byte) []float32 {
	const width = 4 // encoded size of one float32, in bytes

	// Size the result from the input instead of assuming a fixed element
	// count, so short buffers do not panic and long ones are fully decoded.
	aArr := make([]float32, len(aBytes)/width)
	for i := range aArr {
		bits := binary.LittleEndian.Uint32(aBytes[i*width:])
		aArr[i] = math.Float32frombits(bits)
	}
	return aArr
}
|
||||||
|
|
||||||
|
// BytesFloat32 reads the first four bytes of bytes as a little-endian 32-bit
// pattern and reinterprets that pattern as an IEEE-754 float32.
//
// The slice must hold at least four bytes; any bytes after the fourth are
// not examined.
func BytesFloat32(bytes []byte) float32 {
	return math.Float32frombits(binary.LittleEndian.Uint32(bytes))
}
|
65
pkg/sound/int16.go
Normal file
65
pkg/sound/int16.go
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
package sound
|
||||||
|
|
||||||
|
/*
|
||||||
|
|
||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2024 Xbozon
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
// ResampleInt16 converts input from inputRate to outputRate (both in samples
// per second) using linear interpolation between neighbouring samples.
//
// The result holds int(len(input) * outputRate / inputRate) samples. Its
// final element is always the last input sample. nil is returned when input
// is empty, when either rate is not positive, or when the computed output
// length is zero (for example a single sample being downsampled).
func ResampleInt16(input []int16, inputRate, outputRate int) []int16 {
	// Reject inputs for which the ratio below would be zero, negative,
	// infinite or NaN, and inputs with nothing to resample.
	if len(input) == 0 || inputRate <= 0 || outputRate <= 0 {
		return nil
	}

	// Number of input samples advanced per output sample.
	ratio := float64(inputRate) / float64(outputRate)

	outputLength := int(float64(len(input)) / ratio)
	if outputLength <= 0 {
		// Too few input samples to produce even one output sample; the
		// final-sample write below would index out of range.
		return nil
	}

	output := make([]int16, outputLength)
	lastIn := len(input) - 1

	for i := 0; i < outputLength-1; i++ {
		// Position of this output sample on the input time axis.
		pos := float64(i) * ratio

		indexBefore := int(pos)
		indexAfter := indexBefore + 1
		if indexAfter > lastIn {
			indexAfter = lastIn
		}

		// Weight of the later neighbour.
		frac := pos - float64(indexBefore)

		output[i] = int16((1-frac)*float64(input[indexBefore]) + frac*float64(input[indexAfter]))
	}

	// The final output sample is pinned to the final input sample rather
	// than interpolated.
	output[outputLength-1] = input[lastIn]

	return output
}
|
||||||
|
|
||||||
|
// ConvertInt16ToInt returns a new slice of the same length as input in which
// every element is the corresponding int16 sample widened to int. The input
// slice is not modified.
func ConvertInt16ToInt(input []int16) []int {
	widened := make([]int, len(input))
	for idx := 0; idx < len(input); idx++ {
		widened[idx] = int(input[idx])
	}
	return widened
}
|
||||||
|
|
||||||
|
// BytesToInt16sLE decodes bytes as a packed sequence of little-endian signed
// 16-bit samples and returns them in order.
//
// Each complete pair of bytes produces one sample (low byte first). If the
// input has an odd length, the dangling final byte cannot form a sample and
// is ignored rather than aborting the program; callers that need to keep it
// for a later chunk should carry it over themselves. nil or empty input
// yields an empty slice.
func BytesToInt16sLE(bytes []byte) []int16 {
	// Integer division drops an incomplete trailing byte.
	int16s := make([]int16, len(bytes)/2)
	for i := range int16s {
		lo, hi := bytes[2*i], bytes[2*i+1]
		// Assemble as unsigned, then reinterpret the 16 bits as signed.
		int16s[i] = int16(uint16(lo) | uint16(hi)<<8)
	}
	return int16s
}
|
Loading…
Reference in New Issue
Block a user