feat(tts): Implement naive response_format for tts endpoint (#4035)

Signed-off-by: n-Arno <arnaud.alcabas@gmail.com>
This commit is contained in:
Arnaud A 2024-11-02 20:13:35 +01:00 committed by GitHub
parent 57908df956
commit 65c3df392c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 61 additions and 9 deletions

View File

@ -9,16 +9,19 @@ import (
"github.com/gofiber/fiber/v2" "github.com/gofiber/fiber/v2"
"github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/core/schema"
"github.com/rs/zerolog/log" "github.com/rs/zerolog/log"
"github.com/mudler/LocalAI/pkg/utils"
) )
// TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech // TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
// @Summary Generates audio from the input text. //
// @Accept json // @Summary Generates audio from the input text.
// @Produce audio/x-wav // @Accept json
// @Param request body schema.TTSRequest true "query params" // @Produce audio/x-wav
// @Success 200 {string} binary "generated audio/wav file" // @Param request body schema.TTSRequest true "query params"
// @Router /v1/audio/speech [post] // @Success 200 {string} binary "generated audio/wav file"
// @Router /tts [post] // @Router /v1/audio/speech [post]
// @Router /tts [post]
func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error { return func(c *fiber.Ctx) error {
@ -67,6 +70,13 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
if err != nil { if err != nil {
return err return err
} }
// Convert generated file to target format
filePath, err = utils.AudioConvert(filePath, input.Format)
if err != nil {
return err
}
return c.Download(filePath) return c.Download(filePath)
} }
} }

View File

@ -32,6 +32,7 @@ type TTSRequest struct {
Voice string `json:"voice" yaml:"voice"` // voice audio file or speaker id Voice string `json:"voice" yaml:"voice"` // voice audio file or speaker id
Backend string `json:"backend" yaml:"backend"` Backend string `json:"backend" yaml:"backend"`
Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model
Format string `json:"response_format,omitempty" yaml:"response_format,omitempty"` // (optional) output format
} }
type StoresSet struct { type StoresSet struct {

View File

@ -4,6 +4,7 @@ import (
"fmt" "fmt"
"os" "os"
"os/exec" "os/exec"
"strings"
) )
func ffmpegCommand(args []string) (string, error) { func ffmpegCommand(args []string) (string, error) {
@ -23,3 +24,32 @@ func AudioToWav(src, dst string) error {
} }
return nil return nil
} }
// AudioConvert converts the wav file generated by tts into another output
// format and returns the path of the file that should be served.
//
// format selects the target: "opus" is written to a ".ogg" file, while "mp3",
// "aac" and "flac" use the format name as the file extension. Any other value,
// including the empty string, is treated as wav: src is returned unchanged and
// ffmpeg is not invoked.
//
// The destination path is src with only its trailing ".wav" swapped for the
// target extension; if src has no ".wav" suffix the extension is appended.
// This keeps ".wav" occurrences elsewhere in the path (for example in a
// directory name) intact and guarantees the destination differs from src, so
// ffmpeg is never asked to overwrite its own input.
//
// On failure it returns an empty path and an error wrapping the ffmpegCommand
// error together with the output ffmpegCommand returned.
//
// TODO: handle pcm to have 100% parity of supported format from OpenAI
func AudioConvert(src string, format string) (string, error) {
	// Compute the file extension from the requested format.
	var extension string
	switch format {
	case "opus":
		extension = ".ogg"
	case "mp3", "aac", "flac":
		extension = "." + format
	default:
		// wav (or unknown): the generated file is already in the right format.
		return src, nil
	}

	// Naive conversion: rely on ffmpeg defaults and the target file extension.
	dst := strings.TrimSuffix(src, ".wav") + extension
	out, err := ffmpegCommand([]string{"-y", "-i", src, "-vn", dst})
	if err != nil {
		return "", fmt.Errorf("error: %w out: %s", err, out)
	}
	return dst, nil
}

View File

@ -1721,7 +1721,11 @@ const docTemplate = `{
"voice": { "voice": {
"description": "voice audio file or speaker id", "description": "voice audio file or speaker id",
"type": "string" "type": "string"
} },
"response_format": {
"description": "(optional) output format of generated audio file, defaults to wav, accept wav, mp3, flac, aac, opus",
"type": "string"
}
} }
}, },
"schema.ToolCall": { "schema.ToolCall": {

View File

@ -1714,6 +1714,10 @@
"voice": { "voice": {
"description": "voice audio file or speaker id", "description": "voice audio file or speaker id",
"type": "string" "type": "string"
},
"response_format": {
"description": "(optional) output format of generated audio file, defaults to wav, accept wav, mp3, flac, aac, opus",
"type": "string"
} }
} }
}, },
@ -1742,4 +1746,4 @@
"in": "header" "in": "header"
} }
} }
} }

View File

@ -679,6 +679,9 @@ definitions:
voice: voice:
description: voice audio file or speaker id description: voice audio file or speaker id
type: string type: string
response_format:
description: (optional) output format of generated audio file, defaults to wav, accept wav, mp3, flac, aac, opus
type: string
type: object type: object
schema.ToolCall: schema.ToolCall:
properties: properties: