From 65c3df392c13ca21515899dd2c5b08a8997decd4 Mon Sep 17 00:00:00 2001
From: Arnaud A
Date: Sat, 2 Nov 2024 20:13:35 +0100
Subject: [PATCH] feat(tts): Implement naive response_format for tts endpoint (#4035)

Signed-off-by: n-Arno
---
 core/http/endpoints/localai/tts.go | 24 +++++++++++++++++-------
 core/schema/localai.go             |  1 +
 pkg/utils/ffmpeg.go                | 35 +++++++++++++++++++++++++++++++++++
 swagger/docs.go                    |  6 +++++-
 swagger/swagger.json               |  6 +++++-
 swagger/swagger.yaml               |  3 +++
 6 files changed, 66 insertions(+), 9 deletions(-)

diff --git a/core/http/endpoints/localai/tts.go b/core/http/endpoints/localai/tts.go
index ca3f58bd..7c73c633 100644
--- a/core/http/endpoints/localai/tts.go
+++ b/core/http/endpoints/localai/tts.go
@@ -9,16 +9,19 @@ import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/rs/zerolog/log"
+
+	"github.com/mudler/LocalAI/pkg/utils"
 )
 
 // TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
-// @Summary Generates audio from the input text.
-// @Accept json
-// @Produce audio/x-wav
-// @Param request body schema.TTSRequest true "query params"
-// @Success 200 {string} binary "generated audio/wav file"
-// @Router /v1/audio/speech [post]
-// @Router /tts [post]
+//
+//	@Summary	Generates audio from the input text.
+//	@Accept		json
+//	@Produce	audio/x-wav
+//	@Param		request	body		schema.TTSRequest	true	"query params"
+//	@Success	200		{string}	binary				"generated audio/wav file"
+//	@Router		/v1/audio/speech [post]
+//	@Router		/tts [post]
 func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 
@@ -67,6 +70,13 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
 		if err != nil {
 			return err
 		}
+
+		// Convert generated file to target format
+		filePath, err = utils.AudioConvert(filePath, input.Format)
+		if err != nil {
+			return err
+		}
+
 		return c.Download(filePath)
 	}
 }
diff --git a/core/schema/localai.go b/core/schema/localai.go
index cdc3e5b0..1b663ae0 100644
--- a/core/schema/localai.go
+++ b/core/schema/localai.go
@@ -32,6 +32,7 @@ type TTSRequest struct {
 	Voice    string `json:"voice" yaml:"voice"` // voice audio file or speaker id
 	Backend  string `json:"backend" yaml:"backend"`
 	Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model
+	Format   string `json:"response_format,omitempty" yaml:"response_format,omitempty"` // (optional) output format
 }
 
 type StoresSet struct {
diff --git a/pkg/utils/ffmpeg.go b/pkg/utils/ffmpeg.go
index 16656d8e..68683370 100644
--- a/pkg/utils/ffmpeg.go
+++ b/pkg/utils/ffmpeg.go
@@ -4,6 +4,7 @@ import (
 	"fmt"
 	"os"
 	"os/exec"
+	"strings"
 )
 
 func ffmpegCommand(args []string) (string, error) {
@@ -23,3 +24,37 @@ func AudioToWav(src, dst string) error {
 	}
 	return nil
 }
+
+// AudioConvert converts the wav file generated by tts into the requested
+// output format and returns the path of the converted file. An empty or
+// unrecognised format falls back to wav, in which case src is returned as is.
+// TODO: handle pcm to have 100% parity of supported format from OpenAI
+func AudioConvert(src string, format string) (string, error) {
+	extension := ""
+	// compute file extension from format, default to wav
+	switch format {
+	case "opus":
+		// NOTE(review): ffmpeg infers the codec from the container; confirm the .ogg default really is Opus and not Vorbis
+		extension = ".ogg"
+	case "mp3", "aac", "flac":
+		extension = fmt.Sprintf(".%s", format)
+	default:
+		extension = ".wav"
+	}
+
+	// if .wav, do nothing
+	if extension == ".wav" {
+		return src, nil
+	}
+
+	// naive conversion based on default values and target extension of file.
+	// Only the trailing ".wav" is swapped, so a ".wav" elsewhere in the path
+	// (e.g. a directory name) is untouched and dst can never equal src.
+	dst := strings.TrimSuffix(src, ".wav") + extension
+	commandArgs := []string{"-y", "-i", src, "-vn", dst}
+	out, err := ffmpegCommand(commandArgs)
+	if err != nil {
+		return "", fmt.Errorf("error: %w out: %s", err, out)
+	}
+	return dst, nil
+}
diff --git a/swagger/docs.go b/swagger/docs.go
index c283dcb0..284656bf 100644
--- a/swagger/docs.go
+++ b/swagger/docs.go
@@ -1721,7 +1721,11 @@ const docTemplate = `{
                 "voice": {
                     "description": "voice audio file or speaker id",
                     "type": "string"
-                }
+                },
+                "response_format": {
+                    "description": "(optional) output format of generated audio file, defaults to wav, accept wav, mp3, flac, aac, opus",
+                    "type": "string"
+                }
             }
         },
         "schema.ToolCall": {
diff --git a/swagger/swagger.json b/swagger/swagger.json
index 0a3be179..3ec82878 100644
--- a/swagger/swagger.json
+++ b/swagger/swagger.json
@@ -1714,6 +1714,10 @@
                 "voice": {
                     "description": "voice audio file or speaker id",
                     "type": "string"
+                },
+                "response_format": {
+                    "description": "(optional) output format of generated audio file, defaults to wav, accept wav, mp3, flac, aac, opus",
+                    "type": "string"
                 }
             }
         },
@@ -1742,4 +1746,4 @@
             "in": "header"
         }
     }
-}
\ No newline at end of file
+}
diff --git a/swagger/swagger.yaml b/swagger/swagger.yaml
index 7b6619b4..af6055a1 100644
--- a/swagger/swagger.yaml
+++ b/swagger/swagger.yaml
@@ -679,6 +679,9 @@ definitions:
       voice:
         description: voice audio file or speaker id
         type: string
+      response_format:
+        description: (optional) output format of generated audio file, defaults to wav, accept wav, mp3, flac, aac, opus
+        type: string
     type: object
   schema.ToolCall:
     properties: