mirror of
https://github.com/mudler/LocalAI.git
synced 2024-12-18 20:27:57 +00:00
feat(tts): Implement naive response_format for tts endpoint (#4035)
Signed-off-by: n-Arno <arnaud.alcabas@gmail.com>
This commit is contained in:
parent
57908df956
commit
65c3df392c
@ -9,16 +9,19 @@ import (
|
|||||||
"github.com/gofiber/fiber/v2"
|
"github.com/gofiber/fiber/v2"
|
||||||
"github.com/mudler/LocalAI/core/schema"
|
"github.com/mudler/LocalAI/core/schema"
|
||||||
"github.com/rs/zerolog/log"
|
"github.com/rs/zerolog/log"
|
||||||
|
|
||||||
|
"github.com/mudler/LocalAI/pkg/utils"
|
||||||
)
|
)
|
||||||
|
|
||||||
// TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
|
// TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
|
||||||
// @Summary Generates audio from the input text.
|
//
|
||||||
// @Accept json
|
// @Summary Generates audio from the input text.
|
||||||
// @Produce audio/x-wav
|
// @Accept json
|
||||||
// @Param request body schema.TTSRequest true "query params"
|
// @Produce audio/x-wav
|
||||||
// @Success 200 {string} binary "generated audio/wav file"
|
// @Param request body schema.TTSRequest true "query params"
|
||||||
// @Router /v1/audio/speech [post]
|
// @Success 200 {string} binary "generated audio/wav file"
|
||||||
// @Router /tts [post]
|
// @Router /v1/audio/speech [post]
|
||||||
|
// @Router /tts [post]
|
||||||
func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||||
return func(c *fiber.Ctx) error {
|
return func(c *fiber.Ctx) error {
|
||||||
|
|
||||||
@ -67,6 +70,13 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Convert generated file to target format
|
||||||
|
filePath, err = utils.AudioConvert(filePath, input.Format)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
return c.Download(filePath)
|
return c.Download(filePath)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -32,6 +32,7 @@ type TTSRequest struct {
|
|||||||
Voice string `json:"voice" yaml:"voice"` // voice audio file or speaker id
|
Voice string `json:"voice" yaml:"voice"` // voice audio file or speaker id
|
||||||
Backend string `json:"backend" yaml:"backend"`
|
Backend string `json:"backend" yaml:"backend"`
|
||||||
Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model
|
Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model
|
||||||
|
Format string `json:"response_format,omitempty" yaml:"response_format,omitempty"` // (optional) output format
|
||||||
}
|
}
|
||||||
|
|
||||||
type StoresSet struct {
|
type StoresSet struct {
|
||||||
|
@ -4,6 +4,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
func ffmpegCommand(args []string) (string, error) {
|
func ffmpegCommand(args []string) (string, error) {
|
||||||
@ -23,3 +24,32 @@ func AudioToWav(src, dst string) error {
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// AudioConvert converts generated wav file from tts to other output formats.
|
||||||
|
// TODO: handle pcm to have 100% parity of supported format from OpenAI
|
||||||
|
func AudioConvert(src string, format string) (string, error) {
|
||||||
|
extension := ""
|
||||||
|
// compute file extension from format, default to wav
|
||||||
|
switch format {
|
||||||
|
case "opus":
|
||||||
|
extension = ".ogg"
|
||||||
|
case "mp3", "aac", "flac":
|
||||||
|
extension = fmt.Sprintf(".%s", format)
|
||||||
|
default:
|
||||||
|
extension = ".wav"
|
||||||
|
}
|
||||||
|
|
||||||
|
// if .wav, do nothing
|
||||||
|
if extension == ".wav" {
|
||||||
|
return src, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// naive conversion based on default values and target extension of file
|
||||||
|
dst := strings.Replace(src, ".wav", extension, -1)
|
||||||
|
commandArgs := []string{"-y", "-i", src, "-vn", dst}
|
||||||
|
out, err := ffmpegCommand(commandArgs)
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("error: %w out: %s", err, out)
|
||||||
|
}
|
||||||
|
return dst, nil
|
||||||
|
}
|
||||||
|
@ -1721,7 +1721,11 @@ const docTemplate = `{
|
|||||||
"voice": {
|
"voice": {
|
||||||
"description": "voice audio file or speaker id",
|
"description": "voice audio file or speaker id",
|
||||||
"type": "string"
|
"type": "string"
|
||||||
}
|
},
|
||||||
|
"response_format": {
|
||||||
|
"description": "(optional) output format of generated audio file, defaults to wav, accept wav, mp3, flac, aac, opus",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"schema.ToolCall": {
|
"schema.ToolCall": {
|
||||||
|
@ -1714,6 +1714,10 @@
|
|||||||
"voice": {
|
"voice": {
|
||||||
"description": "voice audio file or speaker id",
|
"description": "voice audio file or speaker id",
|
||||||
"type": "string"
|
"type": "string"
|
||||||
|
},
|
||||||
|
"response_format": {
|
||||||
|
"description": "(optional) output format of generated audio file, defaults to wav, accept wav, mp3, flac, aac, opus",
|
||||||
|
"type": "string"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -1742,4 +1746,4 @@
|
|||||||
"in": "header"
|
"in": "header"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -679,6 +679,9 @@ definitions:
|
|||||||
voice:
|
voice:
|
||||||
description: voice audio file or speaker id
|
description: voice audio file or speaker id
|
||||||
type: string
|
type: string
|
||||||
|
response_format:
|
||||||
|
description: (optional) output format of generated audio file, defaults to wav, accept wav, mp3, flac, aac, opus
|
||||||
|
type: string
|
||||||
type: object
|
type: object
|
||||||
schema.ToolCall:
|
schema.ToolCall:
|
||||||
properties:
|
properties:
|
||||||
|
Loading…
Reference in New Issue
Block a user