mirror of
https://github.com/mudler/LocalAI.git
synced 2024-12-18 20:27:57 +00:00
TTS API improvements (#2308)
* update doc on COQUI_LANGUAGE env variable Signed-off-by: blob42 <contact@blob42.xyz> * return errors from tts gRPC backend Signed-off-by: blob42 <contact@blob42.xyz> * handle speaker_id and language in coqui TTS backend Signed-off-by: blob42 <contact@blob42.xyz> * TTS endpoint: add optional language parameter Signed-off-by: blob42 <contact@blob42.xyz> * tts fix: empty language string breaks non-multilingual models Signed-off-by: blob42 <contact@blob42.xyz> * allow tts param definition in config file - consolidate TTS options under `tts` config entry Signed-off-by: blob42 <contact@blob42.xyz> * tts: update doc Signed-off-by: blob42 <contact@blob42.xyz> --------- Signed-off-by: blob42 <contact@blob42.xyz> Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
This commit is contained in:
parent
95c65d67f5
commit
b99182c8d4
2
Makefile
2
Makefile
@ -447,7 +447,7 @@ protogen-clean: protogen-go-clean protogen-python-clean
|
|||||||
.PHONY: protogen-go
|
.PHONY: protogen-go
|
||||||
protogen-go:
|
protogen-go:
|
||||||
mkdir -p pkg/grpc/proto
|
mkdir -p pkg/grpc/proto
|
||||||
protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
|
protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
|
||||||
backend/backend.proto
|
backend/backend.proto
|
||||||
|
|
||||||
.PHONY: protogen-go-clean
|
.PHONY: protogen-go-clean
|
||||||
|
@ -266,6 +266,7 @@ message TTSRequest {
|
|||||||
string model = 2;
|
string model = 2;
|
||||||
string dst = 3;
|
string dst = 3;
|
||||||
string voice = 4;
|
string voice = 4;
|
||||||
|
optional string language = 5;
|
||||||
}
|
}
|
||||||
|
|
||||||
message TokenizationResponse {
|
message TokenizationResponse {
|
||||||
|
@ -66,7 +66,21 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||||||
|
|
||||||
def TTS(self, request, context):
|
def TTS(self, request, context):
|
||||||
try:
|
try:
|
||||||
self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=COQUI_LANGUAGE, file_path=request.dst)
|
# if the model is multilingual, use the language from the request, falling back to the COQUI_LANGUAGE env variable
|
||||||
|
lang = request.language or COQUI_LANGUAGE
|
||||||
|
if lang == "":
|
||||||
|
lang = None
|
||||||
|
if self.tts.is_multi_lingual and lang is None:
|
||||||
|
return backend_pb2.Result(success=False, message=f"Model is multi-lingual, but no language was provided")
|
||||||
|
|
||||||
|
# if model is multi-speaker, use speaker_wav or the speaker_id from request.voice
|
||||||
|
if self.tts.is_multi_speaker and self.AudioPath is None and request.voice is None:
|
||||||
|
return backend_pb2.Result(success=False, message=f"Model is multi-speaker, but no speaker was provided")
|
||||||
|
|
||||||
|
if self.tts.is_multi_speaker and request.voice is not None:
|
||||||
|
self.tts.tts_to_file(text=request.text, speaker=request.voice, language=lang, file_path=request.dst)
|
||||||
|
else:
|
||||||
|
self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=lang, file_path=request.dst)
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
||||||
return backend_pb2.Result(success=True)
|
return backend_pb2.Result(success=True)
|
||||||
|
@ -29,7 +29,16 @@ func generateUniqueFileName(dir, baseName, ext string) string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (string, *proto.Result, error) {
|
func ModelTTS(
|
||||||
|
backend,
|
||||||
|
text,
|
||||||
|
modelFile,
|
||||||
|
voice ,
|
||||||
|
language string,
|
||||||
|
loader *model.ModelLoader,
|
||||||
|
appConfig *config.ApplicationConfig,
|
||||||
|
backendConfig config.BackendConfig,
|
||||||
|
) (string, *proto.Result, error) {
|
||||||
bb := backend
|
bb := backend
|
||||||
if bb == "" {
|
if bb == "" {
|
||||||
bb = model.PiperBackend
|
bb = model.PiperBackend
|
||||||
@ -83,7 +92,13 @@ func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader,
|
|||||||
Model: modelPath,
|
Model: modelPath,
|
||||||
Voice: voice,
|
Voice: voice,
|
||||||
Dst: filePath,
|
Dst: filePath,
|
||||||
|
Language: &language,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// return RPC error if any
|
||||||
|
if !res.Success {
|
||||||
|
return "", nil, fmt.Errorf(res.Message)
|
||||||
|
}
|
||||||
|
|
||||||
return filePath, res, err
|
return filePath, res, err
|
||||||
}
|
}
|
||||||
|
@ -20,6 +20,7 @@ type TTSCMD struct {
|
|||||||
Backend string `short:"b" default:"piper" help:"Backend to run the TTS model"`
|
Backend string `short:"b" default:"piper" help:"Backend to run the TTS model"`
|
||||||
Model string `short:"m" required:"" help:"Model name to run the TTS"`
|
Model string `short:"m" required:"" help:"Model name to run the TTS"`
|
||||||
Voice string `short:"v" help:"Voice name to run the TTS"`
|
Voice string `short:"v" help:"Voice name to run the TTS"`
|
||||||
|
Language string `short:"l" help:"Language to use with the TTS"`
|
||||||
OutputFile string `short:"o" type:"path" help:"The path to write the output wav file"`
|
OutputFile string `short:"o" type:"path" help:"The path to write the output wav file"`
|
||||||
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
|
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
|
||||||
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
|
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
|
||||||
@ -52,7 +53,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
|
|||||||
options := config.BackendConfig{}
|
options := config.BackendConfig{}
|
||||||
options.SetDefaults()
|
options.SetDefaults()
|
||||||
|
|
||||||
filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, ml, opts, options)
|
filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, t.Language, ml, opts, options)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
@ -15,6 +15,15 @@ const (
|
|||||||
RAND_SEED = -1
|
RAND_SEED = -1
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type TTSConfig struct {
|
||||||
|
|
||||||
|
// Voice wav path or id
|
||||||
|
Voice string `yaml:"voice"`
|
||||||
|
|
||||||
|
// Vall-e-x
|
||||||
|
VallE VallE `yaml:"vall-e"`
|
||||||
|
}
|
||||||
|
|
||||||
type BackendConfig struct {
|
type BackendConfig struct {
|
||||||
schema.PredictionOptions `yaml:"parameters"`
|
schema.PredictionOptions `yaml:"parameters"`
|
||||||
Name string `yaml:"name"`
|
Name string `yaml:"name"`
|
||||||
@ -49,8 +58,8 @@ type BackendConfig struct {
|
|||||||
// GRPC Options
|
// GRPC Options
|
||||||
GRPC GRPC `yaml:"grpc"`
|
GRPC GRPC `yaml:"grpc"`
|
||||||
|
|
||||||
// Vall-e-x
|
// TTS specifics
|
||||||
VallE VallE `yaml:"vall-e"`
|
TTSConfig `yaml:"tts"`
|
||||||
|
|
||||||
// CUDA
|
// CUDA
|
||||||
// Explicitly enable CUDA or not (some backends might need it)
|
// Explicitly enable CUDA or not (some backends might need it)
|
||||||
|
@ -52,7 +52,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
|
|||||||
}
|
}
|
||||||
log.Debug().Msgf("Request for model: %s", modelFile)
|
log.Debug().Msgf("Request for model: %s", modelFile)
|
||||||
|
|
||||||
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, voiceID, ml, appConfig, *cfg)
|
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, "", voiceID, ml, appConfig, *cfg)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
@ -12,10 +12,13 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
// TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
|
// TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
|
||||||
// @Summary Generates audio from the input text.
|
// @Summary Generates audio from the input text.
|
||||||
// @Param request body schema.TTSRequest true "query params"
|
// @Accept json
|
||||||
// @Success 200 {string} binary "Response"
|
// @Produce audio/x-wav
|
||||||
// @Router /v1/audio/speech [post]
|
// @Param request body schema.TTSRequest true "query params"
|
||||||
|
// @Success 200 {string} binary "generated audio/wav file"
|
||||||
|
// @Router /v1/audio/speech [post]
|
||||||
|
// @Router /tts [post]
|
||||||
func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||||
return func(c *fiber.Ctx) error {
|
return func(c *fiber.Ctx) error {
|
||||||
|
|
||||||
@ -40,6 +43,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
|
|||||||
)
|
)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
log.Err(err)
|
||||||
modelFile = input.Model
|
modelFile = input.Model
|
||||||
log.Warn().Msgf("Model not found in context: %s", input.Model)
|
log.Warn().Msgf("Model not found in context: %s", input.Model)
|
||||||
} else {
|
} else {
|
||||||
@ -51,7 +55,15 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
|
|||||||
cfg.Backend = input.Backend
|
cfg.Backend = input.Backend
|
||||||
}
|
}
|
||||||
|
|
||||||
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, input.Voice, ml, appConfig, *cfg)
|
if input.Language != "" {
|
||||||
|
cfg.Language = input.Language
|
||||||
|
}
|
||||||
|
|
||||||
|
if input.Voice != "" {
|
||||||
|
cfg.Voice = input.Voice
|
||||||
|
}
|
||||||
|
|
||||||
|
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, cfg.Voice, cfg.Language, ml, appConfig, *cfg)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
@ -1,59 +1,61 @@
|
|||||||
package schema
|
package schema
|
||||||
|
|
||||||
import (
|
import (
|
||||||
gopsutil "github.com/shirou/gopsutil/v3/process"
|
gopsutil "github.com/shirou/gopsutil/v3/process"
|
||||||
)
|
)
|
||||||
|
|
||||||
type BackendMonitorRequest struct {
|
type BackendMonitorRequest struct {
|
||||||
Model string `json:"model" yaml:"model"`
|
Model string `json:"model" yaml:"model"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type BackendMonitorResponse struct {
|
type BackendMonitorResponse struct {
|
||||||
MemoryInfo *gopsutil.MemoryInfoStat
|
MemoryInfo *gopsutil.MemoryInfoStat
|
||||||
MemoryPercent float32
|
MemoryPercent float32
|
||||||
CPUPercent float64
|
CPUPercent float64
|
||||||
}
|
}
|
||||||
|
|
||||||
type TTSRequest struct {
|
// @Description TTS request body
|
||||||
Model string `json:"model" yaml:"model"`
|
type TTSRequest struct {
|
||||||
Input string `json:"input" yaml:"input"`
|
Model string `json:"model" yaml:"model"` // model name or full path
|
||||||
Voice string `json:"voice" yaml:"voice"`
|
Input string `json:"input" yaml:"input"` // text input
|
||||||
Backend string `json:"backend" yaml:"backend"`
|
Voice string `json:"voice" yaml:"voice"` // voice audio file or speaker id
|
||||||
}
|
Backend string `json:"backend" yaml:"backend"`
|
||||||
|
Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model
|
||||||
type StoresSet struct {
|
}
|
||||||
Store string `json:"store,omitempty" yaml:"store,omitempty"`
|
|
||||||
|
type StoresSet struct {
|
||||||
Keys [][]float32 `json:"keys" yaml:"keys"`
|
Store string `json:"store,omitempty" yaml:"store,omitempty"`
|
||||||
Values []string `json:"values" yaml:"values"`
|
|
||||||
}
|
Keys [][]float32 `json:"keys" yaml:"keys"`
|
||||||
|
Values []string `json:"values" yaml:"values"`
|
||||||
type StoresDelete struct {
|
}
|
||||||
Store string `json:"store,omitempty" yaml:"store,omitempty"`
|
|
||||||
|
type StoresDelete struct {
|
||||||
Keys [][]float32 `json:"keys"`
|
Store string `json:"store,omitempty" yaml:"store,omitempty"`
|
||||||
}
|
|
||||||
|
Keys [][]float32 `json:"keys"`
|
||||||
type StoresGet struct {
|
}
|
||||||
Store string `json:"store,omitempty" yaml:"store,omitempty"`
|
|
||||||
|
type StoresGet struct {
|
||||||
Keys [][]float32 `json:"keys" yaml:"keys"`
|
Store string `json:"store,omitempty" yaml:"store,omitempty"`
|
||||||
}
|
|
||||||
|
Keys [][]float32 `json:"keys" yaml:"keys"`
|
||||||
type StoresGetResponse struct {
|
}
|
||||||
Keys [][]float32 `json:"keys" yaml:"keys"`
|
|
||||||
Values []string `json:"values" yaml:"values"`
|
type StoresGetResponse struct {
|
||||||
}
|
Keys [][]float32 `json:"keys" yaml:"keys"`
|
||||||
|
Values []string `json:"values" yaml:"values"`
|
||||||
type StoresFind struct {
|
}
|
||||||
Store string `json:"store,omitempty" yaml:"store,omitempty"`
|
|
||||||
|
type StoresFind struct {
|
||||||
Key []float32 `json:"key" yaml:"key"`
|
Store string `json:"store,omitempty" yaml:"store,omitempty"`
|
||||||
Topk int `json:"topk" yaml:"topk"`
|
|
||||||
}
|
Key []float32 `json:"key" yaml:"key"`
|
||||||
|
Topk int `json:"topk" yaml:"topk"`
|
||||||
type StoresFindResponse struct {
|
}
|
||||||
Keys [][]float32 `json:"keys" yaml:"keys"`
|
|
||||||
Values []string `json:"values" yaml:"values"`
|
type StoresFindResponse struct {
|
||||||
Similarities []float32 `json:"similarities" yaml:"similarities"`
|
Keys [][]float32 `json:"keys" yaml:"keys"`
|
||||||
}
|
Values []string `json:"values" yaml:"values"`
|
||||||
|
Similarities []float32 `json:"similarities" yaml:"similarities"`
|
||||||
|
}
|
||||||
|
@ -46,6 +46,10 @@ Coqui works without any configuration, to test it, you can run the following cur
|
|||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
You can use the `COQUI_LANGUAGE` environment variable to set the language used by the coqui backend when the request does not specify one.
|
||||||
|
|
||||||
|
You can also use config files to configure TTS models (see the section below on how to use config files).
|
||||||
|
|
||||||
### Bark
|
### Bark
|
||||||
|
|
||||||
[Bark](https://github.com/suno-ai/bark) lets you generate audio from text prompts.
|
[Bark](https://github.com/suno-ai/bark) lets you generate audio from text prompts.
|
||||||
@ -148,11 +152,12 @@ name: cloned-voice
|
|||||||
backend: vall-e-x
|
backend: vall-e-x
|
||||||
parameters:
|
parameters:
|
||||||
model: "cloned-voice"
|
model: "cloned-voice"
|
||||||
vall-e:
|
tts:
|
||||||
# The path to the audio file to be cloned
|
vall-e:
|
||||||
# relative to the models directory
|
# The path to the audio file to be cloned
|
||||||
# Max 15s
|
# relative to the models directory
|
||||||
audio_path: "audio-sample.wav"
|
# Max 15s
|
||||||
|
audio_path: "audio-sample.wav"
|
||||||
```
|
```
|
||||||
|
|
||||||
Then you can specify the model name in the requests:
|
Then you can specify the model name in the requests:
|
||||||
@ -164,6 +169,35 @@ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
|
|||||||
}' | aplay
|
}' | aplay
|
||||||
```
|
```
|
||||||
|
|
||||||
## Parler-tts
|
### Parler-tts
|
||||||
|
|
||||||
`parler-tts`: it is possible to install and configure the model directly from the gallery. See https://github.com/huggingface/parler-tts.
|
`parler-tts`: it is possible to install and configure the model directly from the gallery. See https://github.com/huggingface/parler-tts.
|
||||||
|
|
||||||
|
|
||||||
|
## Using config files
|
||||||
|
|
||||||
|
You can also use a `config-file` to specify TTS models and their parameters.
|
||||||
|
|
||||||
|
In the following example we define a custom config to load the `xtts_v2` model, and specify a voice and language.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
|
||||||
|
name: xtts_v2
|
||||||
|
backend: coqui
|
||||||
|
parameters:
|
||||||
|
language: fr
|
||||||
|
model: tts_models/multilingual/multi-dataset/xtts_v2
|
||||||
|
|
||||||
|
tts:
|
||||||
|
voice: Ana Florence
|
||||||
|
```
|
||||||
|
|
||||||
|
With this config, you can now use the following curl command to generate a text-to-speech audio file:
|
||||||
|
```bash
|
||||||
|
curl -L http://localhost:8080/tts \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "xtts_v2",
|
||||||
|
"input": "Bonjour, je suis Ana Florence. Comment puis-je vous aider?"
|
||||||
|
}' | aplay
|
||||||
|
```
|
||||||
|
Loading…
Reference in New Issue
Block a user