mirror of
https://github.com/mudler/LocalAI.git
synced 2024-12-18 20:27:57 +00:00
TTS API improvements (#2308)
* update doc on COQUI_LANGUAGE env variable Signed-off-by: blob42 <contact@blob42.xyz> * return errors from tts gRPC backend Signed-off-by: blob42 <contact@blob42.xyz> * handle speaker_id and language in coqui TTS backend Signed-off-by: blob42 <contact@blob42.xyz> * TTS endpoint: add optional language paramter Signed-off-by: blob42 <contact@blob42.xyz> * tts fix: empty language string breaks non-multilingual models Signed-off-by: blob42 <contact@blob42.xyz> * allow tts param definition in config file - consolidate TTS options under `tts` config entry Signed-off-by: blob42 <contact@blob42.xyz> * tts: update doc Signed-off-by: blob42 <contact@blob42.xyz> --------- Signed-off-by: blob42 <contact@blob42.xyz> Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
This commit is contained in:
parent
95c65d67f5
commit
b99182c8d4
2
Makefile
2
Makefile
@ -447,7 +447,7 @@ protogen-clean: protogen-go-clean protogen-python-clean
|
||||
.PHONY: protogen-go
|
||||
protogen-go:
|
||||
mkdir -p pkg/grpc/proto
|
||||
protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
|
||||
protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
|
||||
backend/backend.proto
|
||||
|
||||
.PHONY: protogen-go-clean
|
||||
|
@ -266,6 +266,7 @@ message TTSRequest {
|
||||
string model = 2;
|
||||
string dst = 3;
|
||||
string voice = 4;
|
||||
optional string language = 5;
|
||||
}
|
||||
|
||||
message TokenizationResponse {
|
||||
|
@ -66,7 +66,21 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
def TTS(self, request, context):
|
||||
try:
|
||||
self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=COQUI_LANGUAGE, file_path=request.dst)
|
||||
# if model is multilangual add language from request or env as fallback
|
||||
lang = request.language or COQUI_LANGUAGE
|
||||
if lang == "":
|
||||
lang = None
|
||||
if self.tts.is_multi_lingual and lang is None:
|
||||
return backend_pb2.Result(success=False, message=f"Model is multi-lingual, but no language was provided")
|
||||
|
||||
# if model is multi-speaker, use speaker_wav or the speaker_id from request.voice
|
||||
if self.tts.is_multi_speaker and self.AudioPath is None and request.voice is None:
|
||||
return backend_pb2.Result(success=False, message=f"Model is multi-speaker, but no speaker was provided")
|
||||
|
||||
if self.tts.is_multi_speaker and request.voice is not None:
|
||||
self.tts.tts_to_file(text=request.text, speaker=request.voice, language=lang, file_path=request.dst)
|
||||
else:
|
||||
self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=lang, file_path=request.dst)
|
||||
except Exception as err:
|
||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
||||
return backend_pb2.Result(success=True)
|
||||
|
@ -29,7 +29,16 @@ func generateUniqueFileName(dir, baseName, ext string) string {
|
||||
}
|
||||
}
|
||||
|
||||
func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (string, *proto.Result, error) {
|
||||
func ModelTTS(
|
||||
backend,
|
||||
text,
|
||||
modelFile,
|
||||
voice ,
|
||||
language string,
|
||||
loader *model.ModelLoader,
|
||||
appConfig *config.ApplicationConfig,
|
||||
backendConfig config.BackendConfig,
|
||||
) (string, *proto.Result, error) {
|
||||
bb := backend
|
||||
if bb == "" {
|
||||
bb = model.PiperBackend
|
||||
@ -83,7 +92,13 @@ func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader,
|
||||
Model: modelPath,
|
||||
Voice: voice,
|
||||
Dst: filePath,
|
||||
Language: &language,
|
||||
})
|
||||
|
||||
// return RPC error if any
|
||||
if !res.Success {
|
||||
return "", nil, fmt.Errorf(res.Message)
|
||||
}
|
||||
|
||||
return filePath, res, err
|
||||
}
|
||||
|
@ -20,6 +20,7 @@ type TTSCMD struct {
|
||||
Backend string `short:"b" default:"piper" help:"Backend to run the TTS model"`
|
||||
Model string `short:"m" required:"" help:"Model name to run the TTS"`
|
||||
Voice string `short:"v" help:"Voice name to run the TTS"`
|
||||
Language string `short:"l" help:"Language to use with the TTS"`
|
||||
OutputFile string `short:"o" type:"path" help:"The path to write the output wav file"`
|
||||
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
|
||||
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
|
||||
@ -52,7 +53,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
|
||||
options := config.BackendConfig{}
|
||||
options.SetDefaults()
|
||||
|
||||
filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, ml, opts, options)
|
||||
filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, t.Language, ml, opts, options)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -15,6 +15,15 @@ const (
|
||||
RAND_SEED = -1
|
||||
)
|
||||
|
||||
type TTSConfig struct {
|
||||
|
||||
// Voice wav path or id
|
||||
Voice string `yaml:"voice"`
|
||||
|
||||
// Vall-e-x
|
||||
VallE VallE `yaml:"vall-e"`
|
||||
}
|
||||
|
||||
type BackendConfig struct {
|
||||
schema.PredictionOptions `yaml:"parameters"`
|
||||
Name string `yaml:"name"`
|
||||
@ -49,8 +58,8 @@ type BackendConfig struct {
|
||||
// GRPC Options
|
||||
GRPC GRPC `yaml:"grpc"`
|
||||
|
||||
// Vall-e-x
|
||||
VallE VallE `yaml:"vall-e"`
|
||||
// TTS specifics
|
||||
TTSConfig `yaml:"tts"`
|
||||
|
||||
// CUDA
|
||||
// Explicitly enable CUDA or not (some backends might need it)
|
||||
|
@ -52,7 +52,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
|
||||
}
|
||||
log.Debug().Msgf("Request for model: %s", modelFile)
|
||||
|
||||
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, voiceID, ml, appConfig, *cfg)
|
||||
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, "", voiceID, ml, appConfig, *cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -12,10 +12,13 @@ import (
|
||||
)
|
||||
|
||||
// TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
|
||||
// @Summary Generates audio from the input text.
|
||||
// @Param request body schema.TTSRequest true "query params"
|
||||
// @Success 200 {string} binary "Response"
|
||||
// @Router /v1/audio/speech [post]
|
||||
// @Summary Generates audio from the input text.
|
||||
// @Accept json
|
||||
// @Produce audio/x-wav
|
||||
// @Param request body schema.TTSRequest true "query params"
|
||||
// @Success 200 {string} binary "generated audio/wav file"
|
||||
// @Router /v1/audio/speech [post]
|
||||
// @Router /tts [post]
|
||||
func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
|
||||
@ -40,6 +43,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
log.Err(err)
|
||||
modelFile = input.Model
|
||||
log.Warn().Msgf("Model not found in context: %s", input.Model)
|
||||
} else {
|
||||
@ -51,7 +55,15 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
|
||||
cfg.Backend = input.Backend
|
||||
}
|
||||
|
||||
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, input.Voice, ml, appConfig, *cfg)
|
||||
if input.Language != "" {
|
||||
cfg.Language = input.Language
|
||||
}
|
||||
|
||||
if input.Voice != "" {
|
||||
cfg.Voice = input.Voice
|
||||
}
|
||||
|
||||
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, cfg.Voice, cfg.Language, ml, appConfig, *cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -1,59 +1,61 @@
|
||||
package schema
|
||||
|
||||
import (
|
||||
gopsutil "github.com/shirou/gopsutil/v3/process"
|
||||
)
|
||||
|
||||
type BackendMonitorRequest struct {
|
||||
Model string `json:"model" yaml:"model"`
|
||||
}
|
||||
|
||||
type BackendMonitorResponse struct {
|
||||
MemoryInfo *gopsutil.MemoryInfoStat
|
||||
MemoryPercent float32
|
||||
CPUPercent float64
|
||||
}
|
||||
|
||||
type TTSRequest struct {
|
||||
Model string `json:"model" yaml:"model"`
|
||||
Input string `json:"input" yaml:"input"`
|
||||
Voice string `json:"voice" yaml:"voice"`
|
||||
Backend string `json:"backend" yaml:"backend"`
|
||||
}
|
||||
|
||||
type StoresSet struct {
|
||||
Store string `json:"store,omitempty" yaml:"store,omitempty"`
|
||||
|
||||
Keys [][]float32 `json:"keys" yaml:"keys"`
|
||||
Values []string `json:"values" yaml:"values"`
|
||||
}
|
||||
|
||||
type StoresDelete struct {
|
||||
Store string `json:"store,omitempty" yaml:"store,omitempty"`
|
||||
|
||||
Keys [][]float32 `json:"keys"`
|
||||
}
|
||||
|
||||
type StoresGet struct {
|
||||
Store string `json:"store,omitempty" yaml:"store,omitempty"`
|
||||
|
||||
Keys [][]float32 `json:"keys" yaml:"keys"`
|
||||
}
|
||||
|
||||
type StoresGetResponse struct {
|
||||
Keys [][]float32 `json:"keys" yaml:"keys"`
|
||||
Values []string `json:"values" yaml:"values"`
|
||||
}
|
||||
|
||||
type StoresFind struct {
|
||||
Store string `json:"store,omitempty" yaml:"store,omitempty"`
|
||||
|
||||
Key []float32 `json:"key" yaml:"key"`
|
||||
Topk int `json:"topk" yaml:"topk"`
|
||||
}
|
||||
|
||||
type StoresFindResponse struct {
|
||||
Keys [][]float32 `json:"keys" yaml:"keys"`
|
||||
Values []string `json:"values" yaml:"values"`
|
||||
Similarities []float32 `json:"similarities" yaml:"similarities"`
|
||||
}
|
||||
package schema
|
||||
|
||||
import (
|
||||
gopsutil "github.com/shirou/gopsutil/v3/process"
|
||||
)
|
||||
|
||||
type BackendMonitorRequest struct {
|
||||
Model string `json:"model" yaml:"model"`
|
||||
}
|
||||
|
||||
type BackendMonitorResponse struct {
|
||||
MemoryInfo *gopsutil.MemoryInfoStat
|
||||
MemoryPercent float32
|
||||
CPUPercent float64
|
||||
}
|
||||
|
||||
// @Description TTS request body
|
||||
type TTSRequest struct {
|
||||
Model string `json:"model" yaml:"model"` // model name or full path
|
||||
Input string `json:"input" yaml:"input"` // text input
|
||||
Voice string `json:"voice" yaml:"voice"` // voice audio file or speaker id
|
||||
Backend string `json:"backend" yaml:"backend"`
|
||||
Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model
|
||||
}
|
||||
|
||||
type StoresSet struct {
|
||||
Store string `json:"store,omitempty" yaml:"store,omitempty"`
|
||||
|
||||
Keys [][]float32 `json:"keys" yaml:"keys"`
|
||||
Values []string `json:"values" yaml:"values"`
|
||||
}
|
||||
|
||||
type StoresDelete struct {
|
||||
Store string `json:"store,omitempty" yaml:"store,omitempty"`
|
||||
|
||||
Keys [][]float32 `json:"keys"`
|
||||
}
|
||||
|
||||
type StoresGet struct {
|
||||
Store string `json:"store,omitempty" yaml:"store,omitempty"`
|
||||
|
||||
Keys [][]float32 `json:"keys" yaml:"keys"`
|
||||
}
|
||||
|
||||
type StoresGetResponse struct {
|
||||
Keys [][]float32 `json:"keys" yaml:"keys"`
|
||||
Values []string `json:"values" yaml:"values"`
|
||||
}
|
||||
|
||||
type StoresFind struct {
|
||||
Store string `json:"store,omitempty" yaml:"store,omitempty"`
|
||||
|
||||
Key []float32 `json:"key" yaml:"key"`
|
||||
Topk int `json:"topk" yaml:"topk"`
|
||||
}
|
||||
|
||||
type StoresFindResponse struct {
|
||||
Keys [][]float32 `json:"keys" yaml:"keys"`
|
||||
Values []string `json:"values" yaml:"values"`
|
||||
Similarities []float32 `json:"similarities" yaml:"similarities"`
|
||||
}
|
||||
|
@ -46,6 +46,10 @@ Coqui works without any configuration, to test it, you can run the following cur
|
||||
}'
|
||||
```
|
||||
|
||||
You can use the env variable COQUI_LANGUAGE to set the language used by the coqui backend.
|
||||
|
||||
You can also use config files to configure tts models (see section below on how to use config files).
|
||||
|
||||
### Bark
|
||||
|
||||
[Bark](https://github.com/suno-ai/bark) allows to generate audio from text prompts.
|
||||
@ -148,11 +152,12 @@ name: cloned-voice
|
||||
backend: vall-e-x
|
||||
parameters:
|
||||
model: "cloned-voice"
|
||||
vall-e:
|
||||
# The path to the audio file to be cloned
|
||||
# relative to the models directory
|
||||
# Max 15s
|
||||
audio_path: "audio-sample.wav"
|
||||
tts:
|
||||
vall-e:
|
||||
# The path to the audio file to be cloned
|
||||
# relative to the models directory
|
||||
# Max 15s
|
||||
audio_path: "audio-sample.wav"
|
||||
```
|
||||
|
||||
Then you can specify the model name in the requests:
|
||||
@ -164,6 +169,35 @@ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
|
||||
}' | aplay
|
||||
```
|
||||
|
||||
## Parler-tts
|
||||
### Parler-tts
|
||||
|
||||
`parler-tts`. It is possible to install and configure the model directly from the gallery. https://github.com/huggingface/parler-tts
|
||||
`parler-tts`. It is possible to install and configure the model directly from the gallery. https://github.com/huggingface/parler-tts
|
||||
|
||||
|
||||
## Using config files
|
||||
|
||||
You can also use a `config-file` to specify TTS models and their parameters.
|
||||
|
||||
In the following example we define a custom config to load the `xtts_v2` model, and specify a voice and language.
|
||||
|
||||
```yaml
|
||||
|
||||
name: xtts_v2
|
||||
backend: coqui
|
||||
parameters:
|
||||
language: fr
|
||||
model: tts_models/multilingual/multi-dataset/xtts_v2
|
||||
|
||||
tts:
|
||||
voice: Ana Florence
|
||||
```
|
||||
|
||||
With this config, you can now use the following curl command to generate a text-to-speech audio file:
|
||||
```bash
|
||||
curl -L http://localhost:8080/tts \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "xtts_v2",
|
||||
"input": "Bonjour, je suis Ana Florence. Comment puis-je vous aider?"
|
||||
}' | aplay
|
||||
```
|
||||
|
Loading…
Reference in New Issue
Block a user