TTS API improvements (#2308)

* update doc on COQUI_LANGUAGE env variable Signed-off-by: blob42 <contact@blob42.xyz> * return errors from tts gRPC backend Signed-off-by: blob42 <contact@blob42.xyz> * handle speaker_id and language in coqui TTS backend Signed-off-by: blob42 <contact@blob42.xyz> * TTS endpoint: add optional language parameter Signed-off-by: blob42 <contact@blob42.xyz> * tts fix: empty language string breaks non-multilingual models Signed-off-by: blob42 <contact@blob42.xyz> * allow tts param definition in config file - consolidate TTS options under `tts` config entry Signed-off-by: blob42 <contact@blob42.xyz> * tts: update doc Signed-off-by: blob42 <contact@blob42.xyz> --------- Signed-off-by: blob42 <contact@blob42.xyz> Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-06-14 13:08:08 +00:00 · 2024-06-01 20:26:27 +02:00
parent 95c65d67f5
commit b99182c8d4
10 changed files with 166 additions and 78 deletions
--- a/2
+++ b/2
@ -447,7 +447,7 @@ protogen-clean: protogen-go-clean protogen-python-clean
 .PHONY: protogen-go
 protogen-go:
 	mkdir -p pkg/grpc/proto
-	protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
+	protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
    backend/backend.proto
 .PHONY: protogen-go-clean
--- a/backend/backend.proto
+++ b/backend/backend.proto
@ -266,6 +266,7 @@ message TTSRequest {
  string model = 2;
  string dst = 3;
  string voice = 4;
  optional string language = 5;
 }
 message TokenizationResponse {
--- a/backend/python/coqui/backend.py
+++ b/backend/python/coqui/backend.py
@ -66,7 +66,21 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
    def TTS(self, request, context):
        """Synthesize request.text into the audio file at request.dst.

        The language is taken from request.language, falling back to the
        module-level COQUI_LANGUAGE; an empty value counts as "not provided".
        For a multi-speaker model, a non-empty request.voice is passed as the
        speaker id; in every other case self.AudioPath is passed as speaker_wav.

        Returns a backend_pb2.Result: success=False with a message when a
        required language or speaker is missing or when synthesis raises,
        success=True otherwise.
        """
        try:
            # Protobuf string fields are never None: an unset field reads as "".
            # Emptiness is therefore tested by truthiness, and "" is mapped to
            # None so that non-multilingual models do not receive an empty language.
            lang = request.language or COQUI_LANGUAGE or None
            if self.tts.is_multi_lingual and lang is None:
                return backend_pb2.Result(success=False, message="Model is multi-lingual, but no language was provided")

            # Same normalization for the speaker id carried in request.voice.
            voice = request.voice or None
            if self.tts.is_multi_speaker and self.AudioPath is None and voice is None:
                return backend_pb2.Result(success=False, message="Model is multi-speaker, but no speaker was provided")

            if self.tts.is_multi_speaker and voice is not None:
                # A speaker id was supplied: select that speaker.
                self.tts.tts_to_file(text=request.text, speaker=voice, language=lang, file_path=request.dst)
            else:
                # No speaker id: use the speaker wav stored on the servicer.
                self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=lang, file_path=request.dst)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(success=True)
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@ -29,7 +29,16 @@ func generateUniqueFileName(dir, baseName, ext string) string {
 	}
 }
-func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (string, *proto.Result, error) {
+func ModelTTS(
 	backend,
 	text,
 	modelFile,
 	voice ,
 	language string,
 	loader *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
 	backendConfig config.BackendConfig,
 ) (string, *proto.Result, error) {
 	bb := backend
 	if bb == "" {
 		bb = model.PiperBackend
@ -83,7 +92,13 @@ func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader,
 		Model: modelPath,
 		Voice: voice,
 		Dst:   filePath,
 		Language: &language,
 	})
 	// NOTE(review): err is assumed to come from the TTS call that ends just
 	// above (its first line is outside this hunk). When it is non-nil, res may
 	// be nil, so return before dereferencing res.
 	if err != nil {
 		return "", nil, err
 	}
 	// The call went through but the backend reported a failure: surface its
 	// message. The explicit "%s" keeps a '%' inside the message from being
 	// interpreted as a format verb.
 	if !res.Success {
 		return "", nil, fmt.Errorf("%s", res.Message)
 	}
 	return filePath, res, nil
 }
--- a/core/cli/tts.go
+++ b/core/cli/tts.go
@ -20,6 +20,7 @@ type TTSCMD struct {
 	Backend           string `short:"b" default:"piper" help:"Backend to run the TTS model"`
 	Model             string `short:"m" required:"" help:"Model name to run the TTS"`
 	Voice             string `short:"v" help:"Voice name to run the TTS"`
 	Language          string `short:"l" help:"Language to use with the TTS"`
 	OutputFile        string `short:"o" type:"path" help:"The path to write the output wav file"`
 	ModelsPath        string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
 	BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
@ -52,7 +53,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
 	options := config.BackendConfig{}
 	options.SetDefaults()
-	filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, ml, opts, options)
+	filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, t.Language, ml, opts, options)
 	if err != nil {
 		return err
 	}
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@ -15,6 +15,15 @@ const (
 	RAND_SEED = -1
 )
 // TTSConfig groups the text-to-speech specific options of a model
 // configuration.
 type TTSConfig struct {
 	// Voice is either the path to a voice wav file or a speaker id.
 	Voice string `yaml:"voice"`
 	// VallE holds the Vall-e-x settings, read from the `vall-e` key.
 	VallE VallE `yaml:"vall-e"`
 }
 type BackendConfig struct {
 	schema.PredictionOptions `yaml:"parameters"`
 	Name                     string `yaml:"name"`
@ -49,8 +58,8 @@ type BackendConfig struct {
 	// GRPC Options
 	GRPC GRPC `yaml:"grpc"`
-	// Vall-e-x
+	// TTS specifics
-	VallE VallE `yaml:"vall-e"`
+	TTSConfig `yaml:"tts"`
 	// CUDA
 	// Explicitly enable CUDA or not (some backends might need it)
--- a/core/http/endpoints/elevenlabs/tts.go
+++ b/core/http/endpoints/elevenlabs/tts.go
@ -52,7 +52,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
 		}
 		log.Debug().Msgf("Request for model: %s", modelFile)
-		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, voiceID, ml, appConfig, *cfg)
+		// ModelTTS takes (backend, text, modelFile, voice, language, ...): the
+		// voice id belongs in the voice slot, and this call passes no language.
+		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, voiceID, "", ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
--- a/core/http/endpoints/localai/tts.go
+++ b/core/http/endpoints/localai/tts.go
@ -13,9 +13,12 @@ import (
 // TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
 //	@Summary	Generates audio from the input text.
 //  @Accept json
 //  @Produce audio/x-wav
 //	@Param		request	body		schema.TTSRequest	true	"query params"
-// @Success 200 {string} binary	 "Response"
+//	@Success	200		{string}	binary				"generated audio/wav file"
 //	@Router		/v1/audio/speech [post]
 //	@Router		/tts [post]
 func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
@ -40,6 +43,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
 		)
 		if err != nil {
 			log.Err(err)
 			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		} else {
@ -51,7 +55,15 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
 			cfg.Backend = input.Backend
 		}
-		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, input.Voice, ml, appConfig, *cfg)
+		if input.Language != "" {
 			cfg.Language = input.Language
 		}
 		if input.Voice != "" {
 			cfg.Voice = input.Voice
 		}
 		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, cfg.Voice, cfg.Language, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
--- a/core/schema/localai.go
+++ b/core/schema/localai.go
@ -14,11 +14,13 @@ type BackendMonitorResponse struct {
 	CPUPercent    float64
 }
 // TTSRequest is the request body for text-to-speech generation; it can be
 // decoded from JSON or YAML.
 // @Description TTS request body
 type TTSRequest struct {
-	Model   string `json:"model" yaml:"model"`
+	Model    string `json:"model" yaml:"model"` // model name or full path
-	Input   string `json:"input" yaml:"input"`
+	Input    string `json:"input" yaml:"input"` // text input
-	Voice   string `json:"voice" yaml:"voice"`
+	Voice    string `json:"voice" yaml:"voice"` // voice audio file or speaker id
 	Backend  string `json:"backend" yaml:"backend"` // backend name; presumably selects the backend that runs the model (confirm against the endpoint)
 	Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model
 }
 type StoresSet struct {
--- a/docs/content/docs/features/text-to-audio.md
+++ b/docs/content/docs/features/text-to-audio.md
@ -46,6 +46,10 @@ Coqui works without any configuration, to test it, you can run the following cur
        }'
 ```
 You can use the env variable COQUI_LANGUAGE to set the language used by the coqui backend.
 You can also use config files to configure TTS models (see the section below on how to use config files).
 ### Bark
 [Bark](https://github.com/suno-ai/bark) lets you generate audio from text prompts.
@ -148,6 +152,7 @@ name: cloned-voice
 backend: vall-e-x
 parameters:
  model: "cloned-voice"
 tts:
    vall-e:
      # The path to the audio file to be cloned
      # relative to the models directory
@ -164,6 +169,35 @@ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
   }' | aplay
 ```
-## Parler-tts
+### Parler-tts
 The `parler-tts` model can be installed and configured directly from the gallery. See https://github.com/huggingface/parler-tts for details.
 ## Using config files
 You can also use a `config-file` to specify TTS models and their parameters.
 In the following example we define a custom config to load the `xtts_v2` model, and specify a voice and language.
 ```yaml
 name: xtts_v2
 backend: coqui
 parameters:
  language: fr
  model: tts_models/multilingual/multi-dataset/xtts_v2
 tts:
  voice: Ana Florence
 ```
 With this config, you can now use the following curl command to generate a text-to-speech audio file:
 ```bash
 curl -L http://localhost:8080/tts \
    -H "Content-Type: application/json" \
    -d '{
 "model": "xtts_v2",
 "input": "Bonjour, je suis Ana Florence. Comment puis-je vous aider?"
 }' | aplay
 ```