fix(vall-e-x): Fix voice cloning (#1696)

Ettore Di Giacinto 2024-02-11 11:20:00 +01:00 committed by GitHub
parent 58cdf97361
commit fd68bf7084
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 7 additions and 3 deletions


@@ -55,6 +55,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             print("Preparing models, please wait", file=sys.stderr)
             # download and load all models
             preload_models()
+            self.clonedVoice = False
             # Assume directory from request.ModelFile.
             # Only if request.LoraAdapter it's not an absolute path
             if request.AudioPath and request.ModelFile != "" and not os.path.isabs(request.AudioPath):
@@ -65,6 +66,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             if request.AudioPath != "":
                 print("Generating model", file=sys.stderr)
                 make_prompt(name=model_name, audio_prompt_path=request.AudioPath)
+                self.clonedVoice = True
                 ### Use given transcript
                 ##make_prompt(name=model_name, audio_prompt_path="paimon_prompt.wav",
                 ##                transcript="Just, what was that? Paimon thought we were gonna get eaten.")
@@ -91,6 +93,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         try:
             audio_array = None
             if model != "":
+                if self.clonedVoice:
+                    model = os.path.basename(request.model)
                 audio_array = generate_audio(request.text, prompt=model)
             else:
                 audio_array = generate_audio(request.text)
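In short, the backend now remembers whether a voice prompt was created from request.AudioPath (self.clonedVoice) and, when it was, passes only the basename of request.model to generate_audio, so it matches the name the prompt was registered under by make_prompt. A minimal sketch of that resolution logic (not the backend itself, with hypothetical paths) might look like:

```
import os
from typing import Optional

def resolve_prompt_name(model: str, cloned_voice: bool) -> Optional[str]:
    """Return the prompt name to hand to generate_audio(), or None for the default voice."""
    if model == "":
        return None
    if cloned_voice:
        # make_prompt() registered the cloned prompt under the model's basename
        return os.path.basename(model)
    return model

# Hypothetical values for illustration only
assert resolve_prompt_name("/models/cloned-voice", cloned_voice=True) == "cloned-voice"
assert resolve_prompt_name("", cloned_voice=False) is None
```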


@@ -144,15 +144,15 @@ parameters:
   model: "cloned-voice"
 vall-e:
   # The path to the audio file to be cloned
   # relative to the models directory
-  audio_path: "path-to-wav-source.wav"
+  # Max 15s
+  audio_path: "audio-sample.wav"
 ```
 Then you can specify the model name in the requests:
 ```
 curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
+     "backend": "vall-e-x",
      "model": "cloned-voice",
      "input":"Hello!"
 }' | aplay
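
The same request can also be issued from Python. This is a hedged sketch assuming a LocalAI instance listening on http://localhost:8080 and the "cloned-voice" model configured as in the YAML snippet above:

```
import requests

# Call the LocalAI /tts endpoint with the cloned voice and save the result.
payload = {
    "backend": "vall-e-x",
    "model": "cloned-voice",
    "input": "Hello!",
}
resp = requests.post("http://localhost:8080/tts", json=payload, timeout=300)
resp.raise_for_status()

with open("output.wav", "wb") as f:
    f.write(resp.content)  # the WAV audio that `curl ... | aplay` would play
```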