diff --git a/configs/config.yaml b/configs/config.yaml index 6b93dcb..49a783a 100644 --- a/configs/config.yaml +++ b/configs/config.yaml @@ -1,5 +1,5 @@ # =================== Lord Of Large Language Multimodal Systems Configuration file =========================== -version: 128 +version: 129 binding_name: null model_name: null model_variant: null @@ -101,7 +101,7 @@ copy_to_clipboard_add_all_details: false # -------------------- Services global configurations -------------------------- # Select the active test to speach, text to image and speach to text services -active_tts_service: "None" # xtts (offline), openai_tts (API key required) +active_tts_service: "None" # xtts (offline), openai_tts (API key required), elevenlabs_tts (API key required) active_tti_service: "None" # autosd (offline), dall-e (online) active_stt_service: "None" # whisper (offline), asr (offline or online), openai_whiosper (API key required) active_ttm_service: "None" # musicgen (offline) @@ -161,6 +161,12 @@ openai_tts_key: "" openai_tts_model: "tts-1" openai_tts_voice: "alloy" + +elevenlabs_tts_key: "" +elevenlabs_tts_model_id: "eleven_monolingual_v1" +elevenlabs_tts_voice_stability: 0.5 +elevenlabs_tts_voice_boost: 0.5 +elevenlabs_tts_voice_id: EXAVITQu4vr4xnSDxMaL # ***************** TTI ***************** use_negative_prompt: true diff --git a/lollms/app.py b/lollms/app.py index 5ed5384..dcce264 100644 --- a/lollms/app.py +++ b/lollms/app.py @@ -403,7 +403,11 @@ class LollmsApplication(LoLLMsCom): ASCIIColors.blue("Activating TTS services") - if self.config.active_tts_service == "openai_tts": + + if self.config.active_tts_service == "eleven_labs_tts": + from lollms.services.eleven_labs_tts.lollms_eleven_labs_tts import LollmsElevenLabsTTS + self.tts = LollmsElevenLabsTTS(self, self.config.elevenlabs_tts_model_id, self.config.elevenlabs_tts_voice_id, self.config.elevenlabs_tts_key, stability=self.config.elevenlabs_tts_voice_stability, 
similarity_boost=self.config.elevenlabs_tts_voice_similarity_boost) + elif self.config.active_tts_service == "openai_tts": from lollms.services.open_ai_tts.lollms_openai_tts import LollmsOpenAITTS self.tts = LollmsOpenAITTS(self, self.config.openai_tts_model, self.config.openai_tts_voice, self.config.openai_tts_key) elif self.config.active_tts_service == "xtts" and self.xtts: @@ -517,7 +521,10 @@ class LollmsApplication(LoLLMsCom): self.tti = LollmsComfyUI(self, comfyui_base_url=self.config.comfyui_base_url) ASCIIColors.blue("Activating TTS service") - if self.config.active_tts_service == "openai_tts" and (self.tts is None or self.tts.name!="openai_tts"): + if self.config.active_tts_service == "eleven_labs_tts": + from lollms.services.eleven_labs_tts.lollms_eleven_labs_tts import LollmsElevenLabsTTS + self.tts = LollmsElevenLabsTTS(self, self.config.elevenlabs_tts_model_id, self.config.elevenlabs_tts_voice_id, self.config.elevenlabs_tts_key, stability=self.config.elevenlabs_tts_voice_stability, similarity_boost=self.config.elevenlabs_tts_voice_similarity_boost) + elif self.config.active_tts_service == "openai_tts" and (self.tts is None or self.tts.name!="openai_tts"): from lollms.services.open_ai_tts.lollms_openai_tts import LollmsOpenAITTS self.tts = LollmsOpenAITTS(self, self.config.openai_tts_model, self.config.openai_tts_voice, self.config.openai_tts_key) elif self.config.active_tts_service == "xtts" and self.xtts: diff --git a/lollms/configs/config.yaml b/lollms/configs/config.yaml index 6b93dcb..49a783a 100644 --- a/lollms/configs/config.yaml +++ b/lollms/configs/config.yaml @@ -1,5 +1,5 @@ # =================== Lord Of Large Language Multimodal Systems Configuration file =========================== -version: 128 +version: 129 binding_name: null model_name: null model_variant: null @@ -101,7 +101,7 @@ copy_to_clipboard_add_all_details: false # -------------------- Services global configurations -------------------------- # Select the active test to speach, 
text to image and speach to text services -active_tts_service: "None" # xtts (offline), openai_tts (API key required) +active_tts_service: "None" # xtts (offline), openai_tts (API key required), elevenlabs_tts (API key required) active_tti_service: "None" # autosd (offline), dall-e (online) active_stt_service: "None" # whisper (offline), asr (offline or online), openai_whiosper (API key required) active_ttm_service: "None" # musicgen (offline) @@ -161,6 +161,12 @@ openai_tts_key: "" openai_tts_model: "tts-1" openai_tts_voice: "alloy" + +elevenlabs_tts_key: "" +elevenlabs_tts_model_id: "eleven_monolingual_v1" +elevenlabs_tts_voice_stability: 0.5 +elevenlabs_tts_voice_boost: 0.5 +elevenlabs_tts_voice_id: EXAVITQu4vr4xnSDxMaL # ***************** TTI ***************** use_negative_prompt: true diff --git a/lollms/services/eleven_labs_tts/lollms_eleven_labs_tts.py b/lollms/services/eleven_labs_tts/lollms_eleven_labs_tts.py new file mode 100644 index 0000000..d3af5b1 --- /dev/null +++ b/lollms/services/eleven_labs_tts/lollms_eleven_labs_tts.py @@ -0,0 +1,132 @@ +# Title LollmsOpenAITTS +# Licence: MIT +# Author : Paris Neo +# Uses open AI api to perform text to speech +# + +from pathlib import Path +import sys +from lollms.app import LollmsApplication +from lollms.paths import LollmsPaths +from lollms.config import TypedConfig, ConfigTemplate, BaseConfig +import time +import io +import sys +import requests +import os +import base64 +import subprocess +import time +import json +import platform +from dataclasses import dataclass +from PIL import Image, PngImagePlugin +from enum import Enum +from typing import List, Dict, Any + +from ascii_colors import ASCIIColors, trace_exception +from lollms.paths import LollmsPaths +from lollms.utilities import PackageManager, find_next_available_filename +from lollms.tts import LollmsTTS +import subprocess +import shutil +from tqdm import tqdm +import threading +from io import BytesIO +from openai import OpenAI + +if not 
if not PackageManager.check_package_installed("sounddevice"):
    PackageManager.install_package("sounddevice")
if not PackageManager.check_package_installed("soundfile"):
    PackageManager.install_package("soundfile")

import sounddevice as sd
import soundfile as sf


def get_Whisper(lollms_paths: LollmsPaths):
    """Return the ElevenLabs TTS service class.

    ``lollms_paths`` is accepted but not used. The function name does not
    match what it returns (it looks copy-pasted from another service); it is
    kept unchanged so existing callers keep working.
    """
    return LollmsElevenLabsTTS


class LollmsElevenLabsTTS(LollmsTTS):
    """Text-to-speech service backed by the ElevenLabs HTTP API.

    The service registers itself with the base class under the name
    ``"elevenlabs_tts"``.

    NOTE(review): code that selects this service must compare against that
    exact name and must read configuration keys that actually exist in
    ``config.yaml`` -- confirm the activation code in ``lollms/app.py``.
    NOTE(review): the ``model_id`` default below differs from the
    ``eleven_monolingual_v1`` value shipped in ``config.yaml`` -- confirm
    which identifier the API accepts.
    """

    # Endpoint prefix; the voice id (and "/stream" when streaming) is appended.
    API_BASE_URL = "https://api.elevenlabs.io/v1/text-to-speech"
    # Seconds to wait for the API so a stalled connection cannot hang the app.
    REQUEST_TIMEOUT = 60
    # Number of bytes written to disk per iteration while downloading audio.
    CHUNK_SIZE = 8192

    def __init__(
        self,
        app: LollmsApplication,
        model_id: str = "eleven_monolingual_v2",
        voice_id: str = "EXAVITQu4vr4xnSDxMaL",
        api_key: str = "",
        output_path: Path | str = None,
        stability: float = 0.5,
        similarity_boost: float = 0.5,
        streaming: bool = False
    ):
        """Store the ElevenLabs settings and mark the service as ready.

        Args:
            app: The running lollms application, forwarded to ``LollmsTTS``.
            model_id: Value sent as ``model_id`` in every request.
            voice_id: Voice identifier inserted in the request URL.
            api_key: Value sent in the ``xi-api-key`` header.
            output_path: Forwarded to ``LollmsTTS``; if it is an existing
                directory, files generated without an explicit path are
                created there.
            stability: ``voice_settings.stability`` sent with every request.
            similarity_boost: ``voice_settings.similarity_boost`` sent with
                every request.
            streaming: When true, the ``/stream`` endpoint is used and the
                audio is written to disk chunk by chunk as it arrives.
        """
        super().__init__("elevenlabs_tts", app, model_id, voice_id, api_key, output_path)
        self.voice_id = voice_id
        self.model_id = model_id
        self.api_key = api_key
        self.output_path = output_path
        self.stability = stability
        self.similarity_boost = similarity_boost
        self.streaming = streaming
        self.ready = True

    def _resolve_output_file(self, file_name_or_path: Path | str | None) -> Path | str:
        """Return the path to write to, creating a fresh file when none is given."""
        if file_name_or_path is not None:
            return file_name_or_path

        # Local import: only this fallback path needs it.
        import tempfile

        target_dir = None
        if self.output_path is not None and Path(self.output_path).is_dir():
            target_dir = str(self.output_path)
        # ".mp3" matches the API's default output format (see ElevenLabs docs).
        descriptor, generated = tempfile.mkstemp(
            prefix="elevenlabs_tts_", suffix=".mp3", dir=target_dir
        )
        # mkstemp returns an open descriptor; the file is reopened by path later.
        os.close(descriptor)
        return Path(generated)

    def _synthesize_to_file(self, text: str, file_name_or_path: Path | str | None) -> Path | str:
        """Request speech for ``text`` and save the returned audio to disk.

        Shared by ``tts_file`` and ``tts_audio`` so both behave identically.

        Raises:
            requests.HTTPError: The API answered with an error status. Nothing
                is written in that case, so an error payload is never saved
                as if it were audio.
            requests.RequestException: The request failed or timed out.
        """
        speech_file_path = self._resolve_output_file(file_name_or_path)
        payload = {
            "text": text,
            "model_id": self.model_id,
            "voice_settings": {
                "stability": self.stability,
                "similarity_boost": self.similarity_boost
            }
        }
        headers = {
            "xi-api-key": self.api_key,
            "Content-Type": "application/json"
        }

        url = f"{self.API_BASE_URL}/{self.voice_id}"
        if self.streaming:
            url = f"{url}/stream"

        with requests.post(
            url,
            json=payload,
            headers=headers,
            stream=self.streaming,
            timeout=self.REQUEST_TIMEOUT,
        ) as response:
            # Check the status before opening the file so a failed call
            # leaves no bogus audio file behind.
            response.raise_for_status()
            with open(speech_file_path, "wb") as audio_file:
                # iter_content works for both streamed and fully-downloaded
                # responses, so one write loop covers both modes.
                for chunk in response.iter_content(chunk_size=self.CHUNK_SIZE):
                    if chunk:
                        audio_file.write(chunk)

        return speech_file_path

    def tts_file(self, text, speaker=None, file_name_or_path: Path | str = None, language="en", use_threading=False):
        """Synthesize ``text`` and save the audio to a file.

        Args:
            text: The text to convert to speech.
            speaker: Accepted but not used; the voice is the ``voice_id``
                given at construction time.
            file_name_or_path: Destination file. When ``None`` a new file is
                created instead of failing.
            language: Accepted but not used.
            use_threading: Accepted but not used; the call is synchronous.

        Returns:
            The path of the written audio file.

        Raises:
            requests.RequestException: The API call failed or returned an
                error status.
        """
        return self._synthesize_to_file(text, file_name_or_path)

    def tts_audio(self, text, speaker: str = None, file_name_or_path: Path | str = None, language="en", use_threading=False):
        """Synthesize ``text``, save it to a file and play it.

        The call blocks until playback has finished. Parameters have the same
        meaning as in ``tts_file``; ``speaker``, ``language`` and
        ``use_threading`` are accepted but not used.

        Raises:
            requests.RequestException: The API call failed or returned an
                error status.
        """
        speech_file_path = self._synthesize_to_file(text, file_name_or_path)

        # Decode the saved file and play it on the default output device.
        samples, sample_rate = sf.read(speech_file_path, dtype='float32')
        sd.play(samples, sample_rate)
        # Wait until the file is done playing.
        sd.wait()