From 1377c5b953e47c69b1b44e09be0133f8eef8881d Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Sun, 5 May 2024 20:57:05 +0200 Subject: [PATCH] good stuff --- configs/config.yaml | 10 +++++++++- lollms/app.py | 11 ++++++++++- lollms/configs/config.yaml | 10 +++++++++- lollms/media.py | 16 ++++++++++------ lollms/server/configs/config.yaml | 10 +++++++++- lollms/server/endpoints/lollms_xtts.py | 1 - 6 files changed, 47 insertions(+), 11 deletions(-) diff --git a/configs/config.yaml b/configs/config.yaml index 65c7f55..c0d554f 100644 --- a/configs/config.yaml +++ b/configs/config.yaml @@ -1,5 +1,5 @@ # =================== Lord Of Large Language Multimodal Systems Configuration file =========================== -version: 91 +version: 92 binding_name: null model_name: null model_variant: null @@ -88,6 +88,14 @@ xtts_use_streaming_mode: true auto_read: false xtts_current_voice: null xtts_current_language: en +xtts_stream_chunk_size: 100 +xtts_temperature: 0.75 +xtts_length_penalty: 1.0 +xtts_repetition_penalty: 5.0 +xtts_top_k: 50 +xtts_top_p: 0.85 +xtts_speed: 1 +xtts_enable_text_splitting: true # Image generation service enable_sd_service: false diff --git a/lollms/app.py b/lollms/app.py index 026e937..29edf60 100644 --- a/lollms/app.py +++ b/lollms/app.py @@ -26,6 +26,7 @@ import sys, os import platform import gc import yaml +import time class LollmsApplication(LoLLMsCom): def __init__( self, @@ -265,7 +266,15 @@ class LollmsApplication(LoLLMsCom): trace_exception(ex) self.warning(f"Couldn't load vllm") - + if self.config.whisper_activate: + try: + from lollms.media import AudioRecorder + self.rec = AudioRecorder(self.lollms_paths.personal_outputs_path/"test.wav") + self.rec.start_recording() + time.sleep(1) + self.rec.stop_recording() + except: + pass if self.config.xtts_enable: try: from lollms.services.xtts.lollms_xtts import LollmsXTTS diff --git a/lollms/configs/config.yaml b/lollms/configs/config.yaml index 65c7f55..c0d554f 100644 --- a/lollms/configs/config.yaml +++ b/lollms/configs/config.yaml @@ -1,5 +1,5 @@ # =================== Lord Of Large Language Multimodal Systems Configuration file =========================== -version: 91 +version: 92 binding_name: null model_name: null model_variant: null @@ -88,6 +88,14 @@ xtts_use_streaming_mode: true auto_read: false xtts_current_voice: null xtts_current_language: en +xtts_stream_chunk_size: 100 +xtts_temperature: 0.75 +xtts_length_penalty: 1.0 +xtts_repetition_penalty: 5.0 +xtts_top_k: 50 +xtts_top_p: 0.85 +xtts_speed: 1 +xtts_enable_text_splitting: true # Image generation service enable_sd_service: false diff --git a/lollms/media.py b/lollms/media.py index 2240728..2a9f628 100644 --- a/lollms/media.py +++ b/lollms/media.py @@ -79,11 +79,12 @@ from scipy.io.wavfile import write from matplotlib import pyplot as plt import numpy as np from scipy.signal import spectrogram - +from pathlib import Path class AudioRecorder: - def __init__(self, sio:socketio.Client, filename, channels=1, sample_rate=16000, chunk_size=24678, silence_threshold=150.0, silence_duration=2, callback=None, lollmsCom:LoLLMsCom=None, build_spectrogram=False, model = "base", transcribe=False): + def __init__(self, filename:Path, sio:socketio.Client=None, channels=1, sample_rate=16000, chunk_size=24678, silence_threshold=150.0, silence_duration=2, callback=None, lollmsCom:LoLLMsCom=None, build_spectrogram=False, model = "base", transcribe=False): self.sio = sio - self.filename = filename + self.filename = Path(filename) + self.filename.parent.mkdir(exist_ok=True, parents=True) self.channels = channels self.sample_rate = sample_rate self.chunk_size = chunk_size @@ -138,7 +139,8 @@ class AudioRecorder: with open(transcription_fn, "w", encoding="utf-8") as f: f.write(result["text"]) self.lollmsCom.info(f"File saved to {transcription_fn}") - run_async(partial(self.sio.emit,'transcript', result["text"])) + if self.sio: + run_async(partial(self.sio.emit,'transcript', result["text"])) return {"text":result["text"], "audio":transcription_fn} else: return {"text":""} @@ -152,7 +154,8 @@ class AudioRecorder: plt.savefig(img_buffer, format='png') img_buffer.seek(0) img_base64 = base64.b64encode(img_buffer.getvalue()).decode('utf-8') - run_async(partial(self.sio.emit,'update_spectrogram', img_base64)) + if self.sio: + run_async(partial(self.sio.emit,'update_spectrogram', img_base64)) self.last_spectrogram_update = time.perf_counter() plt.clf() @@ -207,7 +210,8 @@ class WebcamImageSender: _, buffer = cv2.imencode('.jpg', frame) image_base64 = base64.b64encode(buffer) - run_async(partial(self.sio.emit,"video_stream_image", image_base64.decode('utf-8'))) + if self.sio: + run_async(partial(self.sio.emit,"video_stream_image", image_base64.decode('utf-8'))) cap.release() except Exception as ex: diff --git a/lollms/server/configs/config.yaml b/lollms/server/configs/config.yaml index 65c7f55..c0d554f 100644 --- a/lollms/server/configs/config.yaml +++ b/lollms/server/configs/config.yaml @@ -1,5 +1,5 @@ # =================== Lord Of Large Language Multimodal Systems Configuration file =========================== -version: 91 +version: 92 binding_name: null model_name: null model_variant: null @@ -88,6 +88,14 @@ xtts_use_streaming_mode: true auto_read: false xtts_current_voice: null xtts_current_language: en +xtts_stream_chunk_size: 100 +xtts_temperature: 0.75 +xtts_length_penalty: 1.0 +xtts_repetition_penalty: 5.0 +xtts_top_k: 50 +xtts_top_p: 0.85 +xtts_speed: 1 +xtts_enable_text_splitting: true # Image generation service enable_sd_service: false diff --git a/lollms/server/endpoints/lollms_xtts.py b/lollms/server/endpoints/lollms_xtts.py index 5c1e42b..57308c5 100644 --- a/lollms/server/endpoints/lollms_xtts.py +++ b/lollms/server/endpoints/lollms_xtts.py @@ -150,7 +150,6 @@ async def text2Audio(request: LollmsText2AudioRequest): else: lollmsElfServer.InfoMessage("xtts is not up yet.\nPlease wait for it to load then try again. This may take some time.") return {"status":False, "error":"Service not ready yet"} - return {"url": url} except Exception as ex: trace_exception(ex) return {"url": None}