From 1377c5b953e47c69b1b44e09be0133f8eef8881d Mon Sep 17 00:00:00 2001
From: Saifeddine ALOUI <aloui.seifeddine@gmail.com>
Date: Sun, 5 May 2024 20:57:05 +0200
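Subject: [PATCH] Add XTTS generation settings; make AudioRecorder sio optional

- Bump the config version to 92 and add xtts_* generation settings
  (stream chunk size, temperature, length/repetition penalties, top_k,
  top_p, speed, text splitting) to the three config.yaml copies.
- AudioRecorder: take filename as the first argument, make sio optional,
  create the output directory, and skip socket emits when sio is unset.
- WebcamImageSender: skip the video_stream_image emit when sio is unset.
- LollmsApplication: when whisper_activate is set, make a one-second
  test recording at startup.
- text2Audio: remove the `return {"url": url}` line.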

---
 configs/config.yaml                    | 10 +++++++++-
 lollms/app.py                          | 11 ++++++++++-
 lollms/configs/config.yaml             | 10 +++++++++-
 lollms/media.py                        | 16 ++++++++++------
 lollms/server/configs/config.yaml      | 10 +++++++++-
 lollms/server/endpoints/lollms_xtts.py |  1 -
 6 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/configs/config.yaml b/configs/config.yaml
index 65c7f55..c0d554f 100644
--- a/configs/config.yaml
+++ b/configs/config.yaml
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file =========================== 
-version: 91
+version: 92
 binding_name: null
 model_name: null
 model_variant: null
@@ -88,6 +88,14 @@ xtts_use_streaming_mode: true
 auto_read: false
 xtts_current_voice: null
 xtts_current_language: en
+xtts_stream_chunk_size: 100
+xtts_temperature: 0.75
+xtts_length_penalty: 1.0
+xtts_repetition_penalty: 5.0
+xtts_top_k: 50
+xtts_top_p: 0.85
+xtts_speed: 1
+xtts_enable_text_splitting: true
 
 # Image generation service
 enable_sd_service: false
diff --git a/lollms/app.py b/lollms/app.py
index 026e937..29edf60 100644
--- a/lollms/app.py
+++ b/lollms/app.py
@@ -26,6 +26,7 @@ import sys, os
 import platform
 import gc
 import yaml
+import time
 class LollmsApplication(LoLLMsCom):
     def __init__(
                     self, 
@@ -265,7 +266,15 @@ class LollmsApplication(LoLLMsCom):
                 trace_exception(ex)
                 self.warning(f"Couldn't load vllm")
 
-
+        if self.config.whisper_activate:
+            try:
+                from lollms.media import AudioRecorder
+                self.rec = AudioRecorder(self.lollms_paths.personal_outputs_path/"test.wav")
+                self.rec.start_recording()
+                time.sleep(1)
+                self.rec.stop_recording()
+            except:
+                pass
         if self.config.xtts_enable:
             try:
                 from lollms.services.xtts.lollms_xtts import LollmsXTTS
diff --git a/lollms/configs/config.yaml b/lollms/configs/config.yaml
index 65c7f55..c0d554f 100644
--- a/lollms/configs/config.yaml
+++ b/lollms/configs/config.yaml
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file =========================== 
-version: 91
+version: 92
 binding_name: null
 model_name: null
 model_variant: null
@@ -88,6 +88,14 @@ xtts_use_streaming_mode: true
 auto_read: false
 xtts_current_voice: null
 xtts_current_language: en
+xtts_stream_chunk_size: 100
+xtts_temperature: 0.75
+xtts_length_penalty: 1.0
+xtts_repetition_penalty: 5.0
+xtts_top_k: 50
+xtts_top_p: 0.85
+xtts_speed: 1
+xtts_enable_text_splitting: true
 
 # Image generation service
 enable_sd_service: false
diff --git a/lollms/media.py b/lollms/media.py
index 2240728..2a9f628 100644
--- a/lollms/media.py
+++ b/lollms/media.py
@@ -79,11 +79,12 @@ from scipy.io.wavfile import write
 from matplotlib import pyplot as plt
 import numpy as np
 from scipy.signal import spectrogram
-
+from pathlib import Path
 class AudioRecorder:
-    def __init__(self, sio:socketio.Client, filename, channels=1, sample_rate=16000, chunk_size=24678, silence_threshold=150.0, silence_duration=2, callback=None, lollmsCom:LoLLMsCom=None, build_spectrogram=False, model = "base", transcribe=False):
+    def __init__(self, filename:Path, sio:socketio.Client=None, channels=1, sample_rate=16000, chunk_size=24678, silence_threshold=150.0, silence_duration=2, callback=None, lollmsCom:LoLLMsCom=None, build_spectrogram=False, model = "base", transcribe=False):
         self.sio = sio
-        self.filename = filename
+        self.filename = Path(filename)
+        self.filename.parent.mkdir(exist_ok=True, parents=True)
         self.channels = channels
         self.sample_rate = sample_rate
         self.chunk_size = chunk_size
@@ -138,7 +139,8 @@ class AudioRecorder:
             with open(transcription_fn, "w", encoding="utf-8") as f:
                 f.write(result["text"])
             self.lollmsCom.info(f"File saved to {transcription_fn}")
-            run_async(partial(self.sio.emit,'transcript', result["text"]))
+            if self.sio:
+                run_async(partial(self.sio.emit,'transcript', result["text"]))
             return {"text":result["text"], "audio":transcription_fn}
         else:
             return {"text":""}
@@ -152,7 +154,8 @@ class AudioRecorder:
         plt.savefig(img_buffer, format='png')
         img_buffer.seek(0)
         img_base64 = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
-        run_async(partial(self.sio.emit,'update_spectrogram', img_base64))
+        if self.sio:
+            run_async(partial(self.sio.emit,'update_spectrogram', img_base64))
         self.last_spectrogram_update = time.perf_counter()
         plt.clf()
 
@@ -207,7 +210,8 @@ class WebcamImageSender:
 
                 _, buffer = cv2.imencode('.jpg', frame)
                 image_base64 = base64.b64encode(buffer)
-                run_async(partial(self.sio.emit,"video_stream_image", image_base64.decode('utf-8')))
+                if self.sio:
+                    run_async(partial(self.sio.emit,"video_stream_image", image_base64.decode('utf-8')))
 
             cap.release()
         except Exception as ex:
diff --git a/lollms/server/configs/config.yaml b/lollms/server/configs/config.yaml
index 65c7f55..c0d554f 100644
--- a/lollms/server/configs/config.yaml
+++ b/lollms/server/configs/config.yaml
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file =========================== 
-version: 91
+version: 92
 binding_name: null
 model_name: null
 model_variant: null
@@ -88,6 +88,14 @@ xtts_use_streaming_mode: true
 auto_read: false
 xtts_current_voice: null
 xtts_current_language: en
+xtts_stream_chunk_size: 100
+xtts_temperature: 0.75
+xtts_length_penalty: 1.0
+xtts_repetition_penalty: 5.0
+xtts_top_k: 50
+xtts_top_p: 0.85
+xtts_speed: 1
+xtts_enable_text_splitting: true
 
 # Image generation service
 enable_sd_service: false
diff --git a/lollms/server/endpoints/lollms_xtts.py b/lollms/server/endpoints/lollms_xtts.py
index 5c1e42b..57308c5 100644
--- a/lollms/server/endpoints/lollms_xtts.py
+++ b/lollms/server/endpoints/lollms_xtts.py
@@ -150,7 +150,6 @@ async def text2Audio(request: LollmsText2AudioRequest):
             else:
                 lollmsElfServer.InfoMessage("xtts is not up yet.\nPlease wait for it to load then try again. This may take some time.") 
                 return  {"status":False, "error":"Service not ready yet"} 
-            return {"url": url}
         except Exception as ex:
             trace_exception(ex)
             return {"url": None}