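good stuff: add XTTS generation settings to the configs (version 92), run a one-second test recording at startup when whisper is activated, make the socket.io client optional in AudioRecorder, and remove a trailing return from text2Audio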

2025-04-05 01:49:08 +00:00 · 2024-05-05 20:57:05 +02:00 · 2024-05-05 20:57:05 +02:00 · 1377c5b953
commit 1377c5b953
parent fd20ec2859
6 changed files with 47 additions and 11 deletions
--- a/configs/config.yaml
+++ b/configs/config.yaml
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file =========================== 
-version: 91
+version: 92
 binding_name: null
 model_name: null
 model_variant: null
@@ -88,6 +88,14 @@ xtts_use_streaming_mode: true
 auto_read: false
 xtts_current_voice: null
 xtts_current_language: en
+xtts_stream_chunk_size: 100
+xtts_temperature: 0.75
+xtts_length_penalty: 1.0
+xtts_repetition_penalty: 5.0
+xtts_top_k: 50
+xtts_top_p: 0.85
+xtts_speed: 1
+xtts_enable_text_splitting: true

 # Image generation service
 enable_sd_service: false
--- a/lollms/app.py
+++ b/lollms/app.py
@@ -26,6 +26,7 @@ import sys, os
 import platform
 import gc
 import yaml
+import time
 class LollmsApplication(LoLLMsCom):
    def __init__(
                    self, 
@@ -265,7 +266,15 @@ class LollmsApplication(LoLLMsCom):
                trace_exception(ex)
                self.warning(f"Couldn't load vllm")

-
+        if self.config.whisper_activate:
+            try:
+                from lollms.media import AudioRecorder
+                self.rec = AudioRecorder(self.lollms_paths.personal_outputs_path/"test.wav")
+                self.rec.start_recording()
+                time.sleep(1)
+                self.rec.stop_recording()
+            except:
+                pass
        if self.config.xtts_enable:
            try:
                from lollms.services.xtts.lollms_xtts import LollmsXTTS
--- a/lollms/configs/config.yaml
+++ b/lollms/configs/config.yaml
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file =========================== 
-version: 91
+version: 92
 binding_name: null
 model_name: null
 model_variant: null
@@ -88,6 +88,14 @@ xtts_use_streaming_mode: true
 auto_read: false
 xtts_current_voice: null
 xtts_current_language: en
+xtts_stream_chunk_size: 100
+xtts_temperature: 0.75
+xtts_length_penalty: 1.0
+xtts_repetition_penalty: 5.0
+xtts_top_k: 50
+xtts_top_p: 0.85
+xtts_speed: 1
+xtts_enable_text_splitting: true

 # Image generation service
 enable_sd_service: false
--- a/lollms/media.py
+++ b/lollms/media.py
@@ -79,11 +79,12 @@ from scipy.io.wavfile import write
 from matplotlib import pyplot as plt
 import numpy as np
 from scipy.signal import spectrogram
-
+from pathlib import Path
 class AudioRecorder:
-    def __init__(self, sio:socketio.Client, filename, channels=1, sample_rate=16000, chunk_size=24678, silence_threshold=150.0, silence_duration=2, callback=None, lollmsCom:LoLLMsCom=None, build_spectrogram=False, model = "base", transcribe=False):
+    def __init__(self, filename:Path, sio:socketio.Client=None, channels=1, sample_rate=16000, chunk_size=24678, silence_threshold=150.0, silence_duration=2, callback=None, lollmsCom:LoLLMsCom=None, build_spectrogram=False, model = "base", transcribe=False):
        self.sio = sio
-        self.filename = filename
+        self.filename = Path(filename)
+        self.filename.parent.mkdir(exist_ok=True, parents=True)
        self.channels = channels
        self.sample_rate = sample_rate
        self.chunk_size = chunk_size
@@ -138,7 +139,8 @@ class AudioRecorder:
            with open(transcription_fn, "w", encoding="utf-8") as f:
                f.write(result["text"])
            self.lollmsCom.info(f"File saved to {transcription_fn}")
-            run_async(partial(self.sio.emit,'transcript', result["text"]))
+            if self.sio:
+                run_async(partial(self.sio.emit,'transcript', result["text"]))
            return {"text":result["text"], "audio":transcription_fn}
        else:
            return {"text":""}
@@ -152,7 +154,8 @@ class AudioRecorder:
        plt.savefig(img_buffer, format='png')
        img_buffer.seek(0)
        img_base64 = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
-        run_async(partial(self.sio.emit,'update_spectrogram', img_base64))
+        if self.sio:
+            run_async(partial(self.sio.emit,'update_spectrogram', img_base64))
        self.last_spectrogram_update = time.perf_counter()
        plt.clf()

@@ -207,7 +210,8 @@ class WebcamImageSender:

                _, buffer = cv2.imencode('.jpg', frame)
                image_base64 = base64.b64encode(buffer)
-                run_async(partial(self.sio.emit,"video_stream_image", image_base64.decode('utf-8')))
+                if self.sio:
+                    run_async(partial(self.sio.emit,"video_stream_image", image_base64.decode('utf-8')))

            cap.release()
        except Exception as ex:
--- a/lollms/server/configs/config.yaml
+++ b/lollms/server/configs/config.yaml
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file =========================== 
-version: 91
+version: 92
 binding_name: null
 model_name: null
 model_variant: null
@@ -88,6 +88,14 @@ xtts_use_streaming_mode: true
 auto_read: false
 xtts_current_voice: null
 xtts_current_language: en
+xtts_stream_chunk_size: 100
+xtts_temperature: 0.75
+xtts_length_penalty: 1.0
+xtts_repetition_penalty: 5.0
+xtts_top_k: 50
+xtts_top_p: 0.85
+xtts_speed: 1
+xtts_enable_text_splitting: true

 # Image generation service
 enable_sd_service: false
--- a/lollms/server/endpoints/lollms_xtts.py
+++ b/lollms/server/endpoints/lollms_xtts.py
@@ -150,7 +150,6 @@ async def text2Audio(request: LollmsText2AudioRequest):
            else:
                lollmsElfServer.InfoMessage("xtts is not up yet.\nPlease wait for it to load then try again. This may take some time.") 
                return  {"status":False, "error":"Service not ready yet"} 
-            return {"url": url}
        except Exception as ex:
            trace_exception(ex)
            return {"url": None}