moved to sounddevice instead of pyaudio

2025-02-23 02:22:34 +00:00 · 2023-12-22 20:16:14 +01:00 · 2023-12-22 20:16:14 +01:00 · be2283cf34
commit be2283cf34
parent e0e99c5a74
1 changed file with 61 additions and 79 deletions
--- a/lollms/media.py
+++ b/lollms/media.py
@@ -14,8 +14,8 @@ import subprocess
 import os
 import threading
 if not PackageManager.check_package_installed("cv2"):
    os.system('sudo apt-get update')
    os.system('sudo apt-get install libgl1-mesa-glx python3-opencv -y')
    PackageManager.install_package("opencv-python")
 import cv2
@@ -41,28 +41,24 @@ import json
 import base64
 import io
 import numpy as np
 if not PackageManager.check_package_installed("sounddevice"):
    if platform.system() == "Windows":
        PackageManager.install_package("sounddevice")
    elif platform.system() == "Linux":
        subprocess.check_call(["sudo", "apt", "install", "-y", "portaudio19-dev python3-sounddevice"])
    elif platform.system() == "Darwin":
        subprocess.check_call(["brew", "install", "portaudio19-dev python3-sounddevice"])
    PackageManager.install_package("wave")
 import sounddevice as sd
 class AudioRecorder:
-    def __init__(self, socketio, filename, channels=1, sample_rate=16000, chunk_size=24678, silence_threshold=150.0, silence_duration=2, callback=None, lollmsCom:LoLLMsCom=None):
+    def __init__(self, socketio, filename, channels=1, sample_rate=16000, chunk_size=24678, silence_threshold=150.0, silence_duration=2, callback=None, lollmsCom=None):
        try:
            if not PackageManager.check_package_installed("pyaudio"):
                if platform.system() == "Windows":
                    PackageManager.install_package("pyaudio")
                elif platform.system() == "Linux":
                    subprocess.check_call(["sudo", "apt", "install", "-y", "portaudio19-dev python3-pyaudio"])
                elif platform.system() == "Darwin":
                    subprocess.check_call(["brew", "install", "portaudio19-dev python3-pyaudio"])
                PackageManager.install_package("wave")
            import pyaudio
            import wave
            self.socketio = socketio
            self.filename = filename
            self.channels = channels
            self.sample_rate = sample_rate
            self.chunk_size = chunk_size
            self.audio_format = pyaudio.paInt16
            self.audio_stream = None
            self.audio_frames = []
            self.is_recording = False
@@ -78,7 +74,6 @@ class AudioRecorder:
            self.channels = channels
            self.sample_rate = sample_rate
            self.chunk_size = chunk_size
            self.audio_format = None
            self.audio_stream = None
            self.audio_frames = []
            self.is_recording = False
@@ -88,91 +83,81 @@ class AudioRecorder:
            self.callback = callback
            self.lollmsCom = lollmsCom
            self.whisper_model = None
    def start_recording(self):
        if self.whisper_model is None:
            self.lollmsCom.info("Loading whisper model")
            self.whisper_model=whisper.load_model("base.en")
        try:
            import pyaudio
            self.is_recording = True
-            self.audio_stream = pyaudio.PyAudio().open(
+            self.audio_stream = sd.InputStream(
                format=self.audio_format,
                channels=self.channels,
-                rate=self.sample_rate,
+                samplerate=self.sample_rate,
-                input=True,
+                callback=self._record,
-                frames_per_buffer=self.chunk_size
+                blocksize=self.chunk_size
            )
            self.audio_stream.start()
            self.lollmsCom.info("Recording started...")
            threading.Thread(target=self._record).start()
        except:
            self.lollmsCom.error("No audio input found!")
-
+    def _record(self, indata, frames, time, status):
    def _record(self):
        first_recording = True  # Flag to track the first recording
        silence_duration = 5
        non_silent_start = None
        non_silent_end = None
        last_spectrogram_update = time.time()
        self.audio_frames = None
-        while self.is_recording:
+        buffered = np.array(indata)
-            data = self.audio_stream.read(self.chunk_size)
+        if self.audio_frames is not None:
-            buffered = np.frombuffer(data, dtype=np.int16)
+            self.audio_frames = np.concatenate([self.audio_frames, buffered])
-            if self.audio_frames is not None:
+        else:
-                self.audio_frames = np.concatenate([self.audio_frames,buffered])
+            self.audio_frames = buffered
            else:
                self.audio_frames = buffered        
            # Remove audio frames that are older than 30 seconds
            if len(self.audio_frames) > self.sample_rate * 30:
                self.audio_frames=self.audio_frames[-self.sample_rate * 30:]
-            # Update spectrogram every 3 seconds
+        # Remove audio frames that are older than 30 seconds
-            if time.time() - last_spectrogram_update >= 1:
+        if len(self.audio_frames) > self.sample_rate * 30:
-                self._update_spectrogram()
+            self.audio_frames=self.audio_frames[-self.sample_rate * 30:]
                last_spectrogram_update = time.time()
-            # Check for silence
+        # Update spectrogram every 3 seconds
-            rms = self._calculate_rms(buffered)
+        if time.time() - last_spectrogram_update >= 1:
-            if rms < self.silence_threshold:
+            self._update_spectrogram()
-                current_time = time.time()
+            last_spectrogram_update = time.time()
                if current_time - self.last_sound_time >= silence_duration:
                    if first_recording:
                        first_recording = False
                        silence_duration = self.silence_duration
-                    if self.callback and non_silent_start is not None and non_silent_end - non_silent_start >= 1:
+        # Check for silence
-                        self.lollmsCom.info("Analyzing")
+        rms = self._calculate_rms(buffered)
-                        # Convert to float
+        if rms < self.silence_threshold:
-                        import pyaudio
+            current_time = time.time()
-                        import wave
+            if current_time - self.last_sound_time >= silence_duration:
-                        audio_data = self.audio_frames.astype(np.float32)
+                if first_recording:
-                        audio = wave.open(str(self.filename), 'wb')
+                    first_recording = False
-                        audio.setnchannels(self.channels)
+                    silence_duration = self.silence_duration
                        audio.setsampwidth(pyaudio.PyAudio().get_sample_size(self.audio_format))
                        audio.setframerate(self.sample_rate)
                        audio.writeframes(b''.join(self.audio_frames[non_silent_start:non_silent_end]))
                        audio.close()
                        # Transcribe the audio using the whisper model
                        text = self.whisper_model.transcribe(audio_data[non_silent_start:non_silent_end])
-                        self.callback(text)
+                if self.callback and non_silent_start is not None and non_silent_end - non_silent_start >= 1:
-                        print(text["text"])
+                    self.lollmsCom.info("Analyzing")
                    # Convert to float
                    audio_data = self.audio_frames.astype(np.float32)
                    audio = wave.open(str(self.filename), 'wb')
                    audio.setnchannels(self.channels)
                    audio.setsampwidth(audio_stream.dtype.itemsize)
                    audio.setframerate(self.sample_rate)
                    audio.writeframes(b''.join(self.audio_frames[non_silent_start:non_silent_end]))
                    audio.close()
-                    self.last_sound_time = time.time()
+                    # Transcribe the audio using the whisper model
-                    non_silent_start = None
+                    text = self.whisper_model.transcribe(audio_data[non_silent_start:non_silent_end])
                    self.callback(text)
                    print(text["text"])
            else:
                self.last_sound_time = time.time()
-                if non_silent_start is None:
+                non_silent_start = None
-                    non_silent_start = len(self.audio_frames) - 1
+
-                non_silent_end = len(self.audio_frames)
+        else:
            self.last_sound_time = time.time()
            if non_silent_start is None:
                non_silent_start = len(self.audio_frames) - 1
            non_silent_end = len(self.audio_frames)
    def _update_spectrogram(self):
        audio_data = self.audio_frames[-self.sample_rate*30:]
@@ -208,17 +193,14 @@ class AudioRecorder:
            rms = 0
        return rms
    def stop_recording(self):
        self.is_recording = False
        if self.audio_stream:
-            self.audio_stream.stop_stream()
+            self.audio_stream.stop()
            self.audio_stream.close()
            import pyaudio
            import wave
            audio = wave.open(str(self.filename), 'wb')
            audio.setnchannels(self.channels)
-            audio.setsampwidth(pyaudio.PyAudio().get_sample_size(self.audio_format))
+            audio.setsampwidth(self.audio_stream.dtype.itemsize)
            audio.setframerate(self.sample_rate)
            audio.writeframes(b''.join(self.audio_frames))
            audio.close()