Moved from pyaudio to the sounddevice library for audio capture.

This commit is contained in:
Saifeddine ALOUI 2023-12-22 20:16:14 +01:00
parent e0e99c5a74
commit be2283cf34

View File

@ -14,8 +14,8 @@ import subprocess
import os import os
import threading import threading
if not PackageManager.check_package_installed("cv2"): if not PackageManager.check_package_installed("cv2"):
os.system('sudo apt-get update')
os.system('sudo apt-get install libgl1-mesa-glx python3-opencv -y') os.system('sudo apt-get install libgl1-mesa-glx python3-opencv -y')
PackageManager.install_package("opencv-python")
import cv2 import cv2
@ -41,28 +41,24 @@ import json
import base64 import base64
import io import io
import numpy as np import numpy as np
if not PackageManager.check_package_installed("sounddevice"):
if platform.system() == "Windows":
PackageManager.install_package("sounddevice")
elif platform.system() == "Linux":
subprocess.check_call(["sudo", "apt", "install", "-y", "portaudio19-dev python3-sounddevice"])
elif platform.system() == "Darwin":
subprocess.check_call(["brew", "install", "portaudio19-dev python3-sounddevice"])
PackageManager.install_package("wave")
import sounddevice as sd
class AudioRecorder: class AudioRecorder:
def __init__(self, socketio, filename, channels=1, sample_rate=16000, chunk_size=24678, silence_threshold=150.0, silence_duration=2, callback=None, lollmsCom:LoLLMsCom=None): def __init__(self, socketio, filename, channels=1, sample_rate=16000, chunk_size=24678, silence_threshold=150.0, silence_duration=2, callback=None, lollmsCom=None):
try: try:
if not PackageManager.check_package_installed("pyaudio"):
if platform.system() == "Windows":
PackageManager.install_package("pyaudio")
elif platform.system() == "Linux":
subprocess.check_call(["sudo", "apt", "install", "-y", "portaudio19-dev python3-pyaudio"])
elif platform.system() == "Darwin":
subprocess.check_call(["brew", "install", "portaudio19-dev python3-pyaudio"])
PackageManager.install_package("wave")
import pyaudio
import wave
self.socketio = socketio self.socketio = socketio
self.filename = filename self.filename = filename
self.channels = channels self.channels = channels
self.sample_rate = sample_rate self.sample_rate = sample_rate
self.chunk_size = chunk_size self.chunk_size = chunk_size
self.audio_format = pyaudio.paInt16
self.audio_stream = None self.audio_stream = None
self.audio_frames = [] self.audio_frames = []
self.is_recording = False self.is_recording = False
@ -78,7 +74,6 @@ class AudioRecorder:
self.channels = channels self.channels = channels
self.sample_rate = sample_rate self.sample_rate = sample_rate
self.chunk_size = chunk_size self.chunk_size = chunk_size
self.audio_format = None
self.audio_stream = None self.audio_stream = None
self.audio_frames = [] self.audio_frames = []
self.is_recording = False self.is_recording = False
@ -89,44 +84,37 @@ class AudioRecorder:
self.lollmsCom = lollmsCom self.lollmsCom = lollmsCom
self.whisper_model = None self.whisper_model = None
def start_recording(self): def start_recording(self):
if self.whisper_model is None: if self.whisper_model is None:
self.lollmsCom.info("Loading whisper model") self.lollmsCom.info("Loading whisper model")
self.whisper_model=whisper.load_model("base.en") self.whisper_model=whisper.load_model("base.en")
try: try:
import pyaudio
self.is_recording = True self.is_recording = True
self.audio_stream = pyaudio.PyAudio().open( self.audio_stream = sd.InputStream(
format=self.audio_format,
channels=self.channels, channels=self.channels,
rate=self.sample_rate, samplerate=self.sample_rate,
input=True, callback=self._record,
frames_per_buffer=self.chunk_size blocksize=self.chunk_size
) )
self.audio_stream.start()
self.lollmsCom.info("Recording started...") self.lollmsCom.info("Recording started...")
threading.Thread(target=self._record).start()
except: except:
self.lollmsCom.error("No audio input found!") self.lollmsCom.error("No audio input found!")
def _record(self, indata, frames, time, status):
def _record(self):
first_recording = True # Flag to track the first recording first_recording = True # Flag to track the first recording
silence_duration = 5 silence_duration = 5
non_silent_start = None non_silent_start = None
non_silent_end = None non_silent_end = None
last_spectrogram_update = time.time() last_spectrogram_update = time.time()
self.audio_frames = None self.audio_frames = None
while self.is_recording: buffered = np.array(indata)
data = self.audio_stream.read(self.chunk_size)
buffered = np.frombuffer(data, dtype=np.int16)
if self.audio_frames is not None: if self.audio_frames is not None:
self.audio_frames = np.concatenate([self.audio_frames,buffered]) self.audio_frames = np.concatenate([self.audio_frames, buffered])
else: else:
self.audio_frames = buffered self.audio_frames = buffered
# Remove audio frames that are older than 30 seconds # Remove audio frames that are older than 30 seconds
if len(self.audio_frames) > self.sample_rate * 30: if len(self.audio_frames) > self.sample_rate * 30:
self.audio_frames=self.audio_frames[-self.sample_rate * 30:] self.audio_frames=self.audio_frames[-self.sample_rate * 30:]
@ -148,17 +136,14 @@ class AudioRecorder:
if self.callback and non_silent_start is not None and non_silent_end - non_silent_start >= 1: if self.callback and non_silent_start is not None and non_silent_end - non_silent_start >= 1:
self.lollmsCom.info("Analyzing") self.lollmsCom.info("Analyzing")
# Convert to float # Convert to float
import pyaudio
import wave
audio_data = self.audio_frames.astype(np.float32) audio_data = self.audio_frames.astype(np.float32)
audio = wave.open(str(self.filename), 'wb') audio = wave.open(str(self.filename), 'wb')
audio.setnchannels(self.channels) audio.setnchannels(self.channels)
audio.setsampwidth(pyaudio.PyAudio().get_sample_size(self.audio_format)) audio.setsampwidth(audio_stream.dtype.itemsize)
audio.setframerate(self.sample_rate) audio.setframerate(self.sample_rate)
audio.writeframes(b''.join(self.audio_frames[non_silent_start:non_silent_end])) audio.writeframes(b''.join(self.audio_frames[non_silent_start:non_silent_end]))
audio.close() audio.close()
# Transcribe the audio using the whisper model # Transcribe the audio using the whisper model
text = self.whisper_model.transcribe(audio_data[non_silent_start:non_silent_end]) text = self.whisper_model.transcribe(audio_data[non_silent_start:non_silent_end])
@ -208,17 +193,14 @@ class AudioRecorder:
rms = 0 rms = 0
return rms return rms
def stop_recording(self): def stop_recording(self):
self.is_recording = False self.is_recording = False
if self.audio_stream: if self.audio_stream:
self.audio_stream.stop_stream() self.audio_stream.stop()
self.audio_stream.close()
import pyaudio
import wave import wave
audio = wave.open(str(self.filename), 'wb') audio = wave.open(str(self.filename), 'wb')
audio.setnchannels(self.channels) audio.setnchannels(self.channels)
audio.setsampwidth(pyaudio.PyAudio().get_sample_size(self.audio_format)) audio.setsampwidth(self.audio_stream.dtype.itemsize)
audio.setframerate(self.sample_rate) audio.setframerate(self.sample_rate)
audio.writeframes(b''.join(self.audio_frames)) audio.writeframes(b''.join(self.audio_frames))
audio.close() audio.close()