"""
|
||
Project: LoLLMs
|
||
Author: ParisNeo
|
||
Description: Media classes:
|
||
- WebcamImageSender: is a captures images from the webcam and sends them to a SocketIO client.
|
||
- MusicPlayer: is a MusicPlayer class that allows you to play music using pygame library.
|
||
License: Apache 2.0
|
||
"""
|
||
from lollms.utilities import PackageManager
from lollms.com import LoLLMsCom
from lollms.utilities import trace_exception, run_async, install_conda_package
from lollms.types import MSG_TYPE, SENDER_TYPES
from lollms.client_session import Session
from ascii_colors import ASCIIColors
import platform
from functools import partial
import subprocess
from collections import deque
from scipy.signal import butter, lfilter

import os
import threading
import re

if not PackageManager.check_package_installed("cv2"):
    if platform.system() == "Darwin":
        os.system('brew install opencv')
    elif platform.system() == "Windows":
        os.system('pip install opencv-python')
    else:
        os.system('pip install opencv-python')
        # os.system('sudo apt-get update')
        # os.system('sudo apt-get install libgl1-mesa-glx python3-opencv -y')
        # os.system('pip install opencv-python')
try:
    import cv2
except Exception:
    ASCIIColors.error("Couldn't install opencv!")

if not PackageManager.check_package_installed("scipy"):
    PackageManager.install_package("scipy")
from scipy import signal

if not PackageManager.check_package_installed("matplotlib"):
    PackageManager.install_package("matplotlib")
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')  # non-interactive backend so figures can be saved from worker threads

import socketio

try:
    if not PackageManager.check_package_installed("sounddevice"):
        # os.system("sudo apt-get install portaudio19-dev")
        PackageManager.install_package("sounddevice")
        PackageManager.install_package("wave")
except Exception:
    # os.system("sudo apt-get install portaudio19-dev -y")
    PackageManager.install_package("sounddevice")
    PackageManager.install_package("wave")
try:
    import sounddevice as sd
    import wave
except Exception:
    ASCIIColors.error("Couldn't load sound tools")

import time
import base64
import io
from scipy.io.wavfile import write
import numpy as np
from scipy.signal import spectrogram
from pathlib import Path

from lollms.app import LollmsApplication
from lollms.tasks import TasksLibrary
from lollms.tts import LollmsTTS
from lollms.personality import AIPersonality
from lollms.function_call import FunctionCalling_Library
from lollms.client_session import Client
from datetime import datetime

import sys

def update_progress_bar(silence_counter, max_silence):
    bar_length = 40  # Length of the progress bar
    progress = silence_counter / max_silence
    block = int(round(bar_length * progress))

    # Determine the color based on progress
    if progress < 0.5:
        color = ASCIIColors.color_bright_green
    else:
        color = ASCIIColors.color_bright_red

    bar = "#" * block + "-" * (bar_length - block)

    sys.stdout.write("\r")
    ASCIIColors.print(f"silence_counter: {silence_counter} |{bar}| {round(progress * 100, 2)}%", color=color, end="")
    sys.stdout.flush()

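# Example: update_progress_bar(20, 40) redraws the current console line as a
# half-filled 40-character bar reading 50.0%.
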
# Step 1: Define the band-pass filter (the bouncers for your audio club)
def butter_bandpass(lowcut, highcut, fs, order=5):
    nyquist = 0.5 * fs
    low = lowcut / nyquist
    high = highcut / nyquist
    b, a = butter(order, [low, high], btype='band')
    return b, a


def bandpass_filter(data, lowcut, highcut, fs, order=5):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data)
    return y

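# Usage sketch (illustrative values): isolate the typical voice band from one
# second of 44.1 kHz audio before running silence detection on it.
#   noisy = np.random.randn(44100)                   # hypothetical 1 s buffer
#   voice = bandpass_filter(noisy, lowcut=300, highcut=3000, fs=44100)
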
class RTCom:
    def __init__(
        self,
        lc: LollmsApplication,
        sio: socketio.Client,
        personality: AIPersonality,
        client: Client,
        threshold=1000,
        silence_duration=2,
        sound_threshold_percentage=10,
        gain=1.0,
        rate=44100,
        channels=1,
        buffer_size=10,
        snd_input_device=None,
        snd_output_device=None,
        logs_folder="logs",
        block_while_talking=True,
        use_keyword_audio=False,
        keyword_audio_path=None
    ):
        self.sio = sio
        self.lc = lc
        self.client = client
        self.block_listening = False
        self.personality = personality
        self.rate = rate
        self.channels = channels
        self.threshold = threshold
        self.silence_duration = silence_duration
        self.buffer_size = buffer_size
        self.gain = gain
        self.sound_threshold_percentage = sound_threshold_percentage
        self.block_while_talking = block_while_talking
        self.image_shot = None
        self.use_keyword_audio = use_keyword_audio
        self.keyword_audio_path = keyword_audio_path
        self.summoned = False
        self.sample_features = None
        if self.use_keyword_audio and self.keyword_audio_path:
            self.sample_features = self.load_and_extract_features(self.keyword_audio_path)

        if snd_input_device is None:
            devices = sd.query_devices()
            snd_input_device = [device['name'] for device in devices if device["max_input_channels"] > 0][0]
        if snd_output_device is None:
            devices = sd.query_devices()
            snd_output_device = [device['name'] for device in devices if device["max_output_channels"] > 0][0]

        self.snd_input_device = snd_input_device
        self.snd_output_device = snd_output_device
        self.logs_folder = Path(logs_folder)
        self.logs_folder.mkdir(exist_ok=True, parents=True)

        self.frames = []
        self.silence_counter = 0
        self.current_silence_duration = 0
        self.longest_silence_duration = 0
        self.sound_frames = 0
        self.audio_values = []

        self.max_audio_value = 0
        self.min_audio_value = 0
        self.total_frames = 0  # Initialize total_frames

        self.file_index = 0
        self.recording = False
        self.stop_flag = False

        self.buffer = deque(maxlen=buffer_size)
        self.transcribed_files = deque()
        self.buffer_lock = threading.Condition()
        self.transcribed_lock = threading.Condition()

    def load_and_extract_features(self, file_path):
        if not PackageManager.check_package_installed("librosa"):
            PackageManager.install_package("librosa")
        import librosa
        y, sr = librosa.load(file_path, sr=None)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        return np.mean(mfccs.T, axis=0)

    def extract_features(self, frames):
        if not PackageManager.check_package_installed("librosa"):
            PackageManager.install_package("librosa")

        filename = f"recording_{self.file_index}.wav"
        self.file_index += 1

        amplified_frames = self._apply_gain(frames)
        trimmed_frames = self._trim_silence([amplified_frames])
        logs_file = Path(self.logs_folder) / filename
        logs_file.parent.mkdir(exist_ok=True, parents=True)

        wf = wave.open(str(logs_file), 'wb')
        wf.setnchannels(self.channels)
        wf.setsampwidth(2)
        wf.setframerate(self.rate)
        wf.writeframes(trimmed_frames)
        wf.close()

        import librosa
        y, sr = librosa.load(logs_file, sr=self.rate)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

        return np.mean(mfccs.T, axis=0)

    def compare_voices(self, sample_features, realtime_features, th=20):
        if not PackageManager.check_package_installed("scipy"):
            PackageManager.install_package("scipy")
        from scipy.spatial.distance import euclidean
        # Calculate the Euclidean distance between the features
        distance = euclidean(sample_features, realtime_features)

        # If the distance is smaller than the threshold, we have a match!
        if distance < th:
            print(f"Voice match found! (distance: {distance}) 🎉🤡")
            return True
        else:
            print(f"No match found. (distance: {distance}) 😢🤡")
            return False

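    # Illustrative keyword check (hypothetical file path): compare the stored
    # keyword sample against a freshly captured segment.
    #   sample = self.load_and_extract_features("keyword.wav")
    #   live = self.extract_features(frames)
    #   summoned = self.compare_voices(sample, live, th=20)
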
    def start_recording(self):
        self.recording = True
        self.stop_flag = False

        self.recording_thread = threading.Thread(target=self._record)
        self.transcription_thread = threading.Thread(target=self._process_files)
        self.recording_thread.start()
        self.transcription_thread.start()

    def stop_recording(self):
        self.recording = False
        self.stop_flag = True
        # Wake up the transcription thread if it is waiting on an empty buffer
        with self.buffer_lock:
            self.buffer_lock.notify_all()
        ASCIIColors.green("<<RTCOM off>>")

    def _record(self):
        with sd.InputStream(channels=self.channels, device=self.snd_input_device, samplerate=self.rate, callback=self.callback, dtype='int16'):
            while not self.stop_flag:
                time.sleep(1)
        self.recording = False

        # self._save_histogram(self.audio_values)

    def callback(self, indata, frames, time, status):
        max_silence = int((self.rate / frames) * self.silence_duration)
        if not self.block_listening:
            # Transform the buffer into a numpy array (like turning a frog into a prince)
            audio_data = np.frombuffer(indata, dtype=np.int16)
            # Apply the bandpass filter to the incoming audio data
            audio_data = bandpass_filter(audio_data, lowcut=300, highcut=3000, fs=self.rate)
            max_value = np.max(audio_data)
            min_value = np.min(audio_data)

            if max_value > self.max_audio_value:
                self.max_audio_value = max_value
            if min_value < self.min_audio_value:
                self.min_audio_value = min_value

            self.audio_values.extend(audio_data)

            self.total_frames += frames
            ASCIIColors.red(f" max_value: {max_value}", end="")
            if max_value < self.threshold:
                self.silence_counter += 1
                self.current_silence_duration += frames
            else:
                self.silence_counter = 0
                self.current_silence_duration = 0
                self.sound_frames += frames

            if self.current_silence_duration > self.longest_silence_duration:
                self.longest_silence_duration = self.current_silence_duration

            if self.silence_counter > max_silence:
                trimmed_frames = self._trim_silence(self.frames)
                # trimmed_frames is raw bytes: 2 bytes per int16 sample
                ASCIIColors.yellow(f"\nsound duration: {len(trimmed_frames) / 2 / self.rate}")
                sound_percentage = self._calculate_sound_percentage(trimmed_frames)
                if sound_percentage >= self.sound_threshold_percentage:
                    ASCIIColors.red(f"Sound percentage {sound_percentage}")
                    ASCIIColors.red("\nSilence counter reached threshold")

                    if self.use_keyword_audio and self.keyword_audio_path and not self.summoned:
                        features = self.extract_features(self.frames)
                        if self.compare_voices(self.sample_features, features):
                            self.summoned = True
                    else:
                        self._save_wav(self.frames)
                        self.summoned = False
                self.frames = []
                self.silence_counter = 0
                self.total_frames = 0
                self.sound_frames = 0
            else:
                update_progress_bar(self.silence_counter, max_silence)
                self.frames.append(indata.copy())
        else:
            self.frames = []
            self.silence_counter = 0
            self.current_silence_duration = 0
            self.longest_silence_duration = 0
            self.sound_frames = 0
            self.audio_values = []

            self.max_audio_value = 0
            self.min_audio_value = 0
            self.total_frames = 0  # Reset total_frames

    def _apply_gain(self, frames):
        audio_data = np.frombuffer(b''.join(frames), dtype=np.int16)
        audio_data = audio_data * self.gain
        audio_data = np.clip(audio_data, -32768, 32767)
        return audio_data.astype(np.int16).tobytes()

    def _trim_silence(self, frames):
        audio_data = np.frombuffer(b''.join(frames), dtype=np.int16)
        non_silent_indices = np.where(np.abs(audio_data) >= self.threshold)[0]

        if non_silent_indices.size:
            # Keep up to one second of padding around the detected speech
            start_index = max(non_silent_indices[0] - self.rate, 0)
            end_index = min(non_silent_indices[-1] + self.rate, len(audio_data))
            trimmed_data = audio_data[start_index:end_index]
        else:
            trimmed_data = np.array([], dtype=np.int16)

        return trimmed_data.tobytes()

    def _calculate_sound_percentage(self, frames):
        audio_data = np.frombuffer(frames, dtype=np.int16)
        num_bins = len(audio_data) // self.rate
        sound_count = 0

        for i in range(num_bins):
            bin_data = audio_data[i * self.rate: (i + 1) * self.rate]
            if np.max(bin_data) >= self.threshold:
                sound_count += 1

        sound_percentage = (sound_count / num_bins) * 100 if num_bins > 0 else 0
        return sound_percentage

    def contains_unwanted_special_characters(self, s):
        # Match any character that is not an ASCII letter/digit, common punctuation, whitespace,
        # or a character from the Latin-extended, Cyrillic, Arabic, Japanese kana, or CJK ranges
        pattern = re.compile(r'[^a-zA-Z0-9\s.,!?;:()\'"“”‘’—\-\u00C0-\u017F\u0400-\u04FF\u0600-\u06FF\u3040-\u30FF\u4E00-\u9FFF]', re.UNICODE)
        # Search for the pattern in the string
        if pattern.search(s):
            return True
        return False

    def remove_special_characters(self, s: str) -> str:
        # Same character class as above; strip anything outside it
        pattern = re.compile(r'[^a-zA-Z0-9\s.,!?;:()\'"“”‘’—\-\u00C0-\u017F\u0400-\u04FF\u0600-\u06FF\u3040-\u30FF\u4E00-\u9FFF]', re.UNICODE)
        # Substitute the matched characters with an empty string
        cleaned_string = pattern.sub('', s)
        return cleaned_string

    def _save_wav(self, frames):
        ASCIIColors.green("<<SEGMENT_RECOVERED>>")
        # TODO: announce
        # self.transcription_signal.update_status.emit("Segment detected and saved")
        filename = f"recording_{self.file_index}.wav"
        self.file_index += 1

        amplified_frames = self._apply_gain(frames)
        trimmed_frames = self._trim_silence([amplified_frames])
        logs_file = Path(self.logs_folder) / filename
        logs_file.parent.mkdir(exist_ok=True, parents=True)

        wf = wave.open(str(logs_file), 'wb')
        wf.setnchannels(self.channels)
        wf.setsampwidth(2)
        wf.setframerate(self.rate)
        wf.writeframes(trimmed_frames)
        wf.close()

        with self.buffer_lock:
            while len(self.buffer) >= self.buffer.maxlen:
                self.buffer_lock.wait()
            self.buffer.append(filename)
            self.buffer_lock.notify()

    def _save_histogram(self, audio_values):
        plt.hist(audio_values, bins=50, edgecolor='black')
        plt.title('Histogram of Audio Values')
        plt.xlabel('Audio Value')
        plt.ylabel('Frequency')
        plt.savefig('audio_values_histogram.png')
        plt.close()

    def fix_string_for_xtts(self, input_string):
        # Remove trailing exclamation marks, which tend to destabilize XTTS
        fixed_string = input_string.rstrip('!')

        return fixed_string

    def _process_files(self):
        while not self.stop_flag:
            filename = None
            with self.buffer_lock:
                while not self.buffer and not self.stop_flag:
                    self.buffer_lock.wait()
                if self.buffer:
                    filename = self.buffer.popleft()
                self.buffer_lock.notify()
            if self.block_while_talking:
                self.block_listening = True
            try:
                if filename:
                    self.lc.info("Transcribing")
                    ASCIIColors.green("<<TRANSCRIBING>>")
                    wav_file_path = str(Path(self.logs_folder) / filename)
                    ASCIIColors.cyan(f"Logging to : {wav_file_path}")
                    transcription = self.lc.stt.transcribe(wav_file_path)
                    transcription = self.remove_special_characters(transcription).strip()
                    if len(transcription) > 0:
                        transcription_fn = str(Path(self.logs_folder) / filename) + ".txt"
                        with open(transcription_fn, "w", encoding="utf-8") as f:
                            f.write(transcription)

                        with self.transcribed_lock:
                            self.transcribed_files.append((filename, transcription))
                            self.transcribed_lock.notify()

                        current_prompt = transcription
                        self.lc.new_block(client_id=self.client.client_id, sender=self.lc.config.user_name, content=current_prompt)
                        ASCIIColors.green("<<RESPONDING>>")
                        self.lc.info("Responding")
                        self.lc.handle_generate_msg(self.client.client_id, {"prompt": current_prompt})
                        while self.lc.busy:
                            time.sleep(0.01)
                        lollms_text = self.fix_string_for_xtts(self.client.generated_text)
                        ASCIIColors.red(" -------------- LOLLMS answer -------------------")
                        ASCIIColors.yellow(lollms_text)
                        ASCIIColors.red(" -------------------------------------------------")
                        self.lc.info("Talking")
                        ASCIIColors.green("<<TALKING>>")
                        self.lc.tts.tts_audio(lollms_text, file_name_or_path=str(Path(self.logs_folder) / filename) + "_answer.wav", use_threading=True)
            except Exception as ex:
                trace_exception(ex)
            self.block_listening = False
            ASCIIColors.green("<<LISTENING>>")
            self.lc.info(f"Listening.\nYou can talk to {self.personality.name}")
            # TODO: send the output
            # self.transcription_signal.update_status.emit("Listening")

    def get_voices(self):
        if self.lc.tts and self.lc.tts.ready:
            voices = self.lc.tts.get_voices()
            return voices
        return []

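# Usage sketch (illustrative; assumes a configured LollmsApplication `lc` with
# working STT/TTS services, a connected socketio client `sio`, and a `client` session):
#   rt = RTCom(lc, sio, lc.personality, client, threshold=1000, silence_duration=2)
#   rt.start_recording()   # listen, transcribe, generate and speak answers
#   ...
#   rt.stop_recording()
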
class AudioNinja:
    def __init__(self, lc: LollmsApplication, logs_folder='logs', device=None):
        """
        Initialize the AudioNinja with a LollmsApplication object,
        a log folder, and an optional recording device.

        Args:
            lc (LollmsApplication): The LollmsApplication object for communication.
            logs_folder (str): The folder to save recordings. Default is 'logs'.
            device (int or str): The recording device index or name. Default is None.
        """
        self.lc = lc
        self.logs_folder = Path(logs_folder)
        self.device = device
        self.recording_thread = None
        self.is_recording = False
        self.frames = []
        if not self.logs_folder.exists():
            self.logs_folder.mkdir(parents=True, exist_ok=True)
        self.lc.info(f"AudioNinja is ready to strike from the shadows! Logging to '{self.logs_folder}' with device '{self.device}'")

    def _record_audio(self):
        """
        Internal method to handle audio recording callback.
        """
        def callback(indata, frames, time, status):
            if self.is_recording:
                self.frames.append(indata.copy())

        # Record 16-bit mono at 44.1 kHz so the WAV header written in
        # _save_recording matches the captured data
        with sd.InputStream(callback=callback, device=self.device, channels=1, samplerate=44100, dtype='int16'):
            while self.is_recording:
                sd.sleep(1000)

    def start_recording(self):
        """
        Start the audio recording.
        """
        if not self.is_recording:
            self.is_recording = True
            self.frames = []
            self.recording_thread = threading.Thread(target=self._record_audio)
            self.recording_thread.start()
            self.lc.info("Ninja recording started! 🥷🔴")

    def stop_recording(self):
        """
        Stop the audio recording.
        """
        if self.is_recording:
            self.is_recording = False
            self.recording_thread.join()
            filename = self._save_recording()
            self.lc.info("Ninja recording stopped! 🥷⚪️")
            return filename

    def _save_recording(self):
        """
        Save the recorded audio to a .wav file.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = self.logs_folder / f"recording_{timestamp}.wav"
        with wave.open(str(filename), 'wb') as wf:  # wave.open expects a str path, not a Path
            wf.setnchannels(1)
            wf.setsampwidth(2)  # matches the int16 stream opened in _record_audio
            wf.setframerate(44100)
            wf.writeframes(b''.join(frame.tobytes() for frame in self.frames))
        self.lc.info(f"Ninja stored the audio file at '{filename}'! 🥷📂")
        return filename

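# Usage sketch (illustrative; assumes a LollmsApplication instance `lc`):
#   ninja = AudioNinja(lc, logs_folder="logs")
#   ninja.start_recording()
#   time.sleep(5)                       # record roughly five seconds
#   wav_path = ninja.stop_recording()   # Path of the saved .wav file
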
class WebcamImageSender:
    """
    Class for capturing images from the webcam and sending them to a SocketIO client.
    """

    def __init__(self, sio: socketio.Client, lollmsCom: LoLLMsCom = None):
        """
        Initializes the WebcamImageSender class.

        Args:
            sio (socketio.Client): The SocketIO client object.
            lollmsCom (LoLLMsCom): Optional communication object used for error reporting.
        """
        self.sio = sio
        self.last_image = None
        self.last_change_time = None
        self.capture_thread = None
        self.is_running = False
        self.lollmsCom = lollmsCom

    def start_capture(self):
        """
        Starts capturing images from the webcam in a separate thread.
        """
        self.is_running = True
        self.capture_thread = threading.Thread(target=self.capture_image)
        self.capture_thread.start()

    def stop_capture(self):
        """
        Stops capturing images from the webcam.
        """
        self.is_running = False
        self.capture_thread.join()

    def capture_image(self):
        """
        Captures images from the webcam, tracks when the frame content last changed,
        and streams each captured frame to the client as a base64-encoded JPEG.
        """
        try:
            cap = cv2.VideoCapture(0)

            while self.is_running:
                ret, frame = cap.read()
                if not ret:
                    break
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

                if self.last_image is None or self.image_difference(gray) > 2:
                    self.last_image = gray
                    self.last_change_time = time.time()

                _, buffer = cv2.imencode('.jpg', frame)
                image_base64 = base64.b64encode(buffer)
                if self.sio:
                    run_async(partial(self.sio.emit, "video_stream_image", image_base64.decode('utf-8')))

            cap.release()
        except Exception as ex:
            self.lollmsCom.error("Couldn't start webcam")
            trace_exception(ex)

    def image_difference(self, image):
        """
        Calculates the difference between the current image and the last image
        using the absolute difference method.

        Args:
            image (numpy.ndarray): The current grayscale image.

        Returns:
            int: The sum of absolute pixel differences between the current image and the last image.
        """
        if self.last_image is None:
            return 0

        diff = cv2.absdiff(image, self.last_image)
        diff_sum = diff.sum()

        return diff_sum

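# Usage sketch (illustrative; assumes a connected socketio.Client `sio`):
#   sender = WebcamImageSender(sio)
#   sender.start_capture()   # emits "video_stream_image" events with base64 JPEG frames
#   ...
#   sender.stop_capture()
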
class MusicPlayer(threading.Thread):
    """
    MusicPlayer class for playing music using the pygame library.

    Attributes:
        file_path (str): The path of the music file to be played.
        paused (bool): Flag to indicate if the music is paused.
        stopped (bool): Flag to indicate if the music is stopped.
    """

    def __init__(self, file_path):
        super().__init__()
        self.file_path = file_path
        self.paused = False
        self.stopped = False

    def run(self):
        """
        The main function that runs in a separate thread to play the music.
        """
        if not PackageManager.check_package_installed("pygame"):
            PackageManager.install_package("pygame")
        import pygame

        pygame.mixer.init()
        pygame.mixer.music.load(self.file_path)
        pygame.mixer.music.play()

        while pygame.mixer.music.get_busy() and not self.stopped:
            if self.paused:
                pygame.mixer.music.pause()
            else:
                pygame.mixer.music.unpause()
            time.sleep(0.1)  # avoid busy-waiting while polling the pause flag

    def pause(self):
        """
        Pauses the music.
        """
        self.paused = True

    def resume(self):
        """
        Resumes the paused music.
        """
        self.paused = False

    def stop(self):
        """
        Stops the music.
        """
        import pygame
        self.stopped = True
        pygame.mixer.music.stop()

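# Usage sketch ("song.mp3" is a hypothetical file):
#   player = MusicPlayer("song.mp3")
#   player.start()    # plays in a background thread
#   player.pause()
#   player.resume()
#   player.stop()
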
class RealTimeTranscription:
    def __init__(self, callback):
        # pyaudio and whisper are assumed to be installed
        # (pip install pyaudio openai-whisper)
        import pyaudio
        import whisper
        # Set up PyAudio
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)

        # Load the Whisper model used by start()
        self.whisper = whisper.load_model("base")

        # Set the callback
        self.callback = callback

    def start(self):
        import torch
        # Start the stream
        self.stream.start_stream()

        try:
            while True:
                # Read a chunk of audio data
                data = self.stream.read(1024)

                # Convert bytes to numpy array
                data_np = np.frombuffer(data, dtype=np.int16)
                # Convert numpy array to a float tensor normalized to [-1, 1], as Whisper expects
                data_tensor = torch.tensor(data_np).float() / 32768.0
                # Send the chunk to Whisper for transcription
                result = self.whisper.transcribe(data_tensor)

                # If the result is not empty, call the callback with the text
                text = result.get("text", "").strip()
                if text:
                    self.callback(text)
        except KeyboardInterrupt:
            # If the user hits Ctrl+C, stop the stream
            self.stop()

    def stop(self):
        # Stop the stream and clean up
        self.stream.stop_stream()
        self.stream.close()
        self.p.terminate()

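# Usage sketch (illustrative; requires a working microphone):
#   def on_text(text):
#       print("heard:", text)
#   rtt = RealTimeTranscription(on_text)
#   rtt.start()   # blocks; Ctrl+C stops the stream and releases the device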