mirror of
https://github.com/ParisNeo/lollms.git
synced 2025-04-10 20:29:54 +00:00
new configuration

parent 22f553d9a7
commit 9d458ac979
@@ -1,5 +1,5 @@
 # =================== Lord Of Large Language Multimodal Systems Configuration file ===========================
-version: 101
+version: 102
 binding_name: null
 model_name: null
 model_variant: null
@@ -92,7 +92,7 @@ active_ttm_service: "None" # musicgen (offline)
 stt_input_device: 0
 
 
-# TTS service
+# STT service
 stt_listening_threshold: 1000
 stt_silence_duration: 2
 stt_sound_threshold_percentage: 10
@@ -101,6 +101,9 @@ stt_rate: 44100
 stt_channels: 1
 stt_buffer_size: 10
 
+stt_activate_word_detection: false
+stt_word_detection_file: null
+
 
 
 # ASR STT service
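The version bump from 101 to 102 adds two speech-to-text settings: stt_activate_word_detection toggles keyword (wake-word) gating, and stt_word_detection_file points at a reference recording of the keyword. Below is a minimal sketch of reading these keys from a config.yaml of this shape; it uses PyYAML directly with a hard-coded path, both assumptions rather than the project's own configuration loader.

import yaml  # PyYAML, assumed installed

# Illustrative only: lollms loads its configuration through its own machinery.
with open("config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

use_word_detection = cfg.get("stt_activate_word_detection", False)
word_detection_file = cfg.get("stt_word_detection_file")  # path to the keyword sample, or None

if use_word_detection and word_detection_file:
    print(f"Keyword detection enabled, reference sample: {word_detection_file}")
else:
    print("Keyword detection disabled; all detected speech is transcribed.")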
@@ -134,14 +134,14 @@ class RTCom:
                  snd_input_device=None,
                  snd_output_device=None,
                  logs_folder="logs",
-                 block_while_talking=True,
-                 context_size=4096
+                 block_while_talking=True,
+                 use_keyword_audio=False,
+                 keyword_audio_path=None
                  ):
         self.sio = sio
         self.lc = lc
         self.client = client
         self.block_listening = False
         self.context_size = context_size
         self.personality = personality
         self.rate = rate
         self.channels = channels
@@ -152,6 +152,14 @@ class RTCom:
         self.sound_threshold_percentage = sound_threshold_percentage
         self.block_while_talking = block_while_talking
         self.image_shot = None
+        self.use_keyword_audio = use_keyword_audio
+        self.keyword_audio_path = keyword_audio_path
+        self.summoned = False
+        self.sample_mfccs = None
+        if self.use_keyword_audio and self.keyword_audio_path:
+            self.sample_features = self.load_and_extract_features(self.keyword_audio_path)
+
+
 
         if snd_input_device is None:
             devices = sd.query_devices()
@@ -186,6 +194,36 @@ class RTCom:
         self.buffer_lock = threading.Condition()
         self.transcribed_lock = threading.Condition()
 
+    def load_and_extract_features(self, file_path):
+        if not PackageManager.check_package_installed("librosa"):
+            PackageManager.install_package("librosa")
+        import librosa
+        y, sr = librosa.load(file_path, sr=None)
+        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
+        return np.mean(mfccs.T, axis=0)
+
+    def extract_features(self, buffer):
+        if not PackageManager.check_package_installed("librosa"):
+            PackageManager.install_package("librosa")
+        import librosa
+        y, sr = librosa.load(buffer, sr=self.rate)
+        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
+        return np.mean(mfccs.T, axis=0)
+
+    def compare_voices(self, sample_features, realtime_features, th=20):
+        if not PackageManager.check_package_installed("scipy"):
+            PackageManager.install_package("scipy")
+        from scipy.spatial.distance import euclidean
+        # Calculate the Euclidean distance between the features
+        distance = euclidean(sample_features, realtime_features)
+
+        # If the distance is smaller than the threshold, we have a match!
+        if distance < th:
+            print(f"Voice match found! (distance: {distance}) 🎉🤡")
+            return True
+        else:
+            print(f"No match found. (distance: {distance}) 😢🤡")
+            return False
     def start_recording(self):
         self.recording = True
         self.stop_flag = False
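The three methods added here implement the keyword check: both the reference recording and the live buffer are reduced to a time-averaged 13-coefficient MFCC vector, and a Euclidean distance below a threshold (default 20) counts as a match. The following is a self-contained sketch of the same technique, assuming librosa, scipy and numpy are installed and using placeholder file paths.

import numpy as np
import librosa
from scipy.spatial.distance import euclidean

def mean_mfcc(path, sr=None):
    """Load an audio file and return its time-averaged MFCC feature vector."""
    y, sr = librosa.load(path, sr=sr)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    return np.mean(mfccs.T, axis=0)

def is_keyword_match(reference_path, candidate_path, threshold=20.0):
    """Return True when the two recordings are closer than the distance threshold."""
    distance = euclidean(mean_mfcc(reference_path), mean_mfcc(candidate_path))
    return distance < threshold

# Example usage (paths are placeholders):
# if is_keyword_match("keyword_sample.wav", "last_utterance.wav"):
#     print("Keyword speaker matched")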
@@ -204,9 +242,6 @@ class RTCom:
             with sd.InputStream(channels=self.channels, device=self.snd_input_device, samplerate=self.rate, callback=self.callback, dtype='int16'):
                 while not self.stop_flag:
                     time.sleep(1)
-
-            if self.frames:
-                self._save_wav(self.frames)
             self.recording = False
 
             # self._save_histogram(self.audio_values)
@@ -243,11 +278,19 @@ class RTCom:
             if self.silence_counter > max_scilence:
                 trimmed_frames = self._trim_silence(self.frames)
                 ASCIIColors.yellow(f"\nsound duration: {len(trimmed_frames)/self.rate}")
                 sound_percentage = self._calculate_sound_percentage(trimmed_frames)
                 if sound_percentage >= self.sound_threshold_percentage:
                     ASCIIColors.red(f"Sound percentage {sound_percentage}")
                     ASCIIColors.red("\nSilence counter reached threshold")
-                    self._save_wav(self.frames)
+
+                    if self.use_keyword_audio and self.keyword_audio_path and self.summoned == False:
+                        features = self.extract_features(self.frames)
+                        if self.compare_voices(self.sample_features, features):
+                            self.summoned = True
+                    else:
+                        self._save_wav(self.frames)
+                        self.summoned = False
                 self.frames = []
                 self.silence_counter = 0
                 self.total_frames = 0
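With this change the silence-triggered save is gated by the summon flag: while keyword detection is active and nothing has been summoned yet, a captured utterance is only compared against the reference features, and only after a match does the next utterance get written out for transcription, after which the flag resets. A compact sketch of that flow follows; handle_utterance is a hypothetical name and self stands for an RTCom-like object exposing the attributes introduced above.

def handle_utterance(self, frames):
    # Sketch of the gating added above, not the literal committed code.
    if self.use_keyword_audio and self.keyword_audio_path and not self.summoned:
        # Not summoned yet: only check whether the keyword speaker is talking.
        features = self.extract_features(frames)
        if self.compare_voices(self.sample_features, features):
            self.summoned = True  # the next utterance will be transcribed
    else:
        # Summoned, or keyword detection disabled: hand the audio to STT.
        self._save_wav(frames)
        self.summoned = False  # require the keyword again for the next turn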
@@ -426,8 +469,7 @@ class AudioNinja:
         def callback(indata, frames, time, status):
             if self.is_recording:
                 self.frames.append(indata.copy())
-                self.lc.info("Ninja is capturing sounds... Shhh!")
 
         with sd.InputStream(callback=callback, device=self.device):
             while self.is_recording:
                 sd.sleep(1000)
@@ -450,8 +492,9 @@ class AudioNinja:
         if self.is_recording:
             self.is_recording = False
             self.recording_thread.join()
-            self._save_recording()
+            filename = self._save_recording()
             self.lc.info("Ninja recording stopped! 🥷⚪️")
+            return filename
 
     def _save_recording(self):
         """
@@ -465,6 +508,7 @@ class AudioNinja:
             wf.setframerate(44100)
             wf.writeframes(b''.join(self.frames))
         self.lc.info(f"Ninja stored the audio file at '{filename}'! 🥷📂")
+        return filename
 
 
 class WebcamImageSender:
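AudioNinja.stop_recording() now returns the path produced by _save_recording(), which itself returns the filename, so a caller can pass the captured WAV straight to a transcription step. The following is a hypothetical caller-side sketch; the constructor arguments and the transcribe call are placeholders, not part of this diff.

ninja = AudioNinja(lc=app, logs_folder="logs")  # `app` is a placeholder LoLLMs application object

ninja.start_recording()
# ... speak ...
wav_path = ninja.stop_recording()  # now returns the saved WAV path

if wav_path:
    text = stt.transcribe(wav_path)  # `stt` is a placeholder STT service
    print(text)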