diff --git a/configs/config.yaml b/configs/config.yaml index c1603fe..9f3dffb 100644 --- a/configs/config.yaml +++ b/configs/config.yaml @@ -1,5 +1,5 @@ # =================== Lord Of Large Language Multimodal Systems Configuration file =========================== -version: 101 +version: 102 binding_name: null model_name: null model_variant: null @@ -92,7 +92,7 @@ active_ttm_service: "None" # musicgen (offline) stt_input_device: 0 -# TTS service +# STT service stt_listening_threshold: 1000 stt_silence_duration: 2 stt_sound_threshold_percentage: 10 @@ -101,6 +101,9 @@ stt_rate: 44100 stt_channels: 1 stt_buffer_size: 10 +stt_activate_word_detection: false +stt_word_detection_file: null + # ASR STT service diff --git a/lollms/configs/config.yaml b/lollms/configs/config.yaml index c1603fe..9f3dffb 100644 --- a/lollms/configs/config.yaml +++ b/lollms/configs/config.yaml @@ -1,5 +1,5 @@ # =================== Lord Of Large Language Multimodal Systems Configuration file =========================== -version: 101 +version: 102 binding_name: null model_name: null model_variant: null @@ -92,7 +92,7 @@ active_ttm_service: "None" # musicgen (offline) stt_input_device: 0 -# TTS service +# STT service stt_listening_threshold: 1000 stt_silence_duration: 2 stt_sound_threshold_percentage: 10 @@ -101,6 +101,9 @@ stt_rate: 44100 stt_channels: 1 stt_buffer_size: 10 +stt_activate_word_detection: false +stt_word_detection_file: null + # ASR STT service diff --git a/lollms/media.py b/lollms/media.py index 9b83977..14d2a2c 100644 --- a/lollms/media.py +++ b/lollms/media.py @@ -134,14 +134,14 @@ class RTCom: snd_input_device=None, snd_output_device=None, logs_folder="logs", - block_while_talking=True, - context_size=4096 + block_while_talking=True, + use_keyword_audio=False, + keyword_audio_path=None ): self.sio = sio self.lc = lc self.client = client self.block_listening = False - self.context_size = context_size self.personality = personality self.rate = rate self.channels = channels @@ -152,6 +152,14 @@ class RTCom: self.sound_threshold_percentage = sound_threshold_percentage self.block_while_talking = block_while_talking self.image_shot = None + self.use_keyword_audio=use_keyword_audio, + self.keyword_audio_path=keyword_audio_path + self.summoned = False + self.sample_mfccs = None + if self.use_keyword_audio and self.keyword_audio_path: + self.sample_features = self.load_and_extract_features() + + if snd_input_device is None: devices = sd.query_devices() @@ -186,6 +194,36 @@ class RTCom: self.buffer_lock = threading.Condition() self.transcribed_lock = threading.Condition() + def load_and_extract_features(self, file_path): + if not PackageManager.check_package_installed("librosa"): + PackageManager.install_package(librosa) + import librosa + y, sr = librosa.load(file_path, sr=None) + mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13) + return np.mean(mfccs.T, axis=0) + + def extract_features(self, buffer): + if not PackageManager.check_package_installed("librosa"): + PackageManager.install_package("librosa") + import librosa + y, sr = librosa.load(buffer, sr=self.rate) + mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13) + return np.mean(mfccs.T, axis=0) + + def compare_voices(self, sample_features, realtime_features, th = 20): + if not PackageManager.check_package_installed("scipy"): + PackageManager.install_package("scipy") + from scipy.spatial.distance import euclidean + # Calculate the Euclidean distance between the features + distance = euclidean(sample_features, realtime_features) + + # If the distance is smaller than the threshold, we have a match! + if distance < th: + print(f"Voice match found! (distance: {distance}) 🎉🤡") + return True + else: + print(f"No match found. (distance: {distance}) 😢🤡") + return False def start_recording(self): self.recording = True self.stop_flag = False @@ -204,9 +242,6 @@ class RTCom: with sd.InputStream(channels=self.channels, device=self.snd_input_device, samplerate=self.rate, callback=self.callback, dtype='int16'): while not self.stop_flag: time.sleep(1) - - if self.frames: - self._save_wav(self.frames) self.recording = False # self._save_histogram(self.audio_values) @@ -243,11 +278,19 @@ class RTCom: if self.silence_counter > max_scilence: trimmed_frames = self._trim_silence(self.frames) + ASCIIColors.yellow(f"\nsound duration: {len(trimmed_frames)/self.rate}") sound_percentage = self._calculate_sound_percentage(trimmed_frames) if sound_percentage >= self.sound_threshold_percentage: ASCIIColors.red(f"Sound percentage {sound_percentage}") ASCIIColors.red("\nSilence counter reached threshold") - self._save_wav(self.frames) + + if self.use_keyword_audio and self.keyword_audio_path and self.summoned == False: + features = self.extract_features(self.frames) + if self.compare_voices(self.sample_features, features): + self.summoned = True + else: + self._save_wav(self.frames) + self.summoned = False self.frames = [] self.silence_counter = 0 self.total_frames = 0 @@ -426,8 +469,7 @@ class AudioNinja: def callback(indata, frames, time, status): if self.is_recording: self.frames.append(indata.copy()) - self.lc.info("Ninja is capturing sounds... Shhh!") - + with sd.InputStream(callback=callback, device=self.device): while self.is_recording: sd.sleep(1000) @@ -450,8 +492,9 @@ class AudioNinja: if self.is_recording: self.is_recording = False self.recording_thread.join() - self._save_recording() + filename = self._save_recording() self.lc.info("Ninja recording stopped! 🥷⚪️") + return filename def _save_recording(self): """ @@ -465,6 +508,7 @@ class AudioNinja: wf.setframerate(44100) wf.writeframes(b''.join(self.frames)) self.lc.info(f"Ninja stored the audio file at '{filename}'! 🥷📂") + return filename class WebcamImageSender: diff --git a/lollms/server/configs/config.yaml b/lollms/server/configs/config.yaml index c1603fe..9f3dffb 100644 --- a/lollms/server/configs/config.yaml +++ b/lollms/server/configs/config.yaml @@ -1,5 +1,5 @@ # =================== Lord Of Large Language Multimodal Systems Configuration file =========================== -version: 101 +version: 102 binding_name: null model_name: null model_variant: null @@ -92,7 +92,7 @@ active_ttm_service: "None" # musicgen (offline) stt_input_device: 0 -# TTS service +# STT service stt_listening_threshold: 1000 stt_silence_duration: 2 stt_sound_threshold_percentage: 10 @@ -101,6 +101,9 @@ stt_rate: 44100 stt_channels: 1 stt_buffer_size: 10 +stt_activate_word_detection: false +stt_word_detection_file: null + # ASR STT service