From 654464c8d904fbc6d5857d6476406a5aeead683d Mon Sep 17 00:00:00 2001
From: SevaSk
Date: Sat, 13 May 2023 10:36:00 -0400
Subject: [PATCH] refactor audio recording

---
 AudioRecorder.py                      | 15 +++++-----
 AudioTranscriber.py                   | 10 +++----
 custom_speech_recognition/__init__.py | 40 ++++++++-------------------
 3 files changed, 23 insertions(+), 42 deletions(-)

diff --git a/AudioRecorder.py b/AudioRecorder.py
index a71e132..057ae21 100644
--- a/AudioRecorder.py
+++ b/AudioRecorder.py
@@ -7,12 +7,11 @@ ENERGY_THRESHOLD = 1000
 DYNAMIC_ENERGY_THRESHOLD = False
 
 class BaseRecorder:
-    def __init__(self, source, num_channels, source_name):
+    def __init__(self, source, source_name):
         self.recorder = sr.Recognizer()
         self.recorder.energy_threshold = ENERGY_THRESHOLD
         self.recorder.dynamic_energy_threshold = DYNAMIC_ENERGY_THRESHOLD
         self.source = source
-        self.num_channels = num_channels
         self.source_name = source_name
 
     def adjust_for_noise(self):
@@ -30,7 +29,7 @@ class BaseRecorder:
 
 class DefaultMicRecorder(BaseRecorder):
     def __init__(self):
-        super().__init__(source=sr.Microphone(sample_rate=16000), num_channels=1, source_name="You")
+        super().__init__(source=sr.Microphone(sample_rate=16000), source_name="You")
         self.adjust_for_noise()
 
 class DefaultSpeakerRecorder(BaseRecorder):
@@ -47,8 +46,10 @@ class DefaultSpeakerRecorder(BaseRecorder):
                 else:
                     print("[ERROR] No loopback device found.")
 
-        source = sr.Microphone(sample_rate=int(default_speakers["defaultSampleRate"]),
-                               speaker=True,
-                               chunk_size=pyaudio.get_sample_size(pyaudio.paInt16))
-        super().__init__(source=source, num_channels=default_speakers["maxInputChannels"], source_name="Speaker")
+        source = sr.Microphone(speaker=True,
+                               device_index= default_speakers["index"],
+                               sample_rate=int(default_speakers["defaultSampleRate"]),
+                               chunk_size=pyaudio.get_sample_size(pyaudio.paInt16),
+                               channels=default_speakers["maxInputChannels"])
+        super().__init__(source=source, source_name="Speaker")
         self.adjust_for_noise()
\ No newline at end of file
diff --git a/AudioTranscriber.py b/AudioTranscriber.py
index ccf3d53..042e70a 100644
--- a/AudioTranscriber.py
+++ b/AudioTranscriber.py
@@ -24,7 +24,7 @@ class AudioTranscriber:
             "You": {
                 "sample_rate": default_mic.source.SAMPLE_RATE,
                 "sample_width": default_mic.source.SAMPLE_WIDTH,
-                "channels": default_mic.num_channels,
+                "channels": default_mic.source.channels,
                 "last_sample": bytes(),
                 "last_spoken": None,
                 "new_phrase": True,
@@ -33,7 +33,7 @@ class AudioTranscriber:
             "Speaker": {
                 "sample_rate": default_speaker.source.SAMPLE_RATE,
                 "sample_width": default_speaker.source.SAMPLE_WIDTH,
-                "channels": default_speaker.num_channels,
+                "channels": default_speaker.source.channels,
                 "last_sample": bytes(),
                 "last_spoken": None,
                 "new_phrase": True,
@@ -44,9 +44,8 @@ class AudioTranscriber:
     def transcribe_audio_queue(self, audio_queue):
         while True:
             who_spoke, data, time_spoken = audio_queue.get()
-            source_info = self.audio_sources[who_spoke]
-
             self.update_last_sample_and_phrase_status(who_spoke, data, time_spoken)
+            source_info = self.audio_sources[who_spoke]
             temp_file = source_info["process_data_func"](source_info["last_sample"])
             text = self.get_transcription(temp_file)
 
@@ -107,5 +106,4 @@ class AudioTranscriber:
 
     def clear_transcript_data(self):
         self.transcript_data["You"].clear()
-        self.transcript_data["Speaker"].clear()
-        
\ No newline at end of file
+        self.transcript_data["Speaker"].clear()
\ No newline at end of file
diff --git a/custom_speech_recognition/__init__.py b/custom_speech_recognition/__init__.py
index fc599ba..1d339b0 100644
--- a/custom_speech_recognition/__init__.py
+++ b/custom_speech_recognition/__init__.py
@@ -71,7 +71,7 @@ class Microphone(AudioSource):
 
     Higher ``chunk_size`` values help avoid triggering on rapidly changing ambient noise, but also makes detection less sensitive. This value, generally, should be left at its default.
     """
-    def __init__(self, device_index=None, sample_rate=None, chunk_size=1024, speaker=False):
+    def __init__(self, device_index=None, sample_rate=None, chunk_size=1024, speaker=False, channels = 1):
         assert device_index is None or isinstance(device_index, int), "Device index must be None or an integer"
         assert sample_rate is None or (isinstance(sample_rate, int) and sample_rate > 0), "Sample rate must be None or a positive integer"
         assert isinstance(chunk_size, int) and chunk_size > 0, "Chunk size must be a positive integer"
@@ -96,6 +96,7 @@ class Microphone(AudioSource):
         self.SAMPLE_WIDTH = self.pyaudio_module.get_sample_size(self.format)  # size of each sample
         self.SAMPLE_RATE = sample_rate  # sampling rate in Hertz
         self.CHUNK = chunk_size  # number of frames stored in each buffer
+        self.channels = channels
 
         self.audio = None
         self.stream = None
@@ -178,35 +179,16 @@ class Microphone(AudioSource):
         try:
             if self.speaker:
                 p = self.audio
-                pyaudio = self.pyaudio_module
-                try:
-                    wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI)
-                except:
-                    pass
-
-                default_speakers = p.get_device_info_by_index(wasapi_info["defaultOutputDevice"])
-                if not default_speakers["isLoopbackDevice"]:
-                    for loopback in p.get_loopback_device_info_generator():
-                        """
-                        Try to find loopback device with same name(and [Loopback suffix]).
-                        Unfortunately, this is the most adequate way at the moment.
-                        """
-                        if default_speakers["name"] in loopback["name"]:
-                            default_speakers = loopback
-                            break
-                    else:
-                        exit()
-
-                self.stream = Microphone.MicrophoneStream(
-                    p.open(
-                        input_device_index=default_speakers["index"],
-                        channels=default_speakers["maxInputChannels"],
-                        format=self.format,
-                        rate=int(default_speakers["defaultSampleRate"]),
-                        frames_per_buffer=pyaudio.get_sample_size(pyaudio.paInt16),
-                        input=True,
-                    )
+                self.stream = Microphone.MicrophoneStream(
+                    p.open(
+                        input_device_index=self.device_index,
+                        channels=self.channels,
+                        format=self.format,
+                        rate=self.SAMPLE_RATE,
+                        frames_per_buffer=self.CHUNK,
+                        input=True
                     )
+                )
             else:
                 self.stream = Microphone.MicrophoneStream(
                     self.audio.open(
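
Reviewer note, not part of the patch: after this refactor the WASAPI loopback discovery lives only in DefaultSpeakerRecorder, and Microphone simply opens a stream from the device_index, sample_rate, chunk_size, and channels it is handed. Below is a minimal sketch of exercising the new constructor outside the recorder classes. It assumes the PyAudioWPatch fork (imported as pyaudio, as the loopback calls deleted in this patch imply), the bundled custom_speech_recognition package, and the upstream Recognizer.record() API; treat it as illustrative, not as code from this repository.

    import pyaudiowpatch as pyaudio      # assumption: PyAudioWPatch provides the loopback API used here
    import custom_speech_recognition as sr

    # Find the default output device's loopback entry, mirroring DefaultSpeakerRecorder.
    p = pyaudio.PyAudio()
    try:
        wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI)
        default_speakers = p.get_device_info_by_index(wasapi_info["defaultOutputDevice"])
        if not default_speakers["isLoopbackDevice"]:
            for loopback in p.get_loopback_device_info_generator():
                if default_speakers["name"] in loopback["name"]:
                    default_speakers = loopback
                    break
    finally:
        p.terminate()

    # The Microphone no longer probes WASAPI itself; everything it needs is passed in.
    source = sr.Microphone(speaker=True,
                           device_index=default_speakers["index"],
                           sample_rate=int(default_speakers["defaultSampleRate"]),
                           chunk_size=pyaudio.get_sample_size(pyaudio.paInt16),
                           channels=default_speakers["maxInputChannels"])

    recognizer = sr.Recognizer()
    with source:
        audio = recognizer.record(source, duration=3)  # capture a few seconds of speaker audio

This also explains the AudioTranscriber change: channel count is now read from the source itself (default_mic.source.channels) instead of a separately tracked num_channels field.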