From 2a5efca9d6aef387e25963f37a0c948ddf9356ff Mon Sep 17 00:00:00 2001
From: SevaSk
Date: Tue, 9 May 2023 00:10:55 -0400
Subject: [PATCH] refactor: move audio capture into AudioRecorder and feed one
 shared transcription queue

---
 AudioRecorder.py                    | 17 ++++++
 AudioStream.py                      | 82 -----------------------------
 AudioTranscriber.py                 | 82 +++++++++--------------
 gpt_responder.py => GPTResponder.py | 11 ++--
 main.py                             | 67 +++++++++--------------
 5 files changed, 71 insertions(+), 188 deletions(-)
 create mode 100644 AudioRecorder.py
 delete mode 100644 AudioStream.py
 rename gpt_responder.py => GPTResponder.py (61%)

diff --git a/AudioRecorder.py b/AudioRecorder.py
new file mode 100644
index 0000000..93ef28c
--- /dev/null
+++ b/AudioRecorder.py
@@ -0,0 +1,17 @@
+import soundcard as sc
+from Microphone import Microphone
+import pythoncom
+
+RECORDING_TIME = 5
+SAMPLE_RATE = 16000
+
+class AudioRecorder:
+    def __init__(self, microphone : Microphone):
+        self.microphone = microphone
+
+    def record_into_queue(self, audio_queue, source):
+        pythoncom.CoInitialize()
+        with sc.get_microphone(id=self.microphone.id, include_loopback=self.microphone.loop_back).recorder(samplerate=SAMPLE_RATE) as mic:
+            while True:
+                data = mic.record(numframes=SAMPLE_RATE*RECORDING_TIME) # data is a frames x channels Numpy array.
+                audio_queue.put((source, data))
\ No newline at end of file
diff --git a/AudioStream.py b/AudioStream.py
deleted file mode 100644
index 20e2a9e..0000000
--- a/AudioStream.py
+++ /dev/null
@@ -1,82 +0,0 @@
-import pyaudio
-import queue
-
-def get_device_list():
-    devices = []
-    p = pyaudio.PyAudio()
-    info = p.get_host_api_info_by_index(0)
-    numdevices = info.get('deviceCount')
-    for i in range(0, numdevices):
-        if (p.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
-            devices.append(p.get_device_info_by_host_api_device_index(0, i).get('name'))
-        if (p.get_device_info_by_host_api_device_index(0, i).get('maxOutputChannels')) > 0:
-            devices.append(p.get_device_info_by_host_api_device_index(0, i).get('name'))
-    return devices
-
-class AudioStream(object):
-    """Opens a recording stream as a generator yielding the audio chunks."""
-
-    def __init__(self, rate, input_device_index):
-        self._rate = rate
-        self._chunk = int(rate / 10)  # 100ms for 16000Hz
-        self.input_device_index = input_device_index
-        # Create a thread-safe buffer of audio data
-        self._buff = queue.Queue()
-        self.closed = True
-
-    def __enter__(self):
-        self._audio_interface = pyaudio.PyAudio()
-        self._audio_stream = self._audio_interface.open(
-            format=pyaudio.paInt16,
-            # The API currently only supports 1-channel (mono) audio
-            # https://goo.gl/z757pE
-            channels=1,
-            rate=self._rate,
-            input=True,
-            frames_per_buffer=self._chunk,
-            # Run the audio stream asynchronously to fill the buffer object.
-            # This is necessary so that the input device's buffer doesn't
-            # overflow while the calling thread makes network requests, etc.
-            stream_callback=self._fill_buffer,
-            input_device_index=self.input_device_index,
-        )
-
-        self.closed = False
-
-        return self
-
-    def __exit__(self, type, value, traceback):
-        self._audio_stream.stop_stream()
-        self._audio_stream.close()
-        self.closed = True
-        # Signal the generator to terminate so that the client's
-        # streaming_recognize method will not block the process termination.
-        self._buff.put(None)
-        self._audio_interface.terminate()
-
-    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
-        """Continuously collect data from the audio stream, into the buffer."""
-        self._buff.put(in_data)
-        return None, pyaudio.paContinue
-
-    def generator(self):
-        while not self.closed:
-            # Use a blocking get() to ensure there's at least one chunk of
-            # data, and stop iteration if the chunk is None, indicating the
-            # end of the audio stream.
-            chunk = self._buff.get()
-            if chunk is None:
-                return
-            data = [chunk]
-
-            # Now consume whatever other data's still buffered.
-            while True:
-                try:
-                    chunk = self._buff.get(block=False)
-                    if chunk is None:
-                        return
-                    data.append(chunk)
-                except queue.Empty:
-                    break
-
-            yield b"".join(data)
\ No newline at end of file
diff --git a/AudioTranscriber.py b/AudioTranscriber.py
index e0d76f3..352aa7d 100644
--- a/AudioTranscriber.py
+++ b/AudioTranscriber.py
@@ -1,68 +1,36 @@
 import numpy as np
 import soundcard as sc
-import threading
-import time
-import queue
 import whisper
 import torch
-import argparse
 import wave
 import os
-from Microphone import Microphone
-
-TRANSCRIPT_LIMIT = 10
-RECORDING_TIME = 5
 
 class AudioTranscriber:
-    def __init__(self, lang: str, microphone : Microphone):
-        self.audio_np_array_queue = queue.Queue()
-        self.transcript_data = []
-        self.microphone = microphone
-        self.lang = lang
-        self.lock = threading.Lock()
-
-        parser = argparse.ArgumentParser()
-        parser.add_argument("--model", default="tiny", help="Model to use",
-                            choices=["tiny", "base", "small", "medium", "large"])
-        parser.add_argument("--non_english", action='store_true',
-                            help="Don't use the english model.")
-        parser.add_argument("--energy_threshold", default=1000,
-                            help="Energy level for mic to detect.", type=int)
-        parser.add_argument("--record_timeout", default=2,
-                            help="How real time the recording is in seconds.", type=float)
-        parser.add_argument("--phrase_timeout", default=3,
-                            help="How much empty space between recordings before we "
-                                 "consider it a new line in the transcription.", type=float)
-        args = parser.parse_args()
-        # Load / Download model
-        model = args.model
-        if args.model != "large" and not args.non_english:
-            model = model + ".en"
+    def __init__(self):
+        self.transcript = []
         self.audio_model = whisper.load_model(os.getcwd() + r'\tiny.en' + '.pt')
 
+    def transcribe(self, audio_data):
+        with wave.open(f'temp_{id(self)}.wav', 'wb') as wav_file:
+            wav_file.setnchannels(audio_data.shape[1])
+            wav_file.setsampwidth(2)
+            wav_file.setframerate(16000)
+            audio_data = (audio_data * (2**15 - 1)).astype(np.int16)
+            wav_file.writeframes(audio_data.tobytes())
+        result = self.audio_model.transcribe(f'temp_{id(self)}.wav', fp16=torch.cuda.is_available())
+        print(torch.cuda.is_available())
+        text = result['text'].strip()
+        return text
+
+    def create_transcription_from_queue(self, audio_queue):
+        while True:
+            top_of_queue = audio_queue.get()
+            source = top_of_queue[0]
+            audio_data = top_of_queue[1]
+            audio_data_transcription = self.transcribe(audio_data)
+            # whisper gives "you" on many null inputs
+            if audio_data_transcription != '' and audio_data_transcription.lower() != 'you':
+                self.transcript = [source + ": [" + audio_data_transcription + ']\n\n'] + self.transcript
+
     def get_transcript(self):
-        return self.transcript_data
-
-    def record_into_queue(self):
-        SAMPLE_RATE = 16000
-        with sc.get_microphone(id=self.microphone.id, include_loopback=self.microphone.loop_back).recorder(samplerate=SAMPLE_RATE) as mic:
-            while True:
-                data = mic.record(numframes=SAMPLE_RATE*RECORDING_TIME) # data is a frames x channels Numpy array.
-                self.audio_np_array_queue.put(data)
-        return
-
-    def transcribe_from_queue(self):
-        with self.lock:
-            while True:
-                audio_data = self.audio_np_array_queue.get()
-                with wave.open(f'temp_{self.microphone.id}.wav', 'wb') as wav_file:
-                    wav_file.setnchannels(audio_data.shape[1])
-                    wav_file.setsampwidth(2)
-                    wav_file.setframerate(16000)
-                    audio_data = (audio_data * (2**15 - 1)).astype(np.int16)
-                    wav_file.writeframes(audio_data.tobytes())
-                result = self.audio_model.transcribe(f'temp_{self.microphone.id}.wav', fp16=torch.cuda.is_available())
-                text = result['text'].strip()
-                if text != '' and text.lower() != 'you': # whisper gives "you" on many null inputs
-                    timestamp = int(time.time())
-                    self.transcript_data.append({'utterance': text, 'timestamp': timestamp})
\ No newline at end of file
+        return "".join(self.transcript)
\ No newline at end of file
diff --git a/gpt_responder.py b/GPTResponder.py
similarity index 61%
rename from gpt_responder.py
rename to GPTResponder.py
index d16b34a..8538aab 100644
--- a/gpt_responder.py
+++ b/GPTResponder.py
@@ -1,17 +1,15 @@
 import openai
 from keys import OPENAI_API_KEY
 from prompts import create_prompt, INITIAL_RESPONSE
+import time
 
 openai.api_key = OPENAI_API_KEY
 
 class GPTResponder:
     def __init__(self):
-        self.last_transcript = ""
         self.last_response = INITIAL_RESPONSE
 
     def generate_response_from_transcript(self, transcript):
-        if transcript == self.last_transcript:
-            return self.last_response
         response = openai.ChatCompletion.create(
             model="gpt-3.5-turbo-0301",
             messages=[{"role": "system", "content": create_prompt(transcript)}],
@@ -19,9 +17,6 @@ class GPTResponder:
         )
         full_response = response.choices[0].message.content
         try:
-            conversational_response = full_response.split('[')[1].split(']')[0]
+            self.last_response = full_response.split('[')[1].split(']')[0]
         except:
-            return self.last_response
-        self.last_transcript = transcript
-        self.last_response = conversational_response
-        return conversational_response
\ No newline at end of file
+            pass
\ No newline at end of file
diff --git a/main.py b/main.py
index 20385e7..8bdf178 100644
--- a/main.py
+++ b/main.py
@@ -1,38 +1,24 @@
-# pyinstaller --onedir --add-data "C:/Users/mcfar/AppData/Local/Programs/Python/Python310/Lib/site-packages/customtkinter;customtkinter/" --noconfirm --windowed --noconsole main.py
-
+import soundcard as sc
 import threading
-from AudioTranscriber import AudioTranscriber, TRANSCRIPT_LIMIT
-from gpt_responder import GPTResponder
+from AudioTranscriber import AudioTranscriber
+from GPTResponder import GPTResponder
 import customtkinter as ctk
 from Microphone import Microphone
-import soundcard as sc
+from AudioRecorder import AudioRecorder
+import queue
 
 def write_in_textbox(textbox, text):
     textbox.delete("0.0", "end")
     textbox.insert("0.0", text)
 
-#TODO make fast leetcode :)
-def create_transcript_string(transcriber_mic, transcriber_speaker, reverse = True):
-    transcript_string = ""
-
-    mic_transcript = transcriber_mic.get_transcript()
-    speaker_transcript = transcriber_speaker.get_transcript()
-    total_transcript = [('You', data) for data in mic_transcript] + [('Speaker', data) for data in speaker_transcript]
-    sorted_transcript = sorted(total_transcript, key = lambda x: x[1]['timestamp'], reverse = reverse)
-    for source, line in sorted_transcript[:TRANSCRIPT_LIMIT]:
-        transcript_string += source + ": [" + line['utterance'] + ']\n\n'
-    return transcript_string
-
-def update_transcript_UI(transcriber_mic, transcriber_thread_mic, transcriber_speaker, transcriber_thread_speaker, textbox):
-    transcript_string = create_transcript_string(transcriber_mic, transcriber_speaker, reverse=True)
+def update_transcript_UI(transcriber, textbox):
+    transcript_string = transcriber.get_transcript()
     textbox.delete("0.0", "end")
     textbox.insert("0.0", transcript_string)
-    textbox.after(200, update_transcript_UI, transcriber_mic, transcriber_thread_mic, transcriber_speaker, transcriber_thread_speaker, textbox)
+    textbox.after(300, update_transcript_UI, transcriber, textbox)
 
 def update_response_UI(transcriber_mic, transcriber_speaker, responder, textbox, update_interval_slider_label, update_interval_slider):
-    transcript_string = create_transcript_string(transcriber_mic, transcriber_speaker,reverse=False)
-    t = threading.Thread(target=lambda: responder.generate_response_from_transcript(transcript_string))
-    t.start()
+    #transcript_string = create_transcript_string(transcriber_mic, transcriber_speaker,reverse=False)
     textbox.configure(state="normal")
     textbox.delete("0.0", "end")
     textbox.insert("0.0", responder.last_response)
@@ -61,7 +47,7 @@ if __name__ == "__main__":
     response_textbox.grid(row=0, column=1, padx=10, pady=20, sticky="nsew")
 
     # Add the clear transcript button to the UI
-    clear_transcript_button = ctk.CTkButton(root, text="Clear Transcript", command=lambda: clear_transcript_data(transcriber_mic, transcriber_speaker))
+    clear_transcript_button = ctk.CTkButton(root, text="Clear Transcript", command=lambda: clear_transcript_data(global_transcriber))
     clear_transcript_button.grid(row=1, column=0, padx=10, pady=3, sticky="nsew")
     # empty label, necessary for proper grid spacing
     update_interval_slider_label = ctk.CTkLabel(root, text=f"", font=("Arial", 12), text_color="#FFFCF2")
@@ -74,21 +60,23 @@
     update_interval_slider_label = ctk.CTkLabel(root, text=f"Update interval: {update_interval_slider.get()} seconds", font=("Arial", 12), text_color="#FFFCF2")
     update_interval_slider_label.grid(row=2, column=1, padx=10, pady=10, sticky="nsew")
 
-    responder = GPTResponder()
+    audio_queue = queue.Queue()
 
     user_mirophone = Microphone(str(sc.default_microphone().name), False)
-    transcriber_mic = AudioTranscriber(lang='en-US', microphone=user_mirophone)
-    recorder_thread_mic = threading.Thread(target=transcriber_mic.record_into_queue)
-    transcriber_thread_mic = threading.Thread(target=transcriber_mic.transcribe_from_queue)
-    recorder_thread_mic.start()
-    transcriber_thread_mic.start()
+    user_audio_recorder = AudioRecorder(user_mirophone)
+
+    record_user = threading.Thread(target=user_audio_recorder.record_into_queue, args=(audio_queue, "You",))
+    record_user.start()
 
     speaker_mirophone = Microphone(str(sc.default_speaker().name), True)
-    transcriber_speaker = AudioTranscriber(lang='en-US', microphone=speaker_mirophone)
-    recorder_thread_speaker = threading.Thread(target=transcriber_speaker.record_into_queue)
-    transcriber_thread_speaker = threading.Thread(target=transcriber_speaker.transcribe_from_queue)
-    recorder_thread_speaker.start()
-    transcriber_thread_speaker.start()
+    speaker_audio_recorder = AudioRecorder(speaker_mirophone)
+
+    record_speaker = threading.Thread(target=speaker_audio_recorder.record_into_queue, args=(audio_queue, "Speaker",))
+    record_speaker.start()
+
+    global_transcriber = AudioTranscriber()
+    transcribe = threading.Thread(target=global_transcriber.create_transcription_from_queue, args=(audio_queue,))
+    transcribe.start()
 
     root.grid_rowconfigure(0, weight=100)
     root.grid_rowconfigure(1, weight=10)
@@ -98,10 +86,7 @@ if __name__ == "__main__":
     root.grid_columnconfigure(0, weight=2)
     root.grid_columnconfigure(1, weight=1)
 
-    update_transcript_UI(transcriber_mic, transcriber_thread_mic, transcriber_speaker, transcriber_thread_speaker, transcript_textbox)
-    update_response_UI(transcriber_mic, transcriber_speaker, responder, response_textbox, update_interval_slider_label, update_interval_slider)
+    update_transcript_UI(global_transcriber, transcript_textbox)
+    #update_response_UI(user_transcriber, transcriber_speaker, responder, response_textbox, update_interval_slider_label, update_interval_slider)
 
-    root.mainloop()
-
-    transcriber_thread_mic.join()
-    transcriber_thread_speaker.join()
\ No newline at end of file
+    root.mainloop()
\ No newline at end of file
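
The shape of this refactor is a fan-in producer/consumer pipeline: each AudioRecorder thread tags every captured chunk with a source label ("You" or "Speaker") and pushes it onto one shared queue.Queue, and a single AudioTranscriber thread drains that queue, so one Whisper model serves both devices. Below is a minimal, hardware-free sketch of that wiring; fake_recorder and consume are illustrative stand-ins for AudioRecorder.record_into_queue and AudioTranscriber.create_transcription_from_queue (the real producers block on mic.record and loop forever with no stop sentinel), not code from this patch.

import queue
import threading
import time

def fake_recorder(audio_queue, source, n_chunks=3):
    # Producer: like AudioRecorder.record_into_queue, tag each chunk with
    # its source before enqueueing it on the shared queue.
    for i in range(n_chunks):
        time.sleep(0.1)  # stands in for the blocking mic.record(...) call
        audio_queue.put((source, "chunk-%d" % i))

def consume(audio_queue, transcript):
    # Consumer: like AudioTranscriber.create_transcription_from_queue, a
    # single loop drains the queue no matter how many producers feed it.
    while True:
        item = audio_queue.get()
        if item is None:  # stop sentinel, used only by this sketch
            break
        source, data = item
        transcript.insert(0, source + ": [" + str(data) + "]\n\n")  # newest first

audio_queue = queue.Queue()
transcript = []
producers = [
    threading.Thread(target=fake_recorder, args=(audio_queue, "You")),
    threading.Thread(target=fake_recorder, args=(audio_queue, "Speaker")),
]
consumer = threading.Thread(target=consume, args=(audio_queue, transcript))
consumer.start()
for p in producers:
    p.start()
for p in producers:
    p.join()
audio_queue.put(None)  # tell the consumer to exit
consumer.join()
print("".join(transcript))

Funneling both sources through one queue is what lets the patch drop the per-device transcriber threads and the per-transcriber lock they needed: queue.Queue serializes hand-off, so the lone consumer never races another transcriber for the model or the transcript list.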