From 2a5efca9d6aef387e25963f37a0c948ddf9356ff Mon Sep 17 00:00:00 2001
From: SevaSk
Date: Tue, 9 May 2023 00:10:55 -0400
Subject: [PATCH] refactor: move audio capture into AudioRecorder and feed one
 shared transcription queue

---
 AudioRecorder.py                    | 17 ++++++
 AudioStream.py                      | 82 -----------------------------
 AudioTranscriber.py                 | 82 +++++++++--------------
 gpt_responder.py => GPTResponder.py | 11 ++--
 main.py                             | 67 +++++++++--------------
 5 files changed, 71 insertions(+), 188 deletions(-)
 create mode 100644 AudioRecorder.py
 delete mode 100644 AudioStream.py
 rename gpt_responder.py => GPTResponder.py (61%)

diff --git a/AudioRecorder.py b/AudioRecorder.py
new file mode 100644
index 0000000..93ef28c
--- /dev/null
+++ b/AudioRecorder.py
@@ -0,0 +1,17 @@
+import soundcard as sc
+from Microphone import Microphone
+import pythoncom
+
+RECORDING_TIME = 5
+SAMPLE_RATE = 16000
+
+class AudioRecorder:
+    def __init__(self, microphone : Microphone):
+        self.microphone = microphone
+
+    def record_into_queue(self, audio_queue, source):
+        pythoncom.CoInitialize()
+        with sc.get_microphone(id=self.microphone.id, include_loopback=self.microphone.loop_back).recorder(samplerate=SAMPLE_RATE) as mic:
+            while True:
+                data = mic.record(numframes=SAMPLE_RATE*RECORDING_TIME) # data is a frames x channels Numpy array.
+                audio_queue.put((source, data))
\ No newline at end of file
diff --git a/AudioStream.py b/AudioStream.py
deleted file mode 100644
index 20e2a9e..0000000
--- a/AudioStream.py
+++ /dev/null
@@ -1,82 +0,0 @@
-import pyaudio
-import queue
-
-def get_device_list():
-    devices = []
-    p = pyaudio.PyAudio()
-    info = p.get_host_api_info_by_index(0)
-    numdevices = info.get('deviceCount')
-    for i in range(0, numdevices):
-        if (p.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
-            devices.append(p.get_device_info_by_host_api_device_index(0, i).get('name'))
-        if (p.get_device_info_by_host_api_device_index(0, i).get('maxOutputChannels')) > 0:
-            devices.append(p.get_device_info_by_host_api_device_index(0, i).get('name'))
-    return devices
-
-class AudioStream(object):
-    """Opens a recording stream as a generator yielding the audio chunks."""
-
-    def __init__(self, rate, input_device_index):
-        self._rate = rate
-        self._chunk = int(rate / 10)  # 100ms for 16000Hz
-        self.input_device_index = input_device_index
-        # Create a thread-safe buffer of audio data
-        self._buff = queue.Queue()
-        self.closed = True
-
-    def __enter__(self):
-        self._audio_interface = pyaudio.PyAudio()
-        self._audio_stream = self._audio_interface.open(
-            format=pyaudio.paInt16,
-            # The API currently only supports 1-channel (mono) audio
-            # https://goo.gl/z757pE
-            channels=1,
-            rate=self._rate,
-            input=True,
-            frames_per_buffer=self._chunk,
-            # Run the audio stream asynchronously to fill the buffer object.
-            # This is necessary so that the input device's buffer doesn't
-            # overflow while the calling thread makes network requests, etc.
-            stream_callback=self._fill_buffer,
-            input_device_index=self.input_device_index,
-        )
-
-        self.closed = False
-
-        return self
-
-    def __exit__(self, type, value, traceback):
-        self._audio_stream.stop_stream()
-        self._audio_stream.close()
-        self.closed = True
-        # Signal the generator to terminate so that the client's
-        # streaming_recognize method will not block the process termination.
-        self._buff.put(None)
-        self._audio_interface.terminate()
-
-    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
-        """Continuously collect data from the audio stream, into the buffer."""
-        self._buff.put(in_data)
-        return None, pyaudio.paContinue
-
-    def generator(self):
-        while not self.closed:
-            # Use a blocking get() to ensure there's at least one chunk of
-            # data, and stop iteration if the chunk is None, indicating the
-            # end of the audio stream.
-            chunk = self._buff.get()
-            if chunk is None:
-                return
-            data = [chunk]
-
-            # Now consume whatever other data's still buffered.
-            while True:
-                try:
-                    chunk = self._buff.get(block=False)
-                    if chunk is None:
-                        return
-                    data.append(chunk)
-                except queue.Empty:
-                    break
-
-            yield b"".join(data)
\ No newline at end of file
diff --git a/AudioTranscriber.py b/AudioTranscriber.py
index e0d76f3..352aa7d 100644
--- a/AudioTranscriber.py
+++ b/AudioTranscriber.py
@@ -1,68 +1,36 @@
 import numpy as np
 import soundcard as sc
-import threading
-import time
-import queue
 import whisper
 import torch
-import argparse
 import wave
 import os
-from Microphone import Microphone
-
-TRANSCRIPT_LIMIT = 10
-RECORDING_TIME = 5
 
 class AudioTranscriber:
-    def __init__(self, lang: str, microphone : Microphone):
-        self.audio_np_array_queue = queue.Queue()
-        self.transcript_data = []
-        self.microphone = microphone
-        self.lang = lang
-        self.lock = threading.Lock()
-
-        parser = argparse.ArgumentParser()
-        parser.add_argument("--model", default="tiny", help="Model to use",
-                            choices=["tiny", "base", "small", "medium", "large"])
-        parser.add_argument("--non_english", action='store_true',
-                            help="Don't use the english model.")
-        parser.add_argument("--energy_threshold", default=1000,
-                            help="Energy level for mic to detect.", type=int)
-        parser.add_argument("--record_timeout", default=2,
-                            help="How real time the recording is in seconds.", type=float)
-        parser.add_argument("--phrase_timeout", default=3,
-                            help="How much empty space between recordings before we "
-                                 "consider it a new line in the transcription.", type=float)
-        args = parser.parse_args()
-        # Load / Download model
-        model = args.model
-        if args.model != "large" and not args.non_english:
-            model = model + ".en"
+    def __init__(self):
+        self.transcript = []
         self.audio_model = whisper.load_model(os.getcwd() + r'\tiny.en' + '.pt')
 
+    def transcribe(self, audio_data):
+        with wave.open(f'temp_{id(self)}.wav', 'wb') as wav_file:
+            wav_file.setnchannels(audio_data.shape[1])
+            wav_file.setsampwidth(2)
+            wav_file.setframerate(16000)
+            audio_data = (audio_data * (2**15 - 1)).astype(np.int16)
+            wav_file.writeframes(audio_data.tobytes())
+        result = self.audio_model.transcribe(f'temp_{id(self)}.wav', fp16=torch.cuda.is_available())
+        print(torch.cuda.is_available())
+        text = result['text'].strip()
+        return text
+
+    def create_transcription_from_queue(self, audio_queue):
+        while True:
+            top_of_queue = audio_queue.get()
+            source = top_of_queue[0]
+            audio_data = top_of_queue[1]
+            audio_data_transcription = self.transcribe(audio_data)
+            # whisper gives "you" on many null inputs
+            if audio_data_transcription != '' and audio_data_transcription.lower() != 'you':
+                self.transcript = [source + ": [" + audio_data_transcription + ']\n\n'] + self.transcript
+
     def get_transcript(self):
-        return self.transcript_data
-
-    def record_into_queue(self):
-        SAMPLE_RATE = 16000
-        with sc.get_microphone(id=self.microphone.id, include_loopback=self.microphone.loop_back).recorder(samplerate=SAMPLE_RATE) as mic:
-            while True:
-                data = mic.record(numframes=SAMPLE_RATE*RECORDING_TIME) # data is a frames x channels Numpy array.
-                self.audio_np_array_queue.put(data)
-        return
-
-    def transcribe_from_queue(self):
-        with self.lock:
-            while True:
-                audio_data = self.audio_np_array_queue.get()
-                with wave.open(f'temp_{self.microphone.id}.wav', 'wb') as wav_file:
-                    wav_file.setnchannels(audio_data.shape[1])
-                    wav_file.setsampwidth(2)
-                    wav_file.setframerate(16000)
-                    audio_data = (audio_data * (2**15 - 1)).astype(np.int16)
-                    wav_file.writeframes(audio_data.tobytes())
-                result = self.audio_model.transcribe(f'temp_{self.microphone.id}.wav', fp16=torch.cuda.is_available())
-                text = result['text'].strip()
-                if text != '' and text.lower() != 'you': # whisper gives "you" on many null inputs
-                    timestamp = int(time.time())
-                    self.transcript_data.append({'utterance': text, 'timestamp': timestamp})
\ No newline at end of file
+        return "".join(self.transcript)
\ No newline at end of file
diff --git a/gpt_responder.py b/GPTResponder.py
similarity index 61%
rename from gpt_responder.py
rename to GPTResponder.py
index d16b34a..8538aab 100644
--- a/gpt_responder.py
+++ b/GPTResponder.py
@@ -1,17 +1,15 @@
 import openai
 from keys import OPENAI_API_KEY
 from prompts import create_prompt, INITIAL_RESPONSE
+import time
 
 openai.api_key = OPENAI_API_KEY
 
 class GPTResponder:
     def __init__(self):
-        self.last_transcript = ""
         self.last_response = INITIAL_RESPONSE
 
     def generate_response_from_transcript(self, transcript):
-        if transcript == self.last_transcript:
-            return self.last_response
         response = openai.ChatCompletion.create(
             model="gpt-3.5-turbo-0301",
             messages=[{"role": "system", "content": create_prompt(transcript)}],
@@ -19,9 +17,6 @@ class GPTResponder:
         )
         full_response = response.choices[0].message.content
         try:
-            conversational_response = full_response.split('[')[1].split(']')[0]
+            self.last_response = full_response.split('[')[1].split(']')[0]
         except:
-            return self.last_response
-        self.last_transcript = transcript
-        self.last_response = conversational_response
-        return conversational_response
\ No newline at end of file
+            pass
\ No newline at end of file
diff --git a/main.py b/main.py
index 20385e7..8bdf178 100644
--- a/main.py
+++ b/main.py
@@ -1,38 +1,24 @@
-# pyinstaller --onedir --add-data "C:/Users/mcfar/AppData/Local/Programs/Python/Python310/Lib/site-packages/customtkinter;customtkinter/" --noconfirm --windowed --noconsole main.py
-
+import soundcard as sc
 import threading
-from AudioTranscriber import AudioTranscriber, TRANSCRIPT_LIMIT
-from gpt_responder import GPTResponder
+from AudioTranscriber import AudioTranscriber
+from GPTResponder import GPTResponder
 import customtkinter as ctk
 from Microphone import Microphone
-import soundcard as sc
+from AudioRecorder import AudioRecorder
+import queue
 
 def write_in_textbox(textbox, text):
     textbox.delete("0.0", "end")
     textbox.insert("0.0", text)
 
-#TODO make fast leetcode :)
-def create_transcript_string(transcriber_mic, transcriber_speaker, reverse = True):
-    transcript_string = ""
-
-    mic_transcript = transcriber_mic.get_transcript()
-    speaker_transcript = transcriber_speaker.get_transcript()
-    total_transcript = [('You', data) for data in mic_transcript] + [('Speaker', data) for data in speaker_transcript]
-    sorted_transcript = sorted(total_transcript, key = lambda x: x[1]['timestamp'], reverse = reverse)
-    for source, line in sorted_transcript[:TRANSCRIPT_LIMIT]:
-        transcript_string += source + ": [" + line['utterance'] + ']\n\n'
-    return transcript_string
-
-def update_transcript_UI(transcriber_mic, transcriber_thread_mic, transcriber_speaker, transcriber_thread_speaker, textbox):
-    transcript_string = create_transcript_string(transcriber_mic, transcriber_speaker, reverse=True)
+def update_transcript_UI(transcriber, textbox):
+    transcript_string = transcriber.get_transcript()
     textbox.delete("0.0", "end")
     textbox.insert("0.0", transcript_string)
-    textbox.after(200, update_transcript_UI, transcriber_mic, transcriber_thread_mic, transcriber_speaker, transcriber_thread_speaker, textbox)
+    textbox.after(300, update_transcript_UI, transcriber, textbox)
 
 def update_response_UI(transcriber_mic, transcriber_speaker, responder, textbox, update_interval_slider_label, update_interval_slider):
-    transcript_string = create_transcript_string(transcriber_mic, transcriber_speaker,reverse=False)
-    t = threading.Thread(target=lambda: responder.generate_response_from_transcript(transcript_string))
-    t.start()
+    #transcript_string = create_transcript_string(transcriber_mic, transcriber_speaker,reverse=False)
     textbox.configure(state="normal")
     textbox.delete("0.0", "end")
     textbox.insert("0.0", responder.last_response)
@@ -61,7 +47,7 @@ if __name__ == "__main__":
     response_textbox.grid(row=0, column=1, padx=10, pady=20, sticky="nsew")
 
     # Add the clear transcript button to the UI
-    clear_transcript_button = ctk.CTkButton(root, text="Clear Transcript", command=lambda: clear_transcript_data(transcriber_mic, transcriber_speaker))
+    clear_transcript_button = ctk.CTkButton(root, text="Clear Transcript", command=lambda: clear_transcript_data(global_transcriber))
     clear_transcript_button.grid(row=1, column=0, padx=10, pady=3, sticky="nsew")
     # empty label, necessary for proper grid spacing
     update_interval_slider_label = ctk.CTkLabel(root, text=f"", font=("Arial", 12), text_color="#FFFCF2")
@@ -74,21 +60,23 @@
     update_interval_slider_label = ctk.CTkLabel(root, text=f"Update interval: {update_interval_slider.get()} seconds", font=("Arial", 12), text_color="#FFFCF2")
     update_interval_slider_label.grid(row=2, column=1, padx=10, pady=10, sticky="nsew")
 
-    responder = GPTResponder()
+    audio_queue = queue.Queue()
 
     user_mirophone = Microphone(str(sc.default_microphone().name), False)
-    transcriber_mic = AudioTranscriber(lang='en-US', microphone=user_mirophone)
-    recorder_thread_mic = threading.Thread(target=transcriber_mic.record_into_queue)
-    transcriber_thread_mic = threading.Thread(target=transcriber_mic.transcribe_from_queue)
-    recorder_thread_mic.start()
-    transcriber_thread_mic.start()
+    user_audio_recorder = AudioRecorder(user_mirophone)
+
+    record_user = threading.Thread(target=user_audio_recorder.record_into_queue, args=(audio_queue, "You",))
+    record_user.start()
 
     speaker_mirophone = Microphone(str(sc.default_speaker().name), True)
-    transcriber_speaker = AudioTranscriber(lang='en-US', microphone=speaker_mirophone)
-    recorder_thread_speaker = threading.Thread(target=transcriber_speaker.record_into_queue)
-    transcriber_thread_speaker = threading.Thread(target=transcriber_speaker.transcribe_from_queue)
-    recorder_thread_speaker.start()
-    transcriber_thread_speaker.start()
+    speaker_audio_recorder = AudioRecorder(speaker_mirophone)
+
+    record_speaker = threading.Thread(target=speaker_audio_recorder.record_into_queue, args=(audio_queue, "Speaker",))
+    record_speaker.start()
+
+    global_transcriber = AudioTranscriber()
+    transcribe = threading.Thread(target=global_transcriber.create_transcription_from_queue, args=(audio_queue,))
+    transcribe.start()
 
     root.grid_rowconfigure(0, weight=100)
     root.grid_rowconfigure(1, weight=10)
@@ -98,10 +86,7 @@ if __name__ == "__main__":
     root.grid_columnconfigure(0, weight=2)
     root.grid_columnconfigure(1, weight=1)
 
-    update_transcript_UI(transcriber_mic, transcriber_thread_mic, transcriber_speaker, transcriber_thread_speaker, transcript_textbox)
-    update_response_UI(transcriber_mic, transcriber_speaker, responder, response_textbox, update_interval_slider_label, update_interval_slider)
+    update_transcript_UI(global_transcriber, transcript_textbox)
+    #update_response_UI(user_transcriber, transcriber_speaker, responder, response_textbox, update_interval_slider_label, update_interval_slider)
 
-    root.mainloop()
-
-    transcriber_thread_mic.join()
-    transcriber_thread_speaker.join()
\ No newline at end of file
+    root.mainloop()
\ No newline at end of file
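
The shape of this refactor is a fan-in producer/consumer pipeline: each AudioRecorder thread tags every captured chunk with a source label ("You" or "Speaker") and pushes it onto one shared queue.Queue, and a single AudioTranscriber thread drains that queue, so one Whisper model serves both devices. Below is a minimal, hardware-free sketch of that wiring; fake_recorder and consume are illustrative stand-ins for AudioRecorder.record_into_queue and AudioTranscriber.create_transcription_from_queue (the real producers block on mic.record and loop forever with no stop sentinel), not code from this patch.

import queue
import threading
import time

def fake_recorder(audio_queue, source, n_chunks=3):
    # Producer: like AudioRecorder.record_into_queue, tag each chunk with
    # its source before enqueueing it on the shared queue.
    for i in range(n_chunks):
        time.sleep(0.1)  # stands in for the blocking mic.record(...) call
        audio_queue.put((source, "chunk-%d" % i))

def consume(audio_queue, transcript):
    # Consumer: like AudioTranscriber.create_transcription_from_queue, a
    # single loop drains the queue no matter how many producers feed it.
    while True:
        item = audio_queue.get()
        if item is None:  # stop sentinel, used only by this sketch
            break
        source, data = item
        transcript.insert(0, source + ": [" + str(data) + "]\n\n")  # newest first

audio_queue = queue.Queue()
transcript = []
producers = [
    threading.Thread(target=fake_recorder, args=(audio_queue, "You")),
    threading.Thread(target=fake_recorder, args=(audio_queue, "Speaker")),
]
consumer = threading.Thread(target=consume, args=(audio_queue, transcript))
consumer.start()
for p in producers:
    p.start()
for p in producers:
    p.join()
audio_queue.put(None)  # tell the consumer to exit
consumer.join()
print("".join(transcript))

Funneling both sources through one queue is what lets the patch drop the per-device transcriber threads and the per-transcriber lock they needed: queue.Queue serializes hand-off, so the lone consumer never races another transcriber for the model or the transcript list.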