SevaSk 2023-05-09 00:10:55 -04:00
parent 832c8da00d
commit 2a5efca9d6
5 changed files with 71 additions and 188 deletions

AudioRecorder.py (new file)

@@ -0,0 +1,17 @@
import soundcard as sc
from Microphone import Microphone
import pythoncom

RECORDING_TIME = 5  # seconds captured per chunk
SAMPLE_RATE = 16000  # Hz, matching what Whisper expects downstream

class AudioRecorder:
    def __init__(self, microphone: Microphone):
        self.microphone = microphone

    def record_into_queue(self, audio_queue, source):
        # soundcard reaches WASAPI through COM on Windows, so each worker
        # thread must initialize COM for itself.
        pythoncom.CoInitialize()
        with sc.get_microphone(id=self.microphone.id, include_loopback=self.microphone.loop_back).recorder(samplerate=SAMPLE_RATE) as mic:
            while True:
                data = mic.record(numframes=SAMPLE_RATE * RECORDING_TIME)  # data is a frames x channels NumPy array
                audio_queue.put((source, data))
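On its own, the new recorder is just the producer half of a queue pipeline. A minimal usage sketch, assuming the Microphone class (not shown in this diff) resolves a device name to an id and exposes a loop_back flag, mirroring how main.py wires it up below; Windows-only given the pythoncom dependency:

import queue
import threading
import soundcard as sc
from AudioRecorder import AudioRecorder
from Microphone import Microphone

audio_queue = queue.Queue()
mic = Microphone(str(sc.default_microphone().name), False)
recorder = AudioRecorder(mic)

# The capture loop blocks forever, so it runs on its own thread; every
# RECORDING_TIME seconds a ("You", frames x channels ndarray) tuple is queued.
record_thread = threading.Thread(target=recorder.record_into_queue, args=(audio_queue, "You"), daemon=True)
record_thread.start()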

@@ -1,82 +0,0 @@
import pyaudio
import queue
def get_device_list():
devices = []
p = pyaudio.PyAudio()
info = p.get_host_api_info_by_index(0)
numdevices = info.get('deviceCount')
for i in range(0, numdevices):
if (p.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
devices.append(p.get_device_info_by_host_api_device_index(0, i).get('name'))
if (p.get_device_info_by_host_api_device_index(0, i).get('maxOutputChannels')) > 0:
devices.append(p.get_device_info_by_host_api_device_index(0, i).get('name'))
return devices
class AudioStream(object):
"""Opens a recording stream as a generator yielding the audio chunks."""
def __init__(self, rate, input_device_index):
self._rate = rate
self._chunk = int(rate / 10) # 100ms for 16000Hz
self.input_device_index = input_device_index
# Create a thread-safe buffer of audio data
self._buff = queue.Queue()
self.closed = True
def __enter__(self):
self._audio_interface = pyaudio.PyAudio()
self._audio_stream = self._audio_interface.open(
format=pyaudio.paInt16,
# The API currently only supports 1-channel (mono) audio
# https://goo.gl/z757pE
channels=1,
rate=self._rate,
input=True,
frames_per_buffer=self._chunk,
# Run the audio stream asynchronously to fill the buffer object.
# This is necessary so that the input device's buffer doesn't
# overflow while the calling thread makes network requests, etc.
stream_callback=self._fill_buffer,
input_device_index=self.input_device_index,
)
self.closed = False
return self
def __exit__(self, type, value, traceback):
self._audio_stream.stop_stream()
self._audio_stream.close()
self.closed = True
# Signal the generator to terminate so that the client's
# streaming_recognize method will not block the process termination.
self._buff.put(None)
self._audio_interface.terminate()
def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
"""Continuously collect data from the audio stream, into the buffer."""
self._buff.put(in_data)
return None, pyaudio.paContinue
def generator(self):
while not self.closed:
# Use a blocking get() to ensure there's at least one chunk of
# data, and stop iteration if the chunk is None, indicating the
# end of the audio stream.
chunk = self._buff.get()
if chunk is None:
return
data = [chunk]
# Now consume whatever other data's still buffered.
while True:
try:
chunk = self._buff.get(block=False)
if chunk is None:
return
data.append(chunk)
except queue.Empty:
break
yield b"".join(data)

AudioTranscriber.py

@@ -1,68 +1,36 @@
import numpy as np
import soundcard as sc
import threading
import time
import queue
import whisper
import torch
import argparse
import wave
import os
from Microphone import Microphone
TRANSCRIPT_LIMIT = 10
RECORDING_TIME = 5
class AudioTranscriber:
    def __init__(self, lang: str, microphone: Microphone):
self.audio_np_array_queue = queue.Queue()
self.transcript_data = []
self.microphone = microphone
self.lang = lang
self.lock = threading.Lock()
parser = argparse.ArgumentParser()
parser.add_argument("--model", default="tiny", help="Model to use",
choices=["tiny", "base", "small", "medium", "large"])
parser.add_argument("--non_english", action='store_true',
help="Don't use the english model.")
parser.add_argument("--energy_threshold", default=1000,
help="Energy level for mic to detect.", type=int)
parser.add_argument("--record_timeout", default=2,
help="How real time the recording is in seconds.", type=float)
parser.add_argument("--phrase_timeout", default=3,
help="How much empty space between recordings before we "
"consider it a new line in the transcription.", type=float)
args = parser.parse_args()
# Load / Download model
model = args.model
if args.model != "large" and not args.non_english:
model = model + ".en"
def __init__(self):
self.transcript = []
        self.audio_model = whisper.load_model(os.path.join(os.getcwd(), 'tiny.en.pt'))
def transcribe(self, audio_data):
with wave.open(f'temp_{id(self)}.wav', 'wb') as wav_file:
wav_file.setnchannels(audio_data.shape[1])
wav_file.setsampwidth(2)
wav_file.setframerate(16000)
audio_data = (audio_data * (2**15 - 1)).astype(np.int16)
wav_file.writeframes(audio_data.tobytes())
result = self.audio_model.transcribe(f'temp_{id(self)}.wav', fp16=torch.cuda.is_available())
        print(torch.cuda.is_available())  # debug: shows whether CUDA/fp16 is in use
text = result['text'].strip()
return text
def create_transcription_from_queue(self, audio_queue):
while True:
            source, audio_data = audio_queue.get()
audio_data_transcription = self.transcribe(audio_data)
# whisper gives "you" on many null inputs
if audio_data_transcription != '' and audio_data_transcription.lower() != 'you':
self.transcript = [source + ": [" + audio_data_transcription + ']\n\n'] + self.transcript
def get_transcript(self):
return self.transcript_data
def record_into_queue(self):
SAMPLE_RATE = 16000
with sc.get_microphone(id=self.microphone.id, include_loopback=self.microphone.loop_back).recorder(samplerate=SAMPLE_RATE) as mic:
while True:
data = mic.record(numframes=SAMPLE_RATE*RECORDING_TIME) # data is a frames x channels Numpy array.
self.audio_np_array_queue.put(data)
return
def transcribe_from_queue(self):
with self.lock:
while True:
audio_data = self.audio_np_array_queue.get()
with wave.open(f'temp_{self.microphone.id}.wav', 'wb') as wav_file:
wav_file.setnchannels(audio_data.shape[1])
wav_file.setsampwidth(2)
wav_file.setframerate(16000)
audio_data = (audio_data * (2**15 - 1)).astype(np.int16)
wav_file.writeframes(audio_data.tobytes())
result = self.audio_model.transcribe(f'temp_{self.microphone.id}.wav', fp16=torch.cuda.is_available())
text = result['text'].strip()
if text != '' and text.lower() != 'you': # whisper gives "you" on many null inputs
timestamp = int(time.time())
self.transcript_data.append({'utterance': text, 'timestamp': timestamp})
return "".join(self.transcript)

GPTResponder.py

@@ -1,17 +1,15 @@
import openai
from keys import OPENAI_API_KEY
from prompts import create_prompt, INITIAL_RESPONSE
import time
openai.api_key = OPENAI_API_KEY
class GPTResponder:
def __init__(self):
self.last_transcript = ""
self.last_response = INITIAL_RESPONSE
def generate_response_from_transcript(self, transcript):
if transcript == self.last_transcript:
return self.last_response
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo-0301",
messages=[{"role": "system", "content": create_prompt(transcript)}],
@@ -19,9 +17,6 @@ class GPTResponder:
)
full_response = response.choices[0].message.content
try:
conversational_response = full_response.split('[')[1].split(']')[0]
self.last_response = full_response.split('[')[1].split(']')[0]
        except Exception:
return self.last_response
self.last_transcript = transcript
self.last_response = conversational_response
return conversational_response
pass
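Driving the responder takes one call per transcript snapshot. A hedged sketch (requires a valid OPENAI_API_KEY in keys.py and the prompts module, neither of which appears in this diff; the call makes a live OpenAI API request):

from GPTResponder import GPTResponder

responder = GPTResponder()
# Returns the bracketed suggestion parsed out of the model's reply, or the
# cached last_response if parsing fails or the transcript is unchanged.
reply = responder.generate_response_from_transcript("You: [how do I reverse a linked list?]\n\n")
print(reply)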

main.py

@@ -1,38 +1,24 @@
# pyinstaller --onedir --add-data "C:/Users/mcfar/AppData/Local/Programs/Python/Python310/Lib/site-packages/customtkinter;customtkinter/" --noconfirm --windowed --noconsole main.py
import soundcard as sc
import threading
from AudioTranscriber import AudioTranscriber, TRANSCRIPT_LIMIT
from gpt_responder import GPTResponder
from AudioTranscriber import AudioTranscriber
from GPTResponder import GPTResponder
import customtkinter as ctk
from Microphone import Microphone
import soundcard as sc
from AudioRecorder import AudioRecorder
import queue
def write_in_textbox(textbox, text):
textbox.delete("0.0", "end")
textbox.insert("0.0", text)
#TODO make fast leetcode :)
def create_transcript_string(transcriber_mic, transcriber_speaker, reverse = True):
transcript_string = ""
mic_transcript = transcriber_mic.get_transcript()
speaker_transcript = transcriber_speaker.get_transcript()
total_transcript = [('You', data) for data in mic_transcript] + [('Speaker', data) for data in speaker_transcript]
sorted_transcript = sorted(total_transcript, key = lambda x: x[1]['timestamp'], reverse = reverse)
for source, line in sorted_transcript[:TRANSCRIPT_LIMIT]:
transcript_string += source + ": [" + line['utterance'] + ']\n\n'
return transcript_string
def update_transcript_UI(transcriber_mic, transcriber_thread_mic, transcriber_speaker, transcriber_thread_speaker, textbox):
transcript_string = create_transcript_string(transcriber_mic, transcriber_speaker, reverse=True)
def update_transcript_UI(transcriber, textbox):
transcript_string = transcriber.get_transcript()
textbox.delete("0.0", "end")
textbox.insert("0.0", transcript_string)
textbox.after(200, update_transcript_UI, transcriber_mic, transcriber_thread_mic, transcriber_speaker, transcriber_thread_speaker, textbox)
textbox.after(300, update_transcript_UI, transcriber, textbox)
def update_response_UI(transcriber_mic, transcriber_speaker, responder, textbox, update_interval_slider_label, update_interval_slider):
transcript_string = create_transcript_string(transcriber_mic, transcriber_speaker,reverse=False)
t = threading.Thread(target=lambda: responder.generate_response_from_transcript(transcript_string))
t.start()
#transcript_string = create_transcript_string(transcriber_mic, transcriber_speaker,reverse=False)
textbox.configure(state="normal")
textbox.delete("0.0", "end")
textbox.insert("0.0", responder.last_response)
@@ -61,7 +47,7 @@ if __name__ == "__main__":
response_textbox.grid(row=0, column=1, padx=10, pady=20, sticky="nsew")
# Add the clear transcript button to the UI
clear_transcript_button = ctk.CTkButton(root, text="Clear Transcript", command=lambda: clear_transcript_data(transcriber_mic, transcriber_speaker))
    clear_transcript_button = ctk.CTkButton(root, text="Clear Transcript", command=lambda: clear_transcript_data(global_transcriber))
clear_transcript_button.grid(row=1, column=0, padx=10, pady=3, sticky="nsew")
# empty label, necessary for proper grid spacing
    update_interval_slider_label = ctk.CTkLabel(root, text="", font=("Arial", 12), text_color="#FFFCF2")
@@ -74,21 +60,23 @@ if __name__ == "__main__":
update_interval_slider_label = ctk.CTkLabel(root, text=f"Update interval: {update_interval_slider.get()} seconds", font=("Arial", 12), text_color="#FFFCF2")
update_interval_slider_label.grid(row=2, column=1, padx=10, pady=10, sticky="nsew")
responder = GPTResponder()
audio_queue = queue.Queue()
    user_microphone = Microphone(str(sc.default_microphone().name), False)
    transcriber_mic = AudioTranscriber(lang='en-US', microphone=user_microphone)
recorder_thread_mic = threading.Thread(target=transcriber_mic.record_into_queue)
transcriber_thread_mic = threading.Thread(target=transcriber_mic.transcribe_from_queue)
recorder_thread_mic.start()
transcriber_thread_mic.start()
    user_audio_recorder = AudioRecorder(user_microphone)
record_user = threading.Thread(target=user_audio_recorder.record_into_queue, args=(audio_queue, "You",))
record_user.start()
    speaker_microphone = Microphone(str(sc.default_speaker().name), True)
    transcriber_speaker = AudioTranscriber(lang='en-US', microphone=speaker_microphone)
recorder_thread_speaker = threading.Thread(target=transcriber_speaker.record_into_queue)
transcriber_thread_speaker = threading.Thread(target=transcriber_speaker.transcribe_from_queue)
recorder_thread_speaker.start()
transcriber_thread_speaker.start()
    speaker_audio_recorder = AudioRecorder(speaker_microphone)
record_speaker = threading.Thread(target=speaker_audio_recorder.record_into_queue, args=(audio_queue, "Speaker",))
record_speaker.start()
global_transcriber = AudioTranscriber()
transcribe = threading.Thread(target=global_transcriber.create_transcription_from_queue, args=(audio_queue,))
transcribe.start()
root.grid_rowconfigure(0, weight=100)
root.grid_rowconfigure(1, weight=10)
@@ -98,10 +86,7 @@ if __name__ == "__main__":
root.grid_columnconfigure(0, weight=2)
root.grid_columnconfigure(1, weight=1)
update_transcript_UI(transcriber_mic, transcriber_thread_mic, transcriber_speaker, transcriber_thread_speaker, transcript_textbox)
update_response_UI(transcriber_mic, transcriber_speaker, responder, response_textbox, update_interval_slider_label, update_interval_slider)
update_transcript_UI(global_transcriber, transcript_textbox)
#update_response_UI(user_transcriber, transcriber_speaker, responder, response_textbox, update_interval_slider_label, update_interval_slider)
root.mainloop()
transcriber_thread_mic.join()
transcriber_thread_speaker.join()
root.mainloop()
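One loose end: clear_transcript_data, wired to the Clear Transcript button above, is not defined anywhere in this diff. A minimal sketch of what it would have to do against the new single-transcriber design, hypothetical and not part of this commit:

def clear_transcript_data(transcriber):
    # Drop accumulated utterances; the next update_transcript_UI tick
    # (every 300 ms) then renders an empty textbox. Rebinding the list is
    # effectively atomic under the GIL, so no lock is taken here.
    transcriber.transcript = []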