Mirror of https://github.com/SevaSk/ecoute.git (synced 2025-02-24 02:11:18 +00:00)

refactor

commit 2a5efca9d6, parent 832c8da00d
AudioRecorder.py (new file, +17)

@@ -0,0 +1,17 @@
+import soundcard as sc
+from Microphone import Microphone
+import pythoncom
+
+RECORDING_TIME = 5
+SAMPLE_RATE = 16000
+
+class AudioRecorder:
+    def __init__(self, microphone : Microphone):
+        self.microphone = microphone
+
+    def record_into_queue(self, audio_queue, source):
+        pythoncom.CoInitialize()
+        with sc.get_microphone(id=self.microphone.id, include_loopback=self.microphone.loop_back).recorder(samplerate=SAMPLE_RATE) as mic:
+            while True:
+                data = mic.record(numframes=SAMPLE_RATE*RECORDING_TIME) # data is a frames x channels Numpy array.
+                audio_queue.put((source, data))
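A minimal sketch of how the new recorder is driven, mirroring the wiring this commit adds to main.py below (it assumes the project's Microphone wrapper, which exposes the .id and .loop_back attributes used above):

import queue
import threading

import soundcard as sc

from AudioRecorder import AudioRecorder
from Microphone import Microphone

audio_queue = queue.Queue()

# main.py passes False for loop_back when wrapping a real input device.
user_microphone = Microphone(str(sc.default_microphone().name), False)
recorder = AudioRecorder(user_microphone)

# record_into_queue loops forever, so it runs on its own thread; every
# RECORDING_TIME seconds it enqueues a ("You", frames) tuple for a consumer.
threading.Thread(target=recorder.record_into_queue, args=(audio_queue, "You")).start()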
(deleted file)

@@ -1,82 +0,0 @@
-import pyaudio
-import queue
-
-def get_device_list():
-    devices = []
-    p = pyaudio.PyAudio()
-    info = p.get_host_api_info_by_index(0)
-    numdevices = info.get('deviceCount')
-    for i in range(0, numdevices):
-        if (p.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
-            devices.append(p.get_device_info_by_host_api_device_index(0, i).get('name'))
-        if (p.get_device_info_by_host_api_device_index(0, i).get('maxOutputChannels')) > 0:
-            devices.append(p.get_device_info_by_host_api_device_index(0, i).get('name'))
-    return devices
-
-class AudioStream(object):
-    """Opens a recording stream as a generator yielding the audio chunks."""
-
-    def __init__(self, rate, input_device_index):
-        self._rate = rate
-        self._chunk = int(rate / 10) # 100ms for 16000Hz
-        self.input_device_index = input_device_index
-        # Create a thread-safe buffer of audio data
-        self._buff = queue.Queue()
-        self.closed = True
-
-    def __enter__(self):
-        self._audio_interface = pyaudio.PyAudio()
-        self._audio_stream = self._audio_interface.open(
-            format=pyaudio.paInt16,
-            # The API currently only supports 1-channel (mono) audio
-            # https://goo.gl/z757pE
-            channels=1,
-            rate=self._rate,
-            input=True,
-            frames_per_buffer=self._chunk,
-            # Run the audio stream asynchronously to fill the buffer object.
-            # This is necessary so that the input device's buffer doesn't
-            # overflow while the calling thread makes network requests, etc.
-            stream_callback=self._fill_buffer,
-            input_device_index=self.input_device_index,
-        )
-
-        self.closed = False
-
-        return self
-
-    def __exit__(self, type, value, traceback):
-        self._audio_stream.stop_stream()
-        self._audio_stream.close()
-        self.closed = True
-        # Signal the generator to terminate so that the client's
-        # streaming_recognize method will not block the process termination.
-        self._buff.put(None)
-        self._audio_interface.terminate()
-
-    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
-        """Continuously collect data from the audio stream, into the buffer."""
-        self._buff.put(in_data)
-        return None, pyaudio.paContinue
-
-    def generator(self):
-        while not self.closed:
-            # Use a blocking get() to ensure there's at least one chunk of
-            # data, and stop iteration if the chunk is None, indicating the
-            # end of the audio stream.
-            chunk = self._buff.get()
-            if chunk is None:
-                return
-            data = [chunk]
-
-            # Now consume whatever other data's still buffered.
-            while True:
-                try:
-                    chunk = self._buff.get(block=False)
-                    if chunk is None:
-                        return
-                    data.append(chunk)
-                except queue.Empty:
-                    break
-
-            yield b"".join(data)
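For reference, the deleted class was designed to be consumed as a context-managed generator, per its docstring. A sketch of that usage, assuming the class above is in scope (device index 0 is a hypothetical placeholder; get_device_list() enumerated the real options):

with AudioStream(rate=16000, input_device_index=0) as stream:
    for chunk in stream.generator():
        # Each chunk is raw 16-bit mono PCM, batched from the callback buffer;
        # the old comments suggest these fed a streaming recognizer.
        print(f"got {len(chunk)} bytes of audio")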
AudioTranscriber.py

@@ -1,68 +1,36 @@
 import numpy as np
-import soundcard as sc
-import threading
-import time
-import queue
 import whisper
 import torch
-import argparse
 import wave
 import os
-from Microphone import Microphone
 
-TRANSCRIPT_LIMIT = 10
-RECORDING_TIME = 5
 
 class AudioTranscriber:
-    def __init__(self, lang: str, microphone : Microphone):
-        self.audio_np_array_queue = queue.Queue()
-        self.transcript_data = []
-        self.microphone = microphone
-        self.lang = lang
-        self.lock = threading.Lock()
-
-        parser = argparse.ArgumentParser()
-        parser.add_argument("--model", default="tiny", help="Model to use",
-                            choices=["tiny", "base", "small", "medium", "large"])
-        parser.add_argument("--non_english", action='store_true',
-                            help="Don't use the english model.")
-        parser.add_argument("--energy_threshold", default=1000,
-                            help="Energy level for mic to detect.", type=int)
-        parser.add_argument("--record_timeout", default=2,
-                            help="How real time the recording is in seconds.", type=float)
-        parser.add_argument("--phrase_timeout", default=3,
-                            help="How much empty space between recordings before we "
-                                 "consider it a new line in the transcription.", type=float)
-        args = parser.parse_args()
-        # Load / Download model
-        model = args.model
-        if args.model != "large" and not args.non_english:
-            model = model + ".en"
+    def __init__(self):
+        self.transcript = []
         self.audio_model = whisper.load_model(os.getcwd() + r'\tiny.en' + '.pt')
 
-    def get_transcript(self):
-        return self.transcript_data
-
-    def record_into_queue(self):
-        SAMPLE_RATE = 16000
-        with sc.get_microphone(id=self.microphone.id, include_loopback=self.microphone.loop_back).recorder(samplerate=SAMPLE_RATE) as mic:
-            while True:
-                data = mic.record(numframes=SAMPLE_RATE*RECORDING_TIME) # data is a frames x channels Numpy array.
-                self.audio_np_array_queue.put(data)
-        return
-
-    def transcribe_from_queue(self):
-        with self.lock:
-            while True:
-                audio_data = self.audio_np_array_queue.get()
-                with wave.open(f'temp_{self.microphone.id}.wav', 'wb') as wav_file:
+    def transcribe(self, audio_data):
+        with wave.open(f'temp_{id(self)}.wav', 'wb') as wav_file:
             wav_file.setnchannels(audio_data.shape[1])
             wav_file.setsampwidth(2)
             wav_file.setframerate(16000)
             audio_data = (audio_data * (2**15 - 1)).astype(np.int16)
             wav_file.writeframes(audio_data.tobytes())
-                result = self.audio_model.transcribe(f'temp_{self.microphone.id}.wav', fp16=torch.cuda.is_available())
+        result = self.audio_model.transcribe(f'temp_{id(self)}.wav', fp16=torch.cuda.is_available())
         print(torch.cuda.is_available())
         text = result['text'].strip()
-        if text != '' and text.lower() != 'you': # whisper gives "you" on many null inputs
-            timestamp = int(time.time())
-            self.transcript_data.append({'utterance': text, 'timestamp': timestamp})
+        return text
 
+    def create_transcription_from_queue(self, audio_queue):
+        while True:
+            top_of_queue = audio_queue.get()
+            source = top_of_queue[0]
+            audio_data = top_of_queue[1]
+            audio_data_transcription = self.transcribe(audio_data)
+            # whisper gives "you" on many null inputs
+            if audio_data_transcription != '' and audio_data_transcription.lower() != 'you':
+                self.transcript = [source + ": [" + audio_data_transcription + ']\n\n'] + self.transcript
+
+    def get_transcript(self):
+        return "".join(self.transcript)
gpt_responder.py → GPTResponder.py

@@ -1,17 +1,15 @@
 import openai
 from keys import OPENAI_API_KEY
 from prompts import create_prompt, INITIAL_RESPONSE
 import time
 
 openai.api_key = OPENAI_API_KEY
 
 class GPTResponder:
     def __init__(self):
         self.last_transcript = ""
         self.last_response = INITIAL_RESPONSE
 
     def generate_response_from_transcript(self, transcript):
-        if transcript == self.last_transcript:
-            return self.last_response
         response = openai.ChatCompletion.create(
             model="gpt-3.5-turbo-0301",
             messages=[{"role": "system", "content": create_prompt(transcript)}],
@@ -19,9 +17,6 @@ class GPTResponder:
         )
         full_response = response.choices[0].message.content
         try:
-            conversational_response = full_response.split('[')[1].split(']')[0]
+            self.last_response = full_response.split('[')[1].split(']')[0]
         except:
             return self.last_response
-        self.last_transcript = transcript
-        self.last_response = conversational_response
-        return conversational_response
-        pass
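Hypothetical usage of the responder (create_prompt and INITIAL_RESPONSE live in the project's prompts module, which this diff does not show). Note that main.py calls this from a throwaway thread and reads last_response rather than the return value:

from GPTResponder import GPTResponder

responder = GPTResponder()
transcript = "Speaker: [How would you reverse a linked list?]\n\n"

# Updates responder.last_response as a side effect: the bracketed part of the
# model's reply is extracted, and the previous response is kept on failure.
responder.generate_response_from_transcript(transcript)
print(responder.last_response)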
main.py (65 changes)

@@ -1,38 +1,24 @@
 # pyinstaller --onedir --add-data "C:/Users/mcfar/AppData/Local/Programs/Python/Python310/Lib/site-packages/customtkinter;customtkinter/" --noconfirm --windowed --noconsole main.py
 
-import soundcard as sc
 import threading
-from AudioTranscriber import AudioTranscriber, TRANSCRIPT_LIMIT
-from gpt_responder import GPTResponder
+from AudioTranscriber import AudioTranscriber
+from GPTResponder import GPTResponder
 import customtkinter as ctk
 from Microphone import Microphone
+import soundcard as sc
+from AudioRecorder import AudioRecorder
+import queue
 
-def write_in_textbox(textbox, text):
-    textbox.delete("0.0", "end")
-    textbox.insert("0.0", text)
-
-#TODO make fast leetcode :)
-def create_transcript_string(transcriber_mic, transcriber_speaker, reverse = True):
-    transcript_string = ""
-
-    mic_transcript = transcriber_mic.get_transcript()
-    speaker_transcript = transcriber_speaker.get_transcript()
-    total_transcript = [('You', data) for data in mic_transcript] + [('Speaker', data) for data in speaker_transcript]
-    sorted_transcript = sorted(total_transcript, key = lambda x: x[1]['timestamp'], reverse = reverse)
-    for source, line in sorted_transcript[:TRANSCRIPT_LIMIT]:
-        transcript_string += source + ": [" + line['utterance'] + ']\n\n'
-    return transcript_string
-
-def update_transcript_UI(transcriber_mic, transcriber_thread_mic, transcriber_speaker, transcriber_thread_speaker, textbox):
-    transcript_string = create_transcript_string(transcriber_mic, transcriber_speaker, reverse=True)
+def update_transcript_UI(transcriber, textbox):
+    transcript_string = transcriber.get_transcript()
     textbox.delete("0.0", "end")
     textbox.insert("0.0", transcript_string)
-    textbox.after(200, update_transcript_UI, transcriber_mic, transcriber_thread_mic, transcriber_speaker, transcriber_thread_speaker, textbox)
+    textbox.after(300, update_transcript_UI, transcriber, textbox)
 
 def update_response_UI(transcriber_mic, transcriber_speaker, responder, textbox, update_interval_slider_label, update_interval_slider):
-    transcript_string = create_transcript_string(transcriber_mic, transcriber_speaker,reverse=False)
     t = threading.Thread(target=lambda: responder.generate_response_from_transcript(transcript_string))
     t.start()
+    #transcript_string = create_transcript_string(transcriber_mic, transcriber_speaker,reverse=False)
     textbox.configure(state="normal")
     textbox.delete("0.0", "end")
     textbox.insert("0.0", responder.last_response)
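The UI refresh relies on Tk's after() timer rather than a thread. A standalone sketch of that self-rescheduling idiom (the widget names here are hypothetical, not from the diff):

import customtkinter as ctk

root = ctk.CTk()
textbox = ctk.CTkTextbox(root)
textbox.pack()

def poll(count=0):
    textbox.delete("0.0", "end")
    textbox.insert("0.0", f"tick {count}")
    # Re-arm the timer; like update_transcript_UI above, the callback
    # schedules itself every 300 ms on the Tk event loop.
    textbox.after(300, poll, count + 1)

poll()
root.mainloop()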
@@ -61,7 +47,7 @@ if __name__ == "__main__":
     response_textbox.grid(row=0, column=1, padx=10, pady=20, sticky="nsew")
 
     # Add the clear transcript button to the UI
-    clear_transcript_button = ctk.CTkButton(root, text="Clear Transcript", command=lambda: clear_transcript_data(transcriber_mic, transcriber_speaker))
+    clear_transcript_button = ctk.CTkButton(root, text="Clear Transcript", command=lambda: clear_transcript_data(user_transcriber, transcriber_speaker))
     clear_transcript_button.grid(row=1, column=0, padx=10, pady=3, sticky="nsew")
     # empty label, necessary for proper grid spacing
     update_interval_slider_label = ctk.CTkLabel(root, text=f"", font=("Arial", 12), text_color="#FFFCF2")

@@ -74,21 +60,23 @@ if __name__ == "__main__":
     update_interval_slider_label = ctk.CTkLabel(root, text=f"Update interval: {update_interval_slider.get()} seconds", font=("Arial", 12), text_color="#FFFCF2")
     update_interval_slider_label.grid(row=2, column=1, padx=10, pady=10, sticky="nsew")
 
     responder = GPTResponder()
+    audio_queue = queue.Queue()
 
     user_mirophone = Microphone(str(sc.default_microphone().name), False)
-    transcriber_mic = AudioTranscriber(lang='en-US', microphone=user_mirophone)
-    recorder_thread_mic = threading.Thread(target=transcriber_mic.record_into_queue)
-    transcriber_thread_mic = threading.Thread(target=transcriber_mic.transcribe_from_queue)
-    recorder_thread_mic.start()
-    transcriber_thread_mic.start()
+    user_audio_recorder = AudioRecorder(user_mirophone)
+
+    record_user = threading.Thread(target=user_audio_recorder.record_into_queue, args=(audio_queue, "You",))
+    record_user.start()
 
     speaker_mirophone = Microphone(str(sc.default_speaker().name), True)
-    transcriber_speaker = AudioTranscriber(lang='en-US', microphone=speaker_mirophone)
-    recorder_thread_speaker = threading.Thread(target=transcriber_speaker.record_into_queue)
-    transcriber_thread_speaker = threading.Thread(target=transcriber_speaker.transcribe_from_queue)
-    recorder_thread_speaker.start()
-    transcriber_thread_speaker.start()
+    speaker_audio_recorder = AudioRecorder(speaker_mirophone)
+
+    record_speaker = threading.Thread(target=speaker_audio_recorder.record_into_queue, args=(audio_queue, "Speaker",))
+    record_speaker.start()
+
+    global_transcriber = AudioTranscriber()
+    transcribe = threading.Thread(target=global_transcriber.create_transcription_from_queue, args=(audio_queue,))
+    transcribe.start()
 
     root.grid_rowconfigure(0, weight=100)
     root.grid_rowconfigure(1, weight=10)

@@ -98,10 +86,7 @@ if __name__ == "__main__":
     root.grid_columnconfigure(0, weight=2)
     root.grid_columnconfigure(1, weight=1)
 
-    update_transcript_UI(transcriber_mic, transcriber_thread_mic, transcriber_speaker, transcriber_thread_speaker, transcript_textbox)
-    update_response_UI(transcriber_mic, transcriber_speaker, responder, response_textbox, update_interval_slider_label, update_interval_slider)
+    update_transcript_UI(global_transcriber, transcript_textbox)
+    #update_response_UI(user_transcriber, transcriber_speaker, responder, response_textbox, update_interval_slider_label, update_interval_slider)
 
     root.mainloop()
 
-    transcriber_thread_mic.join()
-    transcriber_thread_speaker.join()