working now

SevaSk 2023-05-11 22:06:14 -04:00
parent e3f67d66a3
commit e66bd62740
3 changed files with 99 additions and 81 deletions

AudioRecorder.py

@@ -1,8 +1,8 @@
-import numpy as np
 import speech_recognition as sr
 import pyaudiowpatch as pyaudio
+from datetime import datetime
 
-RECORD_TIMEOUT = 2
+RECORD_TIMEOUT = 3
 ENERGY_THRESHOLD = 1000
 DYNAMIC_ENERGY_THRESHOLD = False
@@ -12,6 +12,7 @@ class DefaultMicRecorder:
         self.recorder.energy_threshold = ENERGY_THRESHOLD
         self.recorder.dynamic_energy_threshold = DYNAMIC_ENERGY_THRESHOLD
         self.source = sr.Microphone(sample_rate=16000)
+        self.num_channels = 1
 
         with self.source:
             self.recorder.adjust_for_ambient_noise(self.source)
@@ -19,7 +20,7 @@ class DefaultMicRecorder:
     def record_into_queue(self, audio_queue):
         def record_callback(_, audio:sr.AudioData) -> None:
             data = audio.get_raw_data()
-            audio_queue.put(("You", data, self.source.SAMPLE_RATE, self.source.SAMPLE_WIDTH, 1))
+            audio_queue.put(("You", data, datetime.utcnow()))
 
         self.recorder.listen_in_background(self.source, record_callback, phrase_time_limit=RECORD_TIMEOUT)
@@ -44,13 +45,14 @@ class DefaultSpeakerRecorder:
         self.source = sr.Microphone(sample_rate=int(self.default_speakers["defaultSampleRate"]),
                                     speaker=True,
                                     chunk_size= pyaudio.get_sample_size(pyaudio.paInt16))
+        self.num_channels = self.default_speakers["maxInputChannels"]
 
         with self.source:
             self.recorder.adjust_for_ambient_noise(self.source)
 
     def record_into_queue(self, audio_queue):
         def record_callback(_, audio:sr.AudioData) -> None:
             data = audio.get_raw_data()
-            audio_queue.put(("Speaker", data, self.source.SAMPLE_RATE,
-                             self.source.SAMPLE_WIDTH,
-                             self.default_speakers["maxInputChannels"]))
+            audio_queue.put(("Speaker", data, datetime.utcnow()))
 
         self.recorder.listen_in_background(self.source, record_callback, phrase_time_limit=RECORD_TIMEOUT)
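Both recorders now enqueue the same 3-tuple (who_spoke, raw_bytes, capture_time) and expose num_channels, alongside source.SAMPLE_RATE and source.SAMPLE_WIDTH, as attributes rather than packing audio-format fields into every queue item. A minimal sketch of the new queue contract; the consumer side is hypothetical, not part of this commit:

    import queue
    from datetime import datetime

    audio_queue = queue.Queue()

    # What a recorder callback now enqueues: speaker label, raw PCM bytes, capture time.
    audio_queue.put(("You", b"\x00\x01", datetime.utcnow()))

    # What a consumer unpacks; sample rate, width, and channel count now come
    # from the recorder's attributes instead of riding along in the tuple.
    who_spoke, data, time_spoken = audio_queue.get()
    print(who_spoke, len(data), time_spoken)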

AudioTranscriber.py

@@ -1,4 +1,3 @@
-import numpy as np
 import whisper
 import torch
 import wave
@@ -7,91 +6,106 @@ import threading
 from tempfile import NamedTemporaryFile
 import speech_recognition as sr
 import io
-from datetime import datetime, timedelta
+from datetime import timedelta
 from time import sleep
 import pyaudiowpatch as pyaudio
+from AudioRecorder import DefaultMicRecorder, DefaultSpeakerRecorder
+from heapq import merge
 
-PHRASE_TIMEOUT = 3
+PHRASE_TIMEOUT = 4
 
 class AudioTranscriber:
-    def __init__(self):
-        self.transcript_data = [""]
+    def __init__(self, default_mic : DefaultMicRecorder, default_speaker : DefaultSpeakerRecorder):
+        self.mic_transcript_data = []
+        self.speaker_transcript_data = []
         self.transcript_changed_event = threading.Event()
         self.audio_model = whisper.load_model(os.getcwd() + r'\tiny.en' + '.pt')
-
-    def create_transcription_from_queue(self, audio_queue):
-        phrase_time = None
-        last_sample = bytes()
-        who_spoke_changed = False
-        who_spoke_prev = "You"
-        sample_prev = bytes()
-        sample_rate_prev = 16000
-        sample_width_prev = 2
-        channels_prev = 1
+        self.mic_sample_rate = default_mic.source.SAMPLE_RATE
+        self.mic_sample_width = default_mic.source.SAMPLE_WIDTH
+        self.mic_channels = default_mic.num_channels
+        self.speaker_sample_rate = default_speaker.source.SAMPLE_RATE
+        self.speaker_channels = default_speaker.num_channels
+
+    def create_transcription_from_queue(self, audio_queue):
+        mic_last_sample = bytes()
+        speaker_last_sample = bytes()
+        mic_last_spoken = None
+        speaker_last_spoken = None
+        mic_start_new_phrase = True
+        speaker_start_new_phrase = True
         while True:
-            now = datetime.utcnow()
-            if not audio_queue.empty():
-                phrase_complete = False
-                if phrase_time and now - phrase_time > timedelta(seconds=PHRASE_TIMEOUT) or who_spoke_changed:
-                    if who_spoke_changed:
-                        who_spoke_changed = False
-                        last_sample = sample_prev
-                        who_spoke = who_spoke_prev
-                        sample_rate = sample_rate_prev
-                        sample_width = sample_width_prev
-                        channels = channels_prev
-                    else:
-                        last_sample = bytes()
-                    phrase_complete = True
-                phrase_time = now
-                while not audio_queue.empty() and not who_spoke_changed:
-                    top_of_queue = audio_queue.get()
-                    who_spoke = top_of_queue[0]
-                    data = top_of_queue[1]
-                    sample_rate = top_of_queue[2]
-                    sample_width = top_of_queue[3]
-                    channels = top_of_queue[4]
-                    who_spoke_changed = who_spoke != who_spoke_prev
-                    if who_spoke_changed:
-                        sample_prev = data
-                        who_spoke_prev = who_spoke
-                        sample_rate_prev = sample_rate
-                        sample_width_prev = sample_width
-                        channels_prev = channels
-                        break
-                    else:
-                        last_sample += data
-                temp_file = NamedTemporaryFile().name
-                if who_spoke == "You":
-                    audio_data = sr.AudioData(last_sample, sample_rate, sample_width)
-                    wav_data = io.BytesIO(audio_data.get_wav_data())
-                    with open(temp_file, 'w+b') as f:
-                        f.write(wav_data.read())
+            top_of_queue = audio_queue.get()
+            who_spoke = top_of_queue[0]
+            data = top_of_queue[1]
+            time_spoken = top_of_queue[2]
+            if who_spoke == "You":
+                if mic_last_spoken and time_spoken - mic_last_spoken > timedelta(seconds=PHRASE_TIMEOUT):
+                    mic_last_sample = bytes()
+                    mic_start_new_phrase = True
                 else:
-                    with wave.open(temp_file, 'wb') as wf:
-                        wf.setnchannels(channels)
-                        p = pyaudio.PyAudio()
-                        wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
-                        wf.setframerate(sample_rate)
-                        wf.writeframes(last_sample)
-                result = self.audio_model.transcribe(temp_file, fp16=torch.cuda.is_available())
+                    mic_start_new_phrase = False
+                mic_last_sample += data
+                mic_last_spoken = time_spoken
+                mic_temp_file = NamedTemporaryFile().name
+                audio_data = sr.AudioData(mic_last_sample, self.mic_sample_rate, self.mic_sample_width)
+                wav_data = io.BytesIO(audio_data.get_wav_data())
+                with open(mic_temp_file, 'w+b') as f:
+                    f.write(wav_data.read())
+                result = self.audio_model.transcribe(mic_temp_file, fp16=torch.cuda.is_available())
                 text = result['text'].strip()
-                if phrase_complete:
-                    self.transcript_data = [who_spoke + ": [" + text + ']\n\n'] + self.transcript_data
-                    self.transcript_changed_event.set()
+                if text != '' and text.lower() != 'you':
+                    if mic_start_new_phrase or len(self.mic_transcript_data) == 0:
+                        self.mic_transcript_data = [(who_spoke + ": [" + text + ']\n\n', time_spoken)] + self.mic_transcript_data
+                        self.transcript_changed_event.set()
+                    else:
+                        self.mic_transcript_data[0] = (who_spoke + ": [" + text + ']\n\n',
+                                                       time_spoken)
+                        self.transcript_changed_event.set()
+            else:
+                if speaker_last_spoken and time_spoken - speaker_last_spoken > timedelta(seconds=PHRASE_TIMEOUT):
+                    speaker_last_sample = bytes()
+                    speaker_start_new_phrase = True
                 else:
-                    self.transcript_data[0] = who_spoke + ": [" + text + ']\n\n'
-                    self.transcript_changed_event.set()
-            sleep(0.25)
+                    speaker_start_new_phrase = False
+                speaker_last_sample += data
+                speaker_last_spoken = time_spoken
+                speaker_temp_file = NamedTemporaryFile().name
+                with wave.open(speaker_temp_file, 'wb') as wf:
+                    wf.setnchannels(self.speaker_channels)
+                    p = pyaudio.PyAudio()
+                    wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
+                    wf.setframerate(self.speaker_sample_rate)
+                    wf.writeframes(speaker_last_sample)
+                result = self.audio_model.transcribe(speaker_temp_file, fp16=torch.cuda.is_available())
+                text = result['text'].strip()
+                if text != '' and text.lower() != 'you':
+                    if speaker_start_new_phrase or len(self.speaker_transcript_data) == 0:
+                        self.speaker_transcript_data = [(who_spoke + ": [" + text + ']\n\n', time_spoken)] + self.speaker_transcript_data
+                        self.transcript_changed_event.set()
+                    else:
+                        self.speaker_transcript_data[0] = (who_spoke + ": [" + text + ']\n\n',
+                                                           time_spoken)
+                        self.transcript_changed_event.set()
 
     def get_transcript(self):
-        return "".join(self.transcript_data)
+        key = lambda x : x[1]
+        transcript_tuple = list(merge(self.mic_transcript_data, self.speaker_transcript_data, key=key, reverse=True))
+        return "".join([t[0] for t in transcript_tuple])
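get_transcript now interleaves the two per-source lists with heapq.merge. Both lists are kept newest-first (new phrases are prepended), and merge with reverse=True expects exactly that: inputs already sorted in descending order. A standalone sketch with made-up entries:

    from datetime import datetime, timedelta
    from heapq import merge

    now = datetime.utcnow()
    # Newest-first, the order the transcriber maintains.
    mic = [("You: [hello]\n\n", now),
           ("You: [hi]\n\n", now - timedelta(seconds=10))]
    speaker = [("Speaker: [hey]\n\n", now - timedelta(seconds=5))]

    # key selects the timestamp; reverse=True merges two descending lists.
    merged = list(merge(mic, speaker, key=lambda entry: entry[1], reverse=True))
    print("".join(entry[0] for entry in merged))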

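The per-source phrase logic in the same file replaces the old who_spoke_changed bookkeeping: each source keeps its own byte buffer and last-spoken timestamp, and a gap longer than PHRASE_TIMEOUT clears the buffer and starts a new transcript entry. A self-contained sketch of just that rule, with the audio handling elided and names mirroring the mic path:

    from datetime import datetime, timedelta

    PHRASE_TIMEOUT = 4  # seconds, as in the new AudioTranscriber

    mic_last_sample = bytes()
    mic_last_spoken = None

    def on_chunk(data, time_spoken):
        # Fold one chunk into the phrase buffer; return True when a new phrase starts.
        global mic_last_sample, mic_last_spoken
        start_new_phrase = (mic_last_spoken is not None and
                            time_spoken - mic_last_spoken > timedelta(seconds=PHRASE_TIMEOUT))
        if start_new_phrase:
            mic_last_sample = bytes()  # drop the finished phrase's audio
        mic_last_sample += data        # accumulate the current phrase
        mic_last_spoken = time_spoken
        return start_new_phrase

    now = datetime.utcnow()
    assert on_chunk(b"a", now) is False                         # first chunk
    assert on_chunk(b"b", now + timedelta(seconds=1)) is False  # within the gap
    assert on_chunk(b"c", now + timedelta(seconds=10)) is True  # gap exceeded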
main.py

@@ -79,13 +79,15 @@ if __name__ == "__main__":
     speaker_audio_recorder = AudioRecorder.DefaultSpeakerRecorder()
     speaker_audio_recorder.record_into_queue(audio_queue)
 
-    global_transcriber = AudioTranscriber()
+    global_transcriber = AudioTranscriber(user_audio_recorder, speaker_audio_recorder)
     transcribe = threading.Thread(target=global_transcriber.create_transcription_from_queue, args=(audio_queue,))
     transcribe.start()
 
-    #responder = GPTResponder()
-    #respond = threading.Thread(target=responder.respond_to_transcriber, args=(global_transcriber,))
-    #respond.start()
+    responder = GPTResponder()
+    respond = threading.Thread(target=responder.respond_to_transcriber, args=(global_transcriber,))
+    respond.start()
 
     print("READY")
 
     root.grid_rowconfigure(0, weight=100)
     root.grid_rowconfigure(1, weight=10)
@@ -100,6 +102,6 @@ if __name__ == "__main__":
     clear_transcript_button.grid(row=1, column=0, padx=10, pady=3, sticky="nsew")
 
     update_transcript_UI(global_transcriber, transcript_textbox)
-    #update_response_UI(responder, response_textbox, update_interval_slider_label, update_interval_slider)
+    update_response_UI(responder, response_textbox, update_interval_slider_label, update_interval_slider)
 
     root.mainloop()