Mirror of https://github.com/SevaSk/ecoute.git (synced 2025-01-31 08:25:21 +00:00)

working now

commit e66bd62740
parent e3f67d66a3
AudioRecorder.py

@@ -1,8 +1,8 @@
 import numpy as np
 import speech_recognition as sr
 import pyaudiowpatch as pyaudio
 from datetime import datetime

-RECORD_TIMEOUT = 2
+RECORD_TIMEOUT = 3
 ENERGY_THRESHOLD = 1000
 DYNAMIC_ENERGY_THRESHOLD = False

@@ -12,6 +12,7 @@ class DefaultMicRecorder:
         self.recorder.energy_threshold = ENERGY_THRESHOLD
         self.recorder.dynamic_energy_threshold = DYNAMIC_ENERGY_THRESHOLD
         self.source = sr.Microphone(sample_rate=16000)
+        self.num_channels = 1

         with self.source:
             self.recorder.adjust_for_ambient_noise(self.source)

@@ -19,7 +20,7 @@ class DefaultMicRecorder:
     def record_into_queue(self, audio_queue):
         def record_callback(_, audio:sr.AudioData) -> None:
             data = audio.get_raw_data()
-            audio_queue.put(("You", data, self.source.SAMPLE_RATE, self.source.SAMPLE_WIDTH, 1))
+            audio_queue.put(("You", data, datetime.utcnow()))

         self.recorder.listen_in_background(self.source, record_callback, phrase_time_limit=RECORD_TIMEOUT)

@@ -44,13 +45,14 @@ class DefaultSpeakerRecorder:
         self.source = sr.Microphone(sample_rate=int(self.default_speakers["defaultSampleRate"]),
                                     speaker=True,
                                     chunk_size= pyaudio.get_sample_size(pyaudio.paInt16))
+        self.num_channels = self.default_speakers["maxInputChannels"]

         with self.source:
             self.recorder.adjust_for_ambient_noise(self.source)

     def record_into_queue(self, audio_queue):
         def record_callback(_, audio:sr.AudioData) -> None:
             data = audio.get_raw_data()
-            audio_queue.put(("Speaker", data, self.source.SAMPLE_RATE,
-                             self.source.SAMPLE_WIDTH,
-                             self.default_speakers["maxInputChannels"]))
+            audio_queue.put(("Speaker", data, datetime.utcnow()))

         self.recorder.listen_in_background(self.source, record_callback, phrase_time_limit=RECORD_TIMEOUT)
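Both recorder callbacks now push the same three-field payload: the source label, the raw audio bytes, and the capture time. A minimal standalone sketch of that queue contract (the queue object and the byte values here are illustrative, only the tuple layout comes from the diff):

import queue
from datetime import datetime, timedelta

audio_queue = queue.Queue()

# What a recorder callback enqueues for each captured chunk.
audio_queue.put(("You", b"\x00\x01\x02\x03", datetime.utcnow()))

# What a consumer reads back out. Per-source sample rate, width, and channel
# count now come from the recorder objects instead of riding along on every item.
who_spoke, data, time_spoken = audio_queue.get()
phrase_expired = datetime.utcnow() - time_spoken > timedelta(seconds=4)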
AudioTranscriber.py

@@ -1,4 +1,3 @@
-import numpy as np
 import whisper
 import torch
 import wave
@@ -7,91 +6,106 @@ import threading
 from tempfile import NamedTemporaryFile
 import speech_recognition as sr
 import io
-from datetime import datetime, timedelta
+from datetime import timedelta
 from time import sleep
 import pyaudiowpatch as pyaudio
+from AudioRecorder import DefaultMicRecorder, DefaultSpeakerRecorder
+from heapq import merge

-PHRASE_TIMEOUT = 3
+PHRASE_TIMEOUT = 4

 class AudioTranscriber:
-    def __init__(self):
-        self.transcript_data = [""]
+    def __init__(self, default_mic : DefaultMicRecorder, default_speaker : DefaultSpeakerRecorder):
+        self.mic_transcript_data = []
+        self.speaker_transcript_data = []
         self.transcript_changed_event = threading.Event()
         self.audio_model = whisper.load_model(os.getcwd() + r'\tiny.en' + '.pt')

-    def create_transcription_from_queue(self, audio_queue):
-        phrase_time = None
-        last_sample = bytes()
+        self.mic_sample_rate = default_mic.source.SAMPLE_RATE
+        self.mic_sample_width = default_mic.source.SAMPLE_WIDTH
+        self.mic_channels = default_mic.num_channels

-        who_spoke_changed = False
-        who_spoke_prev = "You"
-        sample_prev = bytes()
-        sample_rate_prev = 16000
-        sample_width_prev = 2
-        channels_prev = 1
+        self.speaker_sample_rate = default_speaker.source.SAMPLE_RATE
+        self.speaker_sample_rate = default_speaker.source.SAMPLE_RATE
+        self.speaker_channels = default_speaker.num_channels

+    def create_transcription_from_queue(self, audio_queue):
+        mic_last_sample = bytes()
+        speaker_last_sample = bytes()

+        mic_last_spoken = None
+        speaker_last_spoken = None

+        mic_start_new_phrase = True
+        speaker_start_new_phrase = True

         while True:
-            now = datetime.utcnow()
+            top_of_queue = audio_queue.get()
+            who_spoke = top_of_queue[0]
+            data = top_of_queue[1]
+            time_spoken = top_of_queue[2]

-            if not audio_queue.empty():
-                phrase_complete = False
-                if phrase_time and now - phrase_time > timedelta(seconds=PHRASE_TIMEOUT) or who_spoke_changed:
-                    if who_spoke_changed:
-                        who_spoke_changed = False
-                        last_sample = sample_prev
-                        who_spoke = who_spoke_prev
-                        sample_rate = sample_rate_prev
-                        sample_width = sample_width_prev
-                        channels = channels_prev
-                    else:
-                        last_sample = bytes()

-                    phrase_complete = True
-                    phrase_time = now

-                while not audio_queue.empty() and not who_spoke_changed:
-                    top_of_queue = audio_queue.get()
-                    who_spoke = top_of_queue[0]
-                    data = top_of_queue[1]
-                    sample_rate = top_of_queue[2]
-                    sample_width = top_of_queue[3]
-                    channels = top_of_queue[4]

-                    who_spoke_changed = who_spoke != who_spoke_prev
-                    if who_spoke_changed:
-                        sample_prev = data
-                        who_spoke_prev = who_spoke
-                        sample_rate_prev = sample_rate
-                        sample_width_prev = sample_width
-                        channels_prev = channels
-                        break
-                    else:
-                        last_sample += data

-                temp_file = NamedTemporaryFile().name

-                if who_spoke == "You":
-                    audio_data = sr.AudioData(last_sample, sample_rate, sample_width)
-                    wav_data = io.BytesIO(audio_data.get_wav_data())
-                    with open(temp_file, 'w+b') as f:
-                        f.write(wav_data.read())
+            if who_spoke == "You":
+                if mic_last_spoken and time_spoken - mic_last_spoken > timedelta(seconds=PHRASE_TIMEOUT):
+                    mic_last_sample = bytes()
+                    mic_start_new_phrase = True
                 else:
-                    with wave.open(temp_file, 'wb') as wf:
-                        wf.setnchannels(channels)
-                        p = pyaudio.PyAudio()
-                        wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
-                        wf.setframerate(sample_rate)
-                        wf.writeframes(last_sample)
+                    mic_start_new_phrase = False

-                result = self.audio_model.transcribe(temp_file, fp16=torch.cuda.is_available())
+                mic_last_sample += data
+                mic_last_spoken = time_spoken

+                mic_temp_file = NamedTemporaryFile().name
+                audio_data = sr.AudioData(mic_last_sample, self.mic_sample_rate, self.mic_sample_width)
+                wav_data = io.BytesIO(audio_data.get_wav_data())
+                with open(mic_temp_file, 'w+b') as f:
+                    f.write(wav_data.read())

+                result = self.audio_model.transcribe(mic_temp_file, fp16=torch.cuda.is_available())
                 text = result['text'].strip()

-                if phrase_complete:
-                    self.transcript_data = [who_spoke + ": [" + text + ']\n\n'] + self.transcript_data
+                if text != '' and text.lower() != 'you':
+                    if mic_start_new_phrase or len(self.mic_transcript_data) == 0:
+                        self.mic_transcript_data = [(who_spoke + ": [" + text + ']\n\n', time_spoken)] + self.mic_transcript_data
+                        self.transcript_changed_event.set()
+                    else:
+                        self.mic_transcript_data[0] = (who_spoke + ": [" + text + ']\n\n',
+                                                       time_spoken)
+                        self.transcript_changed_event.set()
+            else:
+                if speaker_last_spoken and time_spoken - speaker_last_spoken > timedelta(seconds=PHRASE_TIMEOUT):
+                    speaker_last_sample = bytes()
+                    speaker_start_new_phrase = True
                 else:
-                    self.transcript_data[0] = who_spoke + ": [" + text + ']\n\n'
-            sleep(0.25)
+                    speaker_start_new_phrase = False

+                speaker_last_sample += data
+                speaker_last_spoken = time_spoken

+                speaker_temp_file = NamedTemporaryFile().name

+                with wave.open(speaker_temp_file, 'wb') as wf:
+                    wf.setnchannels(self.speaker_channels)
+                    p = pyaudio.PyAudio()
+                    wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
+                    wf.setframerate(self.speaker_sample_rate)
+                    wf.writeframes(speaker_last_sample)

+                result = self.audio_model.transcribe(speaker_temp_file, fp16=torch.cuda.is_available())
+                text = result['text'].strip()

+                if text != '' and text.lower() != 'you':
+                    if speaker_start_new_phrase or len(self.speaker_transcript_data) == 0:
+                        self.speaker_transcript_data = [(who_spoke + ": [" + text + ']\n\n', time_spoken)] + self.speaker_transcript_data
+                        self.transcript_changed_event.set()

+                    else:
+                        self.speaker_transcript_data[0] = (who_spoke + ": [" + text + ']\n\n',
+                                                           time_spoken)
+                        self.transcript_changed_event.set()

     def get_transcript(self):
-        return "".join(self.transcript_data)
+        key = lambda x : x[1]
+        transcript_tuple = list(merge(self.mic_transcript_data, self.speaker_transcript_data, key=key, reverse=True))
+        return "".join([t[0] for t in transcript_tuple])
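The new get_transcript interleaves the two per-source transcript lists by the timestamp stored with each phrase. A small self-contained sketch of that merge, using made-up sample data (only the (text, datetime) tuple shape comes from the diff):

from datetime import datetime
from heapq import merge

mic_transcript_data = [("You: [second phrase]\n\n", datetime(2023, 5, 20, 12, 0, 10)),
                       ("You: [first phrase]\n\n", datetime(2023, 5, 20, 12, 0, 2))]
speaker_transcript_data = [("Speaker: [reply]\n\n", datetime(2023, 5, 20, 12, 0, 6))]

# Both lists are kept newest-first, so merging with reverse=True preserves that order.
key = lambda x: x[1]
combined = list(merge(mic_transcript_data, speaker_transcript_data, key=key, reverse=True))
print("".join(t[0] for t in combined))  # newest phrase first, sources interleaved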
main.py

@@ -79,13 +79,15 @@ if __name__ == "__main__":
     speaker_audio_recorder = AudioRecorder.DefaultSpeakerRecorder()
     speaker_audio_recorder.record_into_queue(audio_queue)

-    global_transcriber = AudioTranscriber()
+    global_transcriber = AudioTranscriber(user_audio_recorder, speaker_audio_recorder)
     transcribe = threading.Thread(target=global_transcriber.create_transcription_from_queue, args=(audio_queue,))
     transcribe.start()

-    #responder = GPTResponder()
-    #respond = threading.Thread(target=responder.respond_to_transcriber, args=(global_transcriber,))
-    #respond.start()
+    responder = GPTResponder()
+    respond = threading.Thread(target=responder.respond_to_transcriber, args=(global_transcriber,))
+    respond.start()

     print("READY")

     root.grid_rowconfigure(0, weight=100)
     root.grid_rowconfigure(1, weight=10)

@@ -100,6 +102,6 @@ if __name__ == "__main__":
     clear_transcript_button.grid(row=1, column=0, padx=10, pady=3, sticky="nsew")

     update_transcript_UI(global_transcriber, transcript_textbox)
-    #update_response_UI(responder, response_textbox, update_interval_slider_label, update_interval_slider)
+    update_response_UI(responder, response_textbox, update_interval_slider_label, update_interval_slider)

     root.mainloop()
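main.py now starts the GPTResponder thread for real. The diff does not show respond_to_transcriber itself, so the following is only an assumed consumer pattern for the transcript_changed_event that AudioTranscriber sets after each new phrase (the function name and stop_event are hypothetical):

import threading

def wait_for_transcript_updates(transcriber, stop_event: threading.Event):
    # Hypothetical loop: block until the transcriber flags a change, then read
    # the merged transcript; this is not the repo's actual responder body.
    while not stop_event.is_set():
        transcriber.transcript_changed_event.wait(timeout=1.0)
        if transcriber.transcript_changed_event.is_set():
            transcriber.transcript_changed_event.clear()
            print(transcriber.get_transcript())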