import argparse
import os
import queue
import threading
import time
import wave

import numpy as np
import soundcard as sc
import torch
import whisper

from Microphone import Microphone

TRANSCRIPT_LIMIT = 10  # not yet enforced below
RECORDING_TIME = 5  # seconds of audio per chunk
SAMPLE_RATE = 16000  # Whisper expects 16 kHz input


class AudioTranscriber:
    def __init__(self, lang: str, microphone: Microphone):
        self.audio_np_array_queue = queue.Queue()
        self.transcript_data = []
        self.microphone = microphone
        self.lang = lang
        self.lock = threading.Lock()

        parser = argparse.ArgumentParser()
        parser.add_argument("--model", default="tiny", help="Model to use",
                            choices=["tiny", "base", "small", "medium", "large"])
        parser.add_argument("--non_english", action='store_true',
                            help="Don't use the English-only model.")
        parser.add_argument("--energy_threshold", default=1000,
                            help="Energy level for mic to detect.", type=int)
        parser.add_argument("--record_timeout", default=2,
                            help="How real-time the recording is, in seconds.", type=float)
        parser.add_argument("--phrase_timeout", default=3,
                            help="How much empty space between recordings before we "
                                 "consider it a new line in the transcription.", type=float)
        args = parser.parse_args()

        # Load / download model. English-only checkpoints carry a ".en" suffix;
        # "large" has no English-only variant.
        model = args.model
        if args.model != "large" and not args.non_english:
            model = model + ".en"
        # Load the checkpoint for the selected model from the working directory,
        # building the path portably instead of with a Windows-only backslash.
        self.audio_model = whisper.load_model(os.path.join(os.getcwd(), f'{model}.pt'))

    def get_transcript(self):
        # Return a snapshot so callers don't race with the transcriber thread.
        with self.lock:
            return list(self.transcript_data)

    def record_into_queue(self):
        with sc.get_microphone(id=self.microphone.id,
                               include_loopback=self.microphone.loop_back
                               ).recorder(samplerate=SAMPLE_RATE) as mic:
            while True:
                # data is a (frames x channels) NumPy float array in [-1.0, 1.0].
                data = mic.record(numframes=SAMPLE_RATE * RECORDING_TIME)
                self.audio_np_array_queue.put(data)

    def transcribe_from_queue(self):
        temp_path = f'temp_{self.microphone.id}.wav'
        while True:
            audio_data = self.audio_np_array_queue.get()
            with wave.open(temp_path, 'wb') as wav_file:
                wav_file.setnchannels(audio_data.shape[1])
                wav_file.setsampwidth(2)  # 16-bit PCM
                wav_file.setframerate(SAMPLE_RATE)
                # Scale float samples in [-1.0, 1.0] to signed 16-bit integers.
                pcm = (audio_data * (2 ** 15 - 1)).astype(np.int16)
                wav_file.writeframes(pcm.tobytes())
            # English-only (.en) models ignore the language option and force English.
            result = self.audio_model.transcribe(temp_path,
                                                 fp16=torch.cuda.is_available(),
                                                 language=self.lang)
            text = result['text'].strip()
            # Whisper often emits "you" on near-silent input; skip such results.
            if text != '' and text.lower() != 'you':
                timestamp = int(time.time())
                # Hold the lock only while mutating the shared transcript list;
                # get_transcript reads under the same lock.
                with self.lock:
                    self.transcript_data.append({'utterance': text, 'timestamp': timestamp})
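
# --- Minimal usage sketch (assumptions, not part of the original module) ---
# Microphone's constructor is not shown in this file; the keyword arguments
# below are hypothetical and assume it simply stores the .id and .loop_back
# attributes read above. sc.default_microphone() is an existing soundcard
# call that returns the system's default input device.
if __name__ == '__main__':
    default_mic = sc.default_microphone()
    mic = Microphone(id=default_mic.id, loop_back=False)  # hypothetical signature

    transcriber = AudioTranscriber(lang='en', microphone=mic)

    # Producer thread fills the queue with fixed-length chunks; consumer
    # thread drains it and appends transcribed utterances.
    threading.Thread(target=transcriber.record_into_queue, daemon=True).start()
    threading.Thread(target=transcriber.transcribe_from_queue, daemon=True).start()

    try:
        while True:
            time.sleep(RECORDING_TIME)
            # Print the transcript snapshot accumulated so far.
            for entry in transcriber.get_transcript():
                print(f"[{entry['timestamp']}] {entry['utterance']}")
    except KeyboardInterrupt:
        pass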