2024-12-19 13:48:57 +01:00

126 lines
3.1 KiB
Python

import os
import shutil
import subprocess
import tkinter as tk
from pathlib import Path

import customtkinter as ctk
import torch
import torchaudio
import vlc
from tkVideoPlayer import TkinterVideo
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice
from transformers import AutoModelForCausalLM, AutoTokenizer
# Set up the main window
app = tk.Tk()
app.geometry("600x550")
app.title("Rap God v2.0")
ctk.set_appearance_mode("dark")

# Row that holds the prompt entry
promptFrame = tk.Frame()
promptFrame.pack(padx=10, pady=10)

# Row that holds the action buttons
buttonFrame = tk.Frame()
buttonFrame.pack()

# Entry the user types the seed text into
prompt = ctk.CTkEntry(
    master=promptFrame,
    width=300,
    height=40,
    fg_color="white",
    text_color="black",
)
prompt.pack(side="left", padx=10)

# Entry that shows the generated lyrics; no explicit parent is given
lyrics = ctk.CTkEntry(
    master=None,
    width=500,
    height=240,
    fg_color="white",
    text_color="black",
)
lyrics.pack()
def generateText():
    """Generate rap lyrics from the prompt and show them in the lyrics entry.

    Reads the seed text from the ``prompt`` entry, continues it with the
    "stormzy" causal language model using beam search, and replaces the
    content of the ``lyrics`` entry with the decoded text. Does nothing when
    the prompt is empty or contains only whitespace.
    """
    seed_text = prompt.get()
    if not seed_text.strip():
        # Presumably an empty prompt encodes to zero tokens, which leaves
        # generate() nothing to continue from; skip instead of erroring.
        return

    # Use the GPU when one is available; otherwise fall back to the CPU
    # instead of failing on an unconditional .to("cuda").
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # NOTE(review): the model and tokenizer are reloaded on every call;
    # consider caching them if generation latency matters.
    model = AutoModelForCausalLM.from_pretrained("stormzy").to(device)
    tokenizer = AutoTokenizer.from_pretrained("distilgpt2", use_fast=True)

    tokens = tokenizer.encode(seed_text, return_tensors="pt").to(device)
    attn_mask = torch.ones_like(tokens)
    out = model.generate(
        tokens,
        attention_mask=attn_mask,
        num_beams=5,
        early_stopping=True,
        max_length=200,
        no_repeat_ngram_size=2,
    )

    # Drop special tokens (e.g. the end-of-text marker) so they are neither
    # displayed nor passed on to speech synthesis.
    rap = tokenizer.decode(out[0], skip_special_tokens=True)

    lyrics.delete(0, tk.END)
    lyrics.insert(0, rap)
def generateAudio():
    """Synthesise the current lyrics as speech and save them to ``generated.wav``.

    Loads the "stormzy" voice (searching the extra ``stormzy_samples``
    directory), runs text-to-speech on the content of the ``lyrics`` entry
    with the "ultra_fast" preset, and writes the result with a sample rate
    of 24000.
    """
    samples, latents = load_voice("stormzy", extra_voice_dirs=["stormzy_samples"])

    synthesizer = TextToSpeech()
    text = lyrics.get()
    waveform = synthesizer.tts_with_preset(
        text,
        voice_samples=samples,
        conditioning_latents=latents,
        preset="ultra_fast",
    )

    # Remove the leading dimension and move the data to the CPU before saving.
    audio = waveform.squeeze(0).cpu()
    torchaudio.save("generated.wav", audio, 24000)
def playAudio():
    """Play ``generated.wav`` through VLC if it exists.

    The file is looked up relative to the current working directory. When it
    is missing the function returns without doing anything.
    """
    audio_file = Path("generated.wav")
    if not audio_file.exists():
        return

    # Build the MRL from the resolved path. The literal "file:///generated.wav"
    # names a file at the filesystem root, which is not the file checked above.
    player = vlc.MediaPlayer(audio_file.resolve().as_uri())
    player.play()
# Video widget for the generated clip; it is packed later, inside generateVideo()
videoplayer = TkinterVideo(
    master=app,
    keep_aspect=True,
    scaled=True,
)
def generateVideo():
    """Generate the talking-head video for ``generated.wav`` and play it.

    Copies ``generated.wav`` into ``MakeItTalk/examples`` (when the file
    exists), runs ``generate.py`` from inside the ``MakeItTalk`` directory,
    starts audio playback through VLC (when the file exists), and then loads
    and plays the resulting video in the ``videoplayer`` widget.

    Raises:
        OSError: if the audio file cannot be copied or the ``python``
            executable cannot be started.
    """
    audio_file = Path("generated.wav")
    talk_dir = Path("MakeItTalk")
    examples_dir = talk_dir / "examples"
    video_file = examples_dir / "stormzy_pred_fls_generated_audio_embed.mp4"

    # Portable replacement for the Windows-only "xcopy /y" shell command;
    # building paths with pathlib also avoids invalid "\M" / "\e" escapes.
    if audio_file.exists():
        shutil.copy(audio_file, examples_dir / audio_file.name)

    # Run the generator with MakeItTalk as its working directory, without
    # going through a shell. A non-zero exit status is ignored, as before.
    subprocess.run(["python", "generate.py"], cwd=talk_dir, check=False)

    if audio_file.exists():
        # Resolve to an absolute file URI: "file:///generated.wav" would
        # name a file at the filesystem root, not the one checked here.
        player = vlc.MediaPlayer(audio_file.resolve().as_uri())
        player.play()

    videoplayer.load(str(video_file))
    videoplayer.pack(fill="both", expand=True)
    videoplayer.play()
def _make_button(parent, label, handler):
    """Return a 120x40 CTkButton with black text, captioned *label*, that calls *handler*."""
    return ctk.CTkButton(
        parent,
        width=120,
        height=40,
        text_color="black",
        text=label,
        command=handler,
    )


# "Generate" sits in the prompt row, on the right-hand side
genTextButton = _make_button(promptFrame, "Generate", generateText)
genTextButton.pack(side="right")

# The remaining actions are lined up left to right in the button row
genAudioButton = _make_button(buttonFrame, "Syn Audio", generateAudio)
genAudioButton.pack(side="left", padx=10)

playAudioButton = _make_button(buttonFrame, "Play Rap", playAudio)
playAudioButton.pack(side="left", padx=10)

genVideoButton = _make_button(buttonFrame, "Syn Video", generateVideo)
genVideoButton.pack(side="left", padx=10)

# Start the Tk event loop
app.mainloop()