126 lines
3.1 KiB
Python
Raw Normal View History

2024-12-19 13:48:57 +01:00
import os
2023-12-14 16:54:39 +01:00
import tkinter as tk
2024-12-19 13:48:57 +01:00
2023-12-14 16:54:39 +01:00
import customtkinter as ctk
import torch
2024-12-19 13:48:57 +01:00
import torchaudio
import vlc
2023-12-14 16:54:39 +01:00
from tkVideoPlayer import TkinterVideo
2024-12-19 13:48:57 +01:00
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice
from transformers import AutoModelForCausalLM, AutoTokenizer
2023-12-14 16:54:39 +01:00
2024-12-19 13:48:57 +01:00
# Setup the app: one Tk root window, switched to customtkinter's dark mode.
app = tk.Tk()
app.title("Rap God v2.0")
app.geometry("600x550")
ctk.set_appearance_mode("dark")

# Two plain Tk frames stacked top to bottom: the first holds the prompt
# entry, the second holds the action buttons.
promptFrame = tk.Frame()
promptFrame.pack(padx=10, pady=10)
buttonFrame = tk.Frame()
buttonFrame.pack()

# Entry the user types the seed text into.
prompt = ctk.CTkEntry(
    master=promptFrame,
    width=300,
    height=40,
    fg_color="white",
    text_color="black",
)
prompt.pack(side="left", padx=10)

# Entry that displays the generated lyrics; it has no explicit parent
# (master is None), so it is packed below the two frames.
lyrics = ctk.CTkEntry(
    master=None,
    width=500,
    height=240,
    fg_color="white",
    text_color="black",
)
lyrics.pack()
2024-12-19 13:48:57 +01:00
def generateText():
    """Generate lyrics from the prompt entry and show them in the lyrics entry.

    The causal language model is loaded from ``"stormzy"`` and the tokenizer
    from ``"distilgpt2"`` on every call. The prompt text is encoded, continued
    with beam search (5 beams, ``max_length=200``, no repeated 2-grams), and
    the decoded result replaces the current contents of ``lyrics``.

    Does nothing when the prompt entry is empty.
    """
    seed_text = prompt.get()
    if not seed_text:
        # An empty prompt gives the model nothing to condition on, so skip
        # generation instead of calling generate() with no input tokens.
        return

    # Use the GPU when there is one, otherwise fall back to the CPU rather
    # than failing on machines without CUDA.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = AutoModelForCausalLM.from_pretrained("stormzy").to(device)
    tokenizer = AutoTokenizer.from_pretrained("distilgpt2", use_fast=True)

    tokens = tokenizer.encode(seed_text, return_tensors="pt").to(device)
    # Single un-padded sequence: every position is a real token.
    attn_mask = torch.ones_like(tokens)
    out = model.generate(
        tokens,
        attention_mask=attn_mask,
        num_beams=5,
        early_stopping=True,
        max_length=200,
        no_repeat_ngram_size=2,
    )

    # Drop special tokens so markers such as end-of-text do not end up in
    # the displayed lyrics (which are later fed to the speech synthesiser).
    rap = tokenizer.decode(out[0], skip_special_tokens=True)
    lyrics.delete(0, tk.END)
    lyrics.insert(0, rap)
2024-12-19 13:48:57 +01:00
def generateAudio():
    """Synthesise the current lyrics as speech and save it to generated.wav.

    Loads the "stormzy" voice (also searching the ``stormzy_samples``
    directory), runs Tortoise text-to-speech on the contents of the lyrics
    entry with the "ultra_fast" preset, and writes the result to
    ``generated.wav`` in the working directory.
    """
    samples, latents = load_voice("stormzy", extra_voice_dirs=["stormzy_samples"])
    synthesizer = TextToSpeech()

    text = lyrics.get()
    waveform = synthesizer.tts_with_preset(
        text,
        voice_samples=samples,
        conditioning_latents=latents,
        preset="ultra_fast",
    )

    # Drop the leading dimension and move the tensor to the CPU before
    # saving; 24000 is passed as the sample rate.
    audio = waveform.squeeze(0).cpu()
    torchaudio.save("generated.wav", audio, 24000)
2023-12-14 16:54:39 +01:00
2024-12-19 13:48:57 +01:00
def playAudio():
    """Play generated.wav from the working directory, if it exists.

    Does nothing when the file has not been generated yet.
    """
    audio_path = os.path.abspath("generated.wav")
    if not os.path.exists(audio_path):
        return

    # Hand VLC the very path that was just checked. A "file:///generated.wav"
    # URI denotes an absolute location, not the file in the working directory.
    player = vlc.MediaPlayer(audio_path)
    player.play()
2024-12-19 13:48:57 +01:00
# Video widget is created up front but only packed into the window once a
# video has been loaded by generateVideo().
videoplayer = TkinterVideo(master=app, scaled=True, keep_aspect=True)


def generateVideo():
    """Render a talking-head video for generated.wav and play it with the audio.

    Steps:
      1. Copy ``generated.wav`` into ``MakeItTalk/examples``.
      2. Run ``python generate.py`` with ``MakeItTalk`` as working directory.
      3. Start audio playback through VLC and show the resulting video in
         the ``videoplayer`` widget.

    Does nothing when ``generated.wav`` or the ``MakeItTalk/examples``
    directory is missing, and stops before playback when the generation
    script fails or the expected video file is not there.
    """
    import shutil
    import subprocess

    audio_file = "generated.wav"
    makeittalk_dir = "MakeItTalk"
    # Paths are built with os.path.join instead of backslash literals, which
    # contained invalid escape sequences such as "\e" and "\s".
    examples_dir = os.path.join(makeittalk_dir, "examples")
    video_file = os.path.join(
        examples_dir, "stormzy_pred_fls_generated_audio_embed.mp4"
    )

    if not os.path.exists(audio_file) or not os.path.isdir(examples_dir):
        return

    # Plain file copy and an argument-list subprocess replace the shell
    # strings previously passed to os.system.
    shutil.copy(audio_file, os.path.join(examples_dir, audio_file))
    result = subprocess.run(
        ["python", "generate.py"], cwd=makeittalk_dir, check=False
    )
    if result.returncode != 0 or not os.path.exists(video_file):
        # Generation failed; do not show a stale or missing video.
        return

    # Audio is played separately through VLC, started just before the video.
    player = vlc.MediaPlayer(os.path.abspath(audio_file))
    player.play()

    videoplayer.load(video_file)
    videoplayer.pack(fill="both", expand=True)
    videoplayer.play()
2024-12-19 13:48:57 +01:00
# Action buttons. All four use the same size and text colour; only the
# parent frame, label, callback and packing side differ.

# "Generate" sits to the right of the prompt entry.
genTextButton = ctk.CTkButton(
    master=promptFrame,
    text="Generate",
    command=generateText,
    width=120,
    height=40,
    text_color="black",
)
genTextButton.pack(side="right")

# The remaining three are laid out left to right in the button frame.
genAudioButton = ctk.CTkButton(
    master=buttonFrame,
    text="Syn Audio",
    command=generateAudio,
    width=120,
    height=40,
    text_color="black",
)
genAudioButton.pack(side="left", padx=10)

playAudioButton = ctk.CTkButton(
    master=buttonFrame,
    text="Play Rap",
    command=playAudio,
    width=120,
    height=40,
    text_color="black",
)
playAudioButton.pack(side="left", padx=10)

genVideoButton = ctk.CTkButton(
    master=buttonFrame,
    text="Syn Video",
    command=generateVideo,
    width=120,
    height=40,
    text_color="black",
)
genVideoButton.pack(side="left", padx=10)

# Run the app
app.mainloop()