diff --git a/examples/common.cpp b/examples/common.cpp index 99efd18d..9ab162a5 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -863,3 +863,21 @@ bool is_file_exist(const char *fileName) std::ifstream infile(fileName); return infile.good(); } + +bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id) +{ + std::ofstream speak_file(path.c_str()); + if (speak_file.fail()) { + fprintf(stderr, "%s: failed to open speak_file\n", __func__); + return false; + } else { + speak_file.write(text.c_str(), text.size()); + speak_file.close(); + int ret = system((command + " " + std::to_string(voice_id) + " " + path).c_str()); + if (ret != 0) { + fprintf(stderr, "%s: failed to speak\n", __func__); + return false; + } + } + return true; +} diff --git a/examples/common.h b/examples/common.h index 0529a057..2ed91ca9 100644 --- a/examples/common.h +++ b/examples/common.h @@ -306,3 +306,6 @@ int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate); // check if file exists using ifstream bool is_file_exist(const char *fileName); + +// write text to file, and call system("command voice_id file") +bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id); diff --git a/examples/talk-llama/.gitignore b/examples/talk-llama/.gitignore index cbf36313..9c08e1f4 100644 --- a/examples/talk-llama/.gitignore +++ b/examples/talk-llama/.gitignore @@ -1 +1,2 @@ audio.mp3 +to_speak.txt diff --git a/examples/talk-llama/eleven-labs.py b/examples/talk-llama/eleven-labs.py index edcd023b..7ed1d5dc 100644 --- a/examples/talk-llama/eleven-labs.py +++ b/examples/talk-llama/eleven-labs.py @@ -1,20 +1,80 @@ import sys -import importlib.util +import argparse +import textwrap -if importlib.util.find_spec("elevenlabs") is None: - print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'") +parser = argparse.ArgumentParser(add_help=False, + formatter_class=argparse.RawTextHelpFormatter) +parser.add_argument("-q", "--quick", action="store_true", + help="skip checking the required library") + +modes = parser.add_argument_group("action") +modes.add_argument("inputfile", metavar="TEXTFILE", + nargs='?', type=argparse.FileType(), default=sys.stdin, + help="read the text file (default: stdin)") +modes.add_argument("-l", "--list", action="store_true", + help="show the list of voices and exit") +modes.add_argument("-h", "--help", action="help", + help="show this help and exit") + +selopts = parser.add_argument_group("voice selection") +selmodes = selopts.add_mutually_exclusive_group() +selmodes.add_argument("-n", "--name", + default="Arnold", + help="get a voice object by name (default: Arnold)") +selmodes.add_argument("-v", "--voice", type=int, metavar="NUMBER", + help="get a voice object by number (see --list)") +selopts.add_argument("-f", "--filter", action="append", metavar="KEY=VAL", + default=["use case=narration"], + help=textwrap.dedent('''\ + filter voices by labels (default: "use case=narration") + this option can be used multiple times + filtering will be disabled if the first -f has no "=" (e.g. -f "any") + ''')) + +outmodes = parser.add_argument_group("output") +outgroup = outmodes.add_mutually_exclusive_group() +outgroup.add_argument("-s", "--save", metavar="FILE", + default="audio.mp3", + help="save the TTS to a file (default: audio.mp3)") +outgroup.add_argument("-p", "--play", action="store_true", + help="play the TTS with ffplay") + +args = parser.parse_args() + +if not args.quick: + import importlib.util + if importlib.util.find_spec("elevenlabs") is None: + print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'") + sys.exit() + +from elevenlabs import voices, generate, play, save + +if args.filter and "=" in args.filter[0]: + voicelist = voices() + for f in args.filter: + label, value = f.split("=") + voicelist = filter(lambda x: x.labels.get(label) == value, voicelist) + voicelist = list(voicelist) +else: + voicelist = list(voices()) + +if args.list: + for i, v in enumerate(voicelist): + print(str(i) + ": " + v.name + " " + str(v.labels)) sys.exit() -from elevenlabs import generate, play, save +if args.voice: + voice = voicelist[args.voice % len(voicelist)] +else: + voice = args.name + # if -n should consult -f, use the following + #voice = next(x for x in voicelist if x.name == args.name) -# Get a Voice object, by name or UUID -voice = "Arnold" #Possible Voices: Adam Antoni Arnold Bella Domi Elli Josh - -# Generate the TTS audio = generate( - text=str(sys.argv[2:]), - voice=voice + text=str(args.inputfile.read()), + voice=voice ) - -# Save the TTS to a file -save(audio, "audio.mp3") +if args.play: + play(audio) +else: + save(audio, args.save) diff --git a/examples/talk-llama/speak b/examples/talk-llama/speak index 50e7210e..31ea417a 100755 --- a/examples/talk-llama/speak +++ b/examples/talk-llama/speak @@ -1,32 +1,40 @@ #!/bin/bash # Usage: -# speak.sh +# speak -# espeak -# Mac OS: brew install espeak -# Linux: apt-get install espeak -# -#espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2" +function installed() { command -v $1 >/dev/null 2>&1; } -# piper -# -# https://github.com/rhasspy/piper -# -# Tested with Linux: -# -#echo "$2" | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw - +if installed espeak; then + espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 -f $2 + +elif installed piper && installed aplay; then + cat $2 | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw - # for Mac -say "$2" +elif installed say; then + say -f $2 # Eleven Labs -# To use it, install the elevenlabs module from pip (pip install elevenlabs) -# It's possible to use the API for free with limited number of characters. To increase this limit register to https://beta.elevenlabs.io to get an api key and paste it after 'ELEVEN_API_KEY=' -#Keep the line commented to use the free version whitout api key -# -#export ELEVEN_API_KEY=your_api_key -#wd=$(dirname $0) -#script=$wd/eleven-labs.py -#python3 $script $1 "$2" >/dev/null 2>&1 -#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1 +elif installed python3 && \ + python3 -c 'import importlib.util; exit(not importlib.util.find_spec("elevenlabs"))' && \ + installed ffplay; then + # It's possible to use the API for free with limited number of characters. + # To increase this limit register to https://beta.elevenlabs.io to get an api key + # and paste it after 'ELEVEN_API_KEY=' + # Keep the line commented to use the free version without api key + #export ELEVEN_API_KEY=your_api_key + wd=$(dirname $0) + script=$wd/eleven-labs.py + python3 $script -q -p -v $1 $2 >/dev/null 2>&1 + + # Uncomment to keep the audio file + #python3 $script -q -s ./audio.mp3 -v $1 $2 >/dev/null 2>&1 + #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1 + +else + echo 'Install espeak ("brew install espeak" or "apt-get install espeak"),' + echo 'piper ("pip install piper-tts" or https://github.com/rhasspy/piper) with aplay,' + echo 'or elevenlabs ("pip install elevenlabs") with ffplay.' + echo '(export ELEVEN_API_KEY if you have an api key from https://beta.elevenlabs.io)' +fi diff --git a/examples/talk-llama/speak.bat b/examples/talk-llama/speak.bat index d719d690..91110c05 100644 --- a/examples/talk-llama/speak.bat +++ b/examples/talk-llama/speak.bat @@ -1 +1 @@ -@powershell -ExecutionPolicy Bypass -F examples\talk\speak.ps1 %1 %2 +@powershell -ExecutionPolicy Bypass -F examples\talk-llama\speak.ps1 %1 %2 diff --git a/examples/talk-llama/speak.ps1 b/examples/talk-llama/speak.ps1 index bdc4c5f8..51139586 100644 --- a/examples/talk-llama/speak.ps1 +++ b/examples/talk-llama/speak.ps1 @@ -1,12 +1,14 @@ # Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser param( - # voice options are David or Zira - [Parameter(Mandatory=$true)][string]$voice, - [Parameter(Mandatory=$true)][string]$text + [Parameter(Mandatory=$true)][int]$voicenum, + [Parameter(Mandatory=$true)][string]$textfile ) Add-Type -AssemblyName System.Speech; $speak = New-Object System.Speech.Synthesis.SpeechSynthesizer; -$speak.SelectVoice("Microsoft $voice Desktop"); +$voiceoptions = $speak.GetInstalledVoices("en-US"); +$voice = $voiceoptions[$voicenum % $voiceoptions.count]; +$speak.SelectVoice($voice.VoiceInfo.Name); $speak.Rate="0"; +$text = Get-Content -Path $textfile; $speak.Speak($text); diff --git a/examples/talk-llama/talk-llama.cpp b/examples/talk-llama/talk-llama.cpp index 60dd99e5..ddc9e765 100644 --- a/examples/talk-llama/talk-llama.cpp +++ b/examples/talk-llama/talk-llama.cpp @@ -75,6 +75,7 @@ struct whisper_params { std::string model_wsp = "models/ggml-base.en.bin"; std::string model_llama = "models/ggml-llama-7B.bin"; std::string speak = "./examples/talk-llama/speak"; + std::string speak_file = "./examples/talk-llama/to_speak.txt"; std::string prompt = ""; std::string fname_out; std::string path_session = ""; // path to file for saving/loading model eval state @@ -113,6 +114,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; } else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; } else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; } + else if (arg == "-sf" || arg == "--speak-file") { params.speak_file = argv[++i]; } else if (arg == "--prompt-file") { std::ifstream file(argv[++i]); std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); @@ -160,6 +162,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str()); fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str()); fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str()); + fprintf(stderr, " -sf FILE, --speak-file [%-7s] file to pass to TTS\n", params.speak_file.c_str()); fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", ""); fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n"); fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str()); @@ -546,10 +549,7 @@ int main(int argc, char ** argv) { // optionally give audio feedback that the current text is being processed if (!params.heard_ok.empty()) { - int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + params.heard_ok + "'").c_str()); - if (ret != 0) { - fprintf(stderr, "%s: failed to speak\n", __func__); - } + speak_with_file(params.speak, params.heard_ok, params.speak_file, voice_id); } // remove text between brackets using regex @@ -748,11 +748,7 @@ int main(int argc, char ** argv) { } } - text_to_speak = ::replace(text_to_speak, "'", "'\"'\"'"); - int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + text_to_speak + "'").c_str()); - if (ret != 0) { - fprintf(stderr, "%s: failed to speak\n", __func__); - } + speak_with_file(params.speak, text_to_speak, params.speak_file, voice_id); audio.clear(); } diff --git a/examples/talk/.gitignore b/examples/talk/.gitignore index cbf36313..9c08e1f4 100644 --- a/examples/talk/.gitignore +++ b/examples/talk/.gitignore @@ -1 +1,2 @@ audio.mp3 +to_speak.txt diff --git a/examples/talk/eleven-labs.py b/examples/talk/eleven-labs.py index edcd023b..7ed1d5dc 100644 --- a/examples/talk/eleven-labs.py +++ b/examples/talk/eleven-labs.py @@ -1,20 +1,80 @@ import sys -import importlib.util +import argparse +import textwrap -if importlib.util.find_spec("elevenlabs") is None: - print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'") +parser = argparse.ArgumentParser(add_help=False, + formatter_class=argparse.RawTextHelpFormatter) +parser.add_argument("-q", "--quick", action="store_true", + help="skip checking the required library") + +modes = parser.add_argument_group("action") +modes.add_argument("inputfile", metavar="TEXTFILE", + nargs='?', type=argparse.FileType(), default=sys.stdin, + help="read the text file (default: stdin)") +modes.add_argument("-l", "--list", action="store_true", + help="show the list of voices and exit") +modes.add_argument("-h", "--help", action="help", + help="show this help and exit") + +selopts = parser.add_argument_group("voice selection") +selmodes = selopts.add_mutually_exclusive_group() +selmodes.add_argument("-n", "--name", + default="Arnold", + help="get a voice object by name (default: Arnold)") +selmodes.add_argument("-v", "--voice", type=int, metavar="NUMBER", + help="get a voice object by number (see --list)") +selopts.add_argument("-f", "--filter", action="append", metavar="KEY=VAL", + default=["use case=narration"], + help=textwrap.dedent('''\ + filter voices by labels (default: "use case=narration") + this option can be used multiple times + filtering will be disabled if the first -f has no "=" (e.g. -f "any") + ''')) + +outmodes = parser.add_argument_group("output") +outgroup = outmodes.add_mutually_exclusive_group() +outgroup.add_argument("-s", "--save", metavar="FILE", + default="audio.mp3", + help="save the TTS to a file (default: audio.mp3)") +outgroup.add_argument("-p", "--play", action="store_true", + help="play the TTS with ffplay") + +args = parser.parse_args() + +if not args.quick: + import importlib.util + if importlib.util.find_spec("elevenlabs") is None: + print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'") + sys.exit() + +from elevenlabs import voices, generate, play, save + +if args.filter and "=" in args.filter[0]: + voicelist = voices() + for f in args.filter: + label, value = f.split("=") + voicelist = filter(lambda x: x.labels.get(label) == value, voicelist) + voicelist = list(voicelist) +else: + voicelist = list(voices()) + +if args.list: + for i, v in enumerate(voicelist): + print(str(i) + ": " + v.name + " " + str(v.labels)) sys.exit() -from elevenlabs import generate, play, save +if args.voice: + voice = voicelist[args.voice % len(voicelist)] +else: + voice = args.name + # if -n should consult -f, use the following + #voice = next(x for x in voicelist if x.name == args.name) -# Get a Voice object, by name or UUID -voice = "Arnold" #Possible Voices: Adam Antoni Arnold Bella Domi Elli Josh - -# Generate the TTS audio = generate( - text=str(sys.argv[2:]), - voice=voice + text=str(args.inputfile.read()), + voice=voice ) - -# Save the TTS to a file -save(audio, "audio.mp3") +if args.play: + play(audio) +else: + save(audio, args.save) diff --git a/examples/talk/speak b/examples/talk/speak index b822f615..31ea417a 100644 --- a/examples/talk/speak +++ b/examples/talk/speak @@ -1,24 +1,40 @@ #!/bin/bash # Usage: -# speak.sh +# speak -# espeak -# Mac OS: brew install espeak -# Linux: apt-get install espeak -# -#espeak -v en-us+m$1 -s 175 -p 50 -a 200 -g 5 -k 5 "$2" +function installed() { command -v $1 >/dev/null 2>&1; } -# Mac OS "say" command -say "$2" +if installed espeak; then + espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 -f $2 + +elif installed piper && installed aplay; then + cat $2 | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw - + +# for Mac +elif installed say; then + say -f $2 # Eleven Labs -# To use it, install the elevenlabs module from pip (pip install elevenlabs) -# It's possible to use the API for free with limited number of characters. To increase this limit register to https://beta.elevenlabs.io to get an api key and paste it after 'ELEVEN_API_KEY=' -#Keep the line commented to use the free version without api key -# -#export ELEVEN_API_KEY=your_api_key -#wd=$(dirname $0) -#script=$wd/eleven-labs.py -#python3 $script $1 "$2" -#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 +elif installed python3 && \ + python3 -c 'import importlib.util; exit(not importlib.util.find_spec("elevenlabs"))' && \ + installed ffplay; then + # It's possible to use the API for free with limited number of characters. + # To increase this limit register to https://beta.elevenlabs.io to get an api key + # and paste it after 'ELEVEN_API_KEY=' + # Keep the line commented to use the free version without api key + #export ELEVEN_API_KEY=your_api_key + wd=$(dirname $0) + script=$wd/eleven-labs.py + python3 $script -q -p -v $1 $2 >/dev/null 2>&1 + + # Uncomment to keep the audio file + #python3 $script -q -s ./audio.mp3 -v $1 $2 >/dev/null 2>&1 + #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1 + +else + echo 'Install espeak ("brew install espeak" or "apt-get install espeak"),' + echo 'piper ("pip install piper-tts" or https://github.com/rhasspy/piper) with aplay,' + echo 'or elevenlabs ("pip install elevenlabs") with ffplay.' + echo '(export ELEVEN_API_KEY if you have an api key from https://beta.elevenlabs.io)' +fi diff --git a/examples/talk/speak.ps1 b/examples/talk/speak.ps1 index bdc4c5f8..51139586 100644 --- a/examples/talk/speak.ps1 +++ b/examples/talk/speak.ps1 @@ -1,12 +1,14 @@ # Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser param( - # voice options are David or Zira - [Parameter(Mandatory=$true)][string]$voice, - [Parameter(Mandatory=$true)][string]$text + [Parameter(Mandatory=$true)][int]$voicenum, + [Parameter(Mandatory=$true)][string]$textfile ) Add-Type -AssemblyName System.Speech; $speak = New-Object System.Speech.Synthesis.SpeechSynthesizer; -$speak.SelectVoice("Microsoft $voice Desktop"); +$voiceoptions = $speak.GetInstalledVoices("en-US"); +$voice = $voiceoptions[$voicenum % $voiceoptions.count]; +$speak.SelectVoice($voice.VoiceInfo.Name); $speak.Rate="0"; +$text = Get-Content -Path $textfile; $speak.Speak($text); diff --git a/examples/talk/talk.cpp b/examples/talk/talk.cpp index f9de3048..c1c6f8ba 100644 --- a/examples/talk/talk.cpp +++ b/examples/talk/talk.cpp @@ -38,6 +38,7 @@ struct whisper_params { std::string model_wsp = "models/ggml-base.en.bin"; std::string model_gpt = "models/ggml-gpt-2-117M.bin"; std::string speak = "./examples/talk/speak"; + std::string speak_file= "./examples/talk/to_speak.txt"; std::string fname_out; }; @@ -68,6 +69,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; } else if (arg == "-mg" || arg == "--model-gpt") { params.model_gpt = argv[++i]; } else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; } + else if (arg == "-sf" || arg == "--speak_file") { params.speak_file = argv[++i]; } else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); @@ -102,6 +104,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str()); fprintf(stderr, " -mg FILE, --model-gpt [%-7s] gpt model file\n", params.model_gpt.c_str()); fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str()); + fprintf(stderr, " -sf FILE, --speak_file [%-7s] file to pass to TTS\n", params.speak_file.c_str()); fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str()); fprintf(stderr, "\n"); } @@ -316,7 +319,7 @@ int main(int argc, char ** argv) { std::string prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base); text_to_speak = gpt2_gen_text(ctx_gpt, prompt.c_str(), params.max_tokens); - text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), ""); + //text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), ""); text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of('\n')); // remove first 2 lines of base prompt @@ -354,10 +357,7 @@ int main(int argc, char ** argv) { gpt2_set_prompt(ctx_gpt, prompt_base.c_str()); text_to_speak = ::replace(text_to_speak, params.person + ": ", ""); - int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str()); - if (ret != 0) { - fprintf(stderr, "%s: system() failed!\n", __func__); - } + speak_with_file(params.speak, text_to_speak, params.speak_file, voice_id); audio.clear();