diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d5096263..e1b083d3 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -14,10 +14,6 @@ if (WHISPER_SDL2) message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}") endif() -if (WHISPER_CLBLAST) - find_package(CLBlast REQUIRED) -endif() - # common set(TARGET common) @@ -56,6 +52,8 @@ add_library(${TARGET} STATIC common.cpp common-ggml.h common-ggml.cpp + common-whisper.h + common-whisper.cpp grammar-parser.h grammar-parser.cpp ${COMMON_SOURCES_FFMPEG} diff --git a/examples/cli/cli.cpp b/examples/cli/cli.cpp index 9200d846..a84d3cb2 100644 --- a/examples/cli/cli.cpp +++ b/examples/cli/cli.cpp @@ -1,4 +1,5 @@ #include "common.h" +#include "common-whisper.h" #include "whisper.h" #include "grammar-parser.h" @@ -6,7 +7,6 @@ #include #include #include -#include #include #include #include diff --git a/examples/command/command.cpp b/examples/command/command.cpp index 11ed9ed6..1c90e185 100644 --- a/examples/command/command.cpp +++ b/examples/command/command.cpp @@ -11,16 +11,15 @@ #include "whisper.h" #include "grammar-parser.h" -#include -#include +#include +#include #include #include -#include -#include +#include +#include #include #include #include -#include // command-line parameters struct whisper_params { diff --git a/examples/common-whisper.cpp b/examples/common-whisper.cpp new file mode 100644 index 00000000..2bc610d1 --- /dev/null +++ b/examples/common-whisper.cpp @@ -0,0 +1,177 @@ +#define _USE_MATH_DEFINES // for M_PI + +#include "common-whisper.h" + +#include "common.h" + +#include "whisper.h" + +// third-party utilities +// use your favorite implementations +#define STB_VORBIS_HEADER_ONLY +#include "stb_vorbis.c" /* Enables Vorbis decoding. */ + +#ifdef _WIN32 +#ifndef NOMINMAX + #define NOMINMAX +#endif +#endif + +#define MA_NO_DEVICE_IO +#define MA_NO_THREADING +#define MA_NO_ENCODING +#define MA_NO_GENERATION +#define MA_NO_RESOURCE_MANAGER +#define MA_NO_NODE_GRAPH +#define MINIAUDIO_IMPLEMENTATION +#include "miniaudio.h" + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +#ifdef _WIN32 +#include +#include +#endif + +#include +#include + +#ifdef WHISPER_FFMPEG +// as implemented in ffmpeg_trancode.cpp only embedded in common lib if whisper built with ffmpeg support +extern bool ffmpeg_decode_audio(const std::string & ifname, std::vector & wav_data); +#endif + +bool read_audio_data(const std::string & fname, std::vector& pcmf32, std::vector>& pcmf32s, bool stereo) { + std::vector audio_data; // used for pipe input from stdin or ffmpeg decoding output + + ma_result result; + ma_decoder_config decoder_config; + ma_decoder decoder; + + decoder_config = ma_decoder_config_init(ma_format_f32, stereo ? 2 : 1, WHISPER_SAMPLE_RATE); + + if (fname == "-") { + #ifdef _WIN32 + _setmode(_fileno(stdin), _O_BINARY); + #endif + + uint8_t buf[1024]; + while (true) + { + const size_t n = fread(buf, 1, sizeof(buf), stdin); + if (n == 0) { + break; + } + audio_data.insert(audio_data.end(), buf, buf + n); + } + + if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) { + + fprintf(stderr, "Error: failed to open audio data from stdin (%s)\n", ma_result_description(result)); + + return false; + } + + fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, audio_data.size()); + } + else if (is_wav_buffer(fname)) { + if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) { + fprintf(stderr, "Error: failed to open audio data from fname buffer (%s)\n", ma_result_description(result)); + + return false; + } + } + else if ((result = ma_decoder_init_file(fname.c_str(), &decoder_config, &decoder)) != MA_SUCCESS) { +#if defined(WHISPER_FFMPEG) + if (ffmpeg_decode_audio(fname, audio_data) != 0) { + fprintf(stderr, "error: failed to ffmpeg decode '%s'\n", fname.c_str()); + + return false; + } + + if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) { + fprintf(stderr, "error: failed to read audio data as wav (%s)\n", ma_result_description(result)); + + return false; + } +#else + fprintf(stderr, "error: failed to open '%s' file (%s)\n", fname.c_str(), ma_result_description(result)); + + return false; +#endif + } + + ma_uint64 frame_count; + ma_uint64 frames_read; + + if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) { + fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result)); + + return false; + } + + pcmf32.resize(stereo ? frame_count*2 : frame_count); + + if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) { + fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result)); + + return false; + } + + if (stereo) { + pcmf32s.resize(2); + pcmf32s[0].resize(frame_count); + pcmf32s[1].resize(frame_count); + for (uint64_t i = 0; i < frame_count; i++) { + pcmf32s[0][i] = pcmf32[2*i]; + pcmf32s[1][i] = pcmf32[2*i + 1]; + } + } + + ma_decoder_uninit(&decoder); + + return true; +} + +// 500 -> 00:05.000 +// 6000 -> 01:00.000 +std::string to_timestamp(int64_t t, bool comma) { + int64_t msec = t * 10; + int64_t hr = msec / (1000 * 60 * 60); + msec = msec - hr * (1000 * 60 * 60); + int64_t min = msec / (1000 * 60); + msec = msec - min * (1000 * 60); + int64_t sec = msec / 1000; + msec = msec - sec * 1000; + + char buf[32]; + snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec); + + return std::string(buf); +} + +int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate) { + return std::max(0, std::min((int) n_samples - 1, (int) ((t*whisper_sample_rate)/100))); +} + +bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id) { + std::ofstream speak_file(path.c_str()); + if (speak_file.fail()) { + fprintf(stderr, "%s: failed to open speak_file\n", __func__); + return false; + } else { + speak_file.write(text.c_str(), text.size()); + speak_file.close(); + int ret = system((command + " " + std::to_string(voice_id) + " " + path).c_str()); + if (ret != 0) { + fprintf(stderr, "%s: failed to speak\n", __func__); + return false; + } + } + return true; +} + +#undef STB_VORBIS_HEADER_ONLY +#include "stb_vorbis.c" diff --git a/examples/common-whisper.h b/examples/common-whisper.h new file mode 100644 index 00000000..41343621 --- /dev/null +++ b/examples/common-whisper.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include +#include + +// Read WAV audio file and store the PCM data into pcmf32 +// fname can be a buffer of WAV data instead of a filename +// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE +// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM +bool read_audio_data( + const std::string & fname, + std::vector & pcmf32, + std::vector> & pcmf32s, + bool stereo); + +// convert timestamp to string, 6000 -> 01:00.000 +std::string to_timestamp(int64_t t, bool comma = false); + +// given a timestamp get the sample +int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate); + +// write text to file, and call system("command voice_id file") +bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id); diff --git a/examples/common.cpp b/examples/common.cpp index f114e84d..484cb773 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -2,26 +2,6 @@ #include "common.h" -// third-party utilities -// use your favorite implementations -#define STB_VORBIS_HEADER_ONLY -#include "stb_vorbis.c" /* Enables Vorbis decoding. */ - -#ifdef _WIN32 -#ifndef NOMINMAX - #define NOMINMAX -#endif -#endif - -#define MA_NO_DEVICE_IO -#define MA_NO_THREADING -#define MA_NO_ENCODING -#define MA_NO_GENERATION -#define MA_NO_RESOURCE_MANAGER -#define MA_NO_NODE_GRAPH -#define MINIAUDIO_IMPLEMENTATION -#include "miniaudio.h" - #include #include #include @@ -34,16 +14,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -#ifdef _WIN32 -#include -#include -#endif - -#ifdef WHISPER_FFMPEG -// as implemented in ffmpeg_trancode.cpp only embedded in common lib if whisper built with ffmpeg support -extern bool ffmpeg_decode_audio(const std::string & ifname, std::vector & wav_data); -#endif - // Function to check if the next argument exists static std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) { if (i + 1 < argc && argv[i + 1][0] != '-') { @@ -654,98 +624,6 @@ bool is_wav_buffer(const std::string buf) { return true; } -bool read_audio_data(const std::string & fname, std::vector& pcmf32, std::vector>& pcmf32s, bool stereo) { - std::vector audio_data; // used for pipe input from stdin or ffmpeg decoding output - - ma_result result; - ma_decoder_config decoder_config; - ma_decoder decoder; - - decoder_config = ma_decoder_config_init(ma_format_f32, stereo ? 2 : 1, COMMON_SAMPLE_RATE); - - if (fname == "-") { - #ifdef _WIN32 - _setmode(_fileno(stdin), _O_BINARY); - #endif - - uint8_t buf[1024]; - while (true) - { - const size_t n = fread(buf, 1, sizeof(buf), stdin); - if (n == 0) { - break; - } - audio_data.insert(audio_data.end(), buf, buf + n); - } - - if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) { - - fprintf(stderr, "Error: failed to open audio data from stdin (%s)\n", ma_result_description(result)); - - return false; - } - - fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, audio_data.size()); - } - else if (is_wav_buffer(fname)) { - if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) { - fprintf(stderr, "Error: failed to open audio data from fname buffer (%s)\n", ma_result_description(result)); - - return false; - } - } - else if ((result = ma_decoder_init_file(fname.c_str(), &decoder_config, &decoder)) != MA_SUCCESS) { -#if defined(WHISPER_FFMPEG) - if (ffmpeg_decode_audio(fname, audio_data) != 0) { - fprintf(stderr, "error: failed to ffmpeg decode '%s'\n", fname.c_str()); - - return false; - } - - if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) { - fprintf(stderr, "error: failed to read audio data as wav (%s)\n", ma_result_description(result)); - - return false; - } -#else - fprintf(stderr, "error: failed to open '%s' file (%s)\n", fname.c_str(), ma_result_description(result)); - - return false; -#endif - } - - ma_uint64 frame_count; - ma_uint64 frames_read; - - if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) { - fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result)); - - return false; - } - - pcmf32.resize(stereo ? frame_count*2 : frame_count); - - if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) { - fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result)); - - return false; - } - - if (stereo) { - pcmf32s.resize(2); - pcmf32s[0].resize(frame_count); - pcmf32s[1].resize(frame_count); - for (uint64_t i = 0; i < frame_count; i++) { - pcmf32s[0][i] = pcmf32[2*i]; - pcmf32s[1][i] = pcmf32[2*i + 1]; - } - } - - ma_decoder_uninit(&decoder); - - return true; -} - void high_pass_filter(std::vector & data, float cutoff, float sample_rate) { const float rc = 1.0f / (2.0f * M_PI * cutoff); const float dt = 1.0f / sample_rate; @@ -821,93 +699,7 @@ float similarity(const std::string & s0, const std::string & s1) { return 1.0f - (dist / std::max(s0.size(), s1.size())); } -bool sam_params_parse(int argc, char ** argv, sam_params & params) { - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - - if (arg == "-s" || arg == "--seed") { - params.seed = std::stoi(argv[++i]); - } else if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(argv[++i]); - } else if (arg == "-m" || arg == "--model") { - params.model = argv[++i]; - } else if (arg == "-i" || arg == "--inp") { - params.fname_inp = argv[++i]; - } else if (arg == "-o" || arg == "--out") { - params.fname_out = argv[++i]; - } else if (arg == "-h" || arg == "--help") { - sam_print_usage(argc, argv, params); - exit(0); - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - sam_print_usage(argc, argv, params); - exit(0); - } - } - - return true; -} - -void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, " -i FNAME, --inp FNAME\n"); - fprintf(stderr, " input file (default: %s)\n", params.fname_inp.c_str()); - fprintf(stderr, " -o FNAME, --out FNAME\n"); - fprintf(stderr, " output file (default: %s)\n", params.fname_out.c_str()); - fprintf(stderr, "\n"); -} - -// 500 -> 00:05.000 -// 6000 -> 01:00.000 -std::string to_timestamp(int64_t t, bool comma) { - int64_t msec = t * 10; - int64_t hr = msec / (1000 * 60 * 60); - msec = msec - hr * (1000 * 60 * 60); - int64_t min = msec / (1000 * 60); - msec = msec - min * (1000 * 60); - int64_t sec = msec / 1000; - msec = msec - sec * 1000; - - char buf[32]; - snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec); - - return std::string(buf); -} - -int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate) { - return std::max(0, std::min((int) n_samples - 1, (int) ((t*whisper_sample_rate)/100))); -} - -bool is_file_exist(const char *fileName) -{ - std::ifstream infile(fileName); +bool is_file_exist(const char * filename) { + std::ifstream infile(filename); return infile.good(); } - -bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id) -{ - std::ofstream speak_file(path.c_str()); - if (speak_file.fail()) { - fprintf(stderr, "%s: failed to open speak_file\n", __func__); - return false; - } else { - speak_file.write(text.c_str(), text.size()); - speak_file.close(); - int ret = system((command + " " + std::to_string(voice_id) + " " + path).c_str()); - if (ret != 0) { - fprintf(stderr, "%s: failed to speak\n", __func__); - return false; - } - } - return true; -} - -#undef STB_VORBIS_HEADER_ONLY -#include "stb_vorbis.c" diff --git a/examples/common.h b/examples/common.h index 8f50abfa..7d2219d7 100644 --- a/examples/common.h +++ b/examples/common.h @@ -11,8 +11,6 @@ #include #include -#define COMMON_SAMPLE_RATE 16000 - // // GPT CLI argument parsing // @@ -139,16 +137,6 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat( // Check if a buffer is a WAV audio file bool is_wav_buffer(const std::string buf); -// Read WAV audio file and store the PCM data into pcmf32 -// fname can be a buffer of WAV data instead of a filename -// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE -// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM -bool read_audio_data( - const std::string & fname, - std::vector & pcmf32, - std::vector> & pcmf32s, - bool stereo); - // Write PCM data into WAV audio file class wav_writer { private: @@ -266,23 +254,6 @@ bool vad_simple( // compute similarity between two strings using Levenshtein distance float similarity(const std::string & s0, const std::string & s1); -// -// SAM argument parsing -// - -struct sam_params { - int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - - std::string model = "models/sam-vit-b/ggml-model-f16.bin"; // model path - std::string fname_inp = "img.jpg"; - std::string fname_out = "img.out"; -}; - -bool sam_params_parse(int argc, char ** argv, sam_params & params); - -void sam_print_usage(int argc, char ** argv, const sam_params & params); - // // Terminal utils // @@ -330,14 +301,5 @@ const std::vector k_colors = { // Other utils // -// convert timestamp to string, 6000 -> 01:00.000 -std::string to_timestamp(int64_t t, bool comma = false); - -// given a timestamp get the sample -int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate); - // check if file exists using ifstream -bool is_file_exist(const char *fileName); - -// write text to file, and call system("command voice_id file") -bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id); +bool is_file_exist(const char * filename); diff --git a/examples/lsp/lsp.cpp b/examples/lsp/lsp.cpp index 803cd6d5..a32da511 100644 --- a/examples/lsp/lsp.cpp +++ b/examples/lsp/lsp.cpp @@ -3,14 +3,15 @@ #include "whisper.h" #include "json.hpp" -#include #include +#include #include +#include +#include +#include #include #include #include -#include -#include using json = nlohmann::json; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 4468ce7a..88d36e30 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,17 +1,18 @@ #include "common.h" +#include "common-whisper.h" #include "whisper.h" #include "httplib.h" #include "json.hpp" +#include #include -#include #include +#include +#include #include #include #include -#include -#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index 19d42138..65c6587d 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -4,15 +4,15 @@ // #include "common-sdl.h" #include "common.h" +#include "common-whisper.h" #include "whisper.h" -#include +#include #include +#include #include #include #include -#include - // command-line parameters struct whisper_params { diff --git a/examples/talk-llama/CMakeLists.txt b/examples/talk-llama/CMakeLists.txt index c15c6b5a..aea1ae64 100644 --- a/examples/talk-llama/CMakeLists.txt +++ b/examples/talk-llama/CMakeLists.txt @@ -25,10 +25,7 @@ if (WHISPER_SDL2) unicode-data.cpp) target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS}) - if (WHISPER_CLBLAST) - set(CLBLAST_LIBNAME clblast) - endif () - target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CLBLAST_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) + target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) if(WIN32) # It requires Windows 8.1 or later for PrefetchVirtualMemory diff --git a/examples/talk-llama/talk-llama.cpp b/examples/talk-llama/talk-llama.cpp index dcdaec48..9097c491 100644 --- a/examples/talk-llama/talk-llama.cpp +++ b/examples/talk-llama/talk-llama.cpp @@ -3,18 +3,19 @@ #include "common-sdl.h" #include "common.h" +#include "common-whisper.h" #include "whisper.h" #include "llama.h" -#include +#include #include #include #include +#include +#include #include #include #include -#include -#include static std::vector llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) { const llama_model * model = llama_get_model(ctx); diff --git a/examples/wchess/libwchess/WChess.cpp b/examples/wchess/libwchess/WChess.cpp index d9f06696..35ac4ca5 100644 --- a/examples/wchess/libwchess/WChess.cpp +++ b/examples/wchess/libwchess/WChess.cpp @@ -2,7 +2,7 @@ #include "Chessboard.h" #include "grammar-parser.h" #include "common.h" -#include +#include WChess::WChess(whisper_context * ctx, const whisper_full_params & wparams,