diff --git a/README.md b/README.md
index 5c229797..ab0d8820 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,7 @@ As an example, here is a video of running the model on an iPhone 13 device - ful
 
 https://user-images.githubusercontent.com/1991296/197385372-962a6dea-bca1-4d50-bf96-1d8c27b98c81.mp4
 
-You can also easily make your own offline voice assistant application:
+You can also easily make your own offline voice assistant application: [command](examples/command)
 
 https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index e798d1f0..b03694ef 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -21,6 +21,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 if (EMSCRIPTEN)
     add_subdirectory(whisper.wasm)
     add_subdirectory(stream.wasm)
+    add_subdirectory(command.wasm)
     add_subdirectory(talk.wasm)
 else()
     add_subdirectory(main)
diff --git a/examples/command.wasm/CMakeLists.txt b/examples/command.wasm/CMakeLists.txt
new file mode 100644
index 00000000..27fd0ab0
--- /dev/null
+++ b/examples/command.wasm/CMakeLists.txt
@@ -0,0 +1,47 @@
+#
+# libcommand
+#
+
+set(TARGET libcommand)
+
+add_executable(${TARGET}
+    emscripten.cpp
+    )
+
+target_link_libraries(${TARGET} PRIVATE
+    whisper
+    )
+
+unset(EXTRA_FLAGS)
+
+if (WHISPER_WASM_SINGLE_FILE)
+    set(EXTRA_FLAGS "-s SINGLE_FILE=1")
+    message(STATUS "Embedding WASM inside command.js")
+
+    add_custom_command(
+        TARGET ${TARGET} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy
+        ${CMAKE_BINARY_DIR}/bin/libcommand.js
+        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/command.wasm/command.js
+        )
+endif()
+
+set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
+    --bind \
+    -s USE_PTHREADS=1 \
+    -s PTHREAD_POOL_SIZE=8 \
+    -s INITIAL_MEMORY=1024MB \
+    -s TOTAL_MEMORY=1024MB \
+    -s FORCE_FILESYSTEM=1 \
+    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
+    ${EXTRA_FLAGS} \
+    ")
+
+#
+# command.wasm
+#
+
+set(TARGET command.wasm)
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js   ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
diff --git a/examples/command.wasm/README.md b/examples/command.wasm/README.md
new file mode 100644
index 00000000..a6e0cf18
--- /dev/null
+++ b/examples/command.wasm/README.md
@@ -0,0 +1,23 @@
+# command.wasm
+
+This is a basic Voice Assistant example that accepts voice commands from the microphone.
+It runs fully in the browser via WebAssembly.
+
+Online demo: https://whisper.ggerganov.com/command/
+
+Terminal version: https://github.com/ggerganov/whisper.cpp/examples/command
+
+## Build instructions
+
+```bash
+# build using Emscripten (v3.1.2)
+git clone https://github.com/ggerganov/whisper.cpp
+cd whisper.cpp
+mkdir build-em && cd build-em
+emcmake cmake ..
+make -j + +# copy the produced page to your HTTP path +cp bin/command.wasm/* /path/to/html/ +cp bin/libcommand.worker.js /path/to/html/ +``` diff --git a/examples/command.wasm/emscripten.cpp b/examples/command.wasm/emscripten.cpp new file mode 100644 index 00000000..d4bbb212 --- /dev/null +++ b/examples/command.wasm/emscripten.cpp @@ -0,0 +1,408 @@ +#include "ggml.h" +#include "whisper.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +constexpr int N_THREAD = 8; + +std::vector g_contexts(4, nullptr); + +std::mutex g_mutex; +std::thread g_worker; + +std::atomic g_running(false); + +std::string g_status = ""; +std::string g_status_forced = ""; +std::string g_transcribed = ""; + +std::vector g_pcmf32; + +static std::string trim(const std::string & s) { + std::regex e("^\\s+|\\s+$"); + return std::regex_replace(s, e, ""); +} + +static void high_pass_filter(std::vector & data, float cutoff, float sample_rate) { + const float rc = 1.0f / (2.0f * M_PI * cutoff); + const float dt = 1.0f / sample_rate; + const float alpha = dt / (rc + dt); + + float y = data[0]; + + for (size_t i = 1; i < data.size(); i++) { + y = alpha * (y + data[i] - data[i - 1]); + data[i] = y; + } +} + +// compute similarity between two strings using Levenshtein distance +static float similarity(const std::string & s0, const std::string & s1) { + const size_t len0 = s0.size() + 1; + const size_t len1 = s1.size() + 1; + + std::vector col(len1, 0); + std::vector prevCol(len1, 0); + + for (size_t i = 0; i < len1; i++) { + prevCol[i] = i; + } + + for (size_t i = 0; i < len0; i++) { + col[0] = i; + for (size_t j = 1; j < len1; j++) { + col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1)); + } + col.swap(prevCol); + } + + const float dist = prevCol[len1 - 1]; + + return 1.0f - (dist / std::max(s0.size(), s1.size())); +} + +void command_set_status(const std::string & status) { + std::lock_guard lock(g_mutex); + g_status = status; +} + +bool command_vad_simple(std::vector & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) { + const int n_samples = pcmf32.size(); + const int n_samples_last = (sample_rate * last_ms) / 1000; + + if (n_samples_last >= n_samples) { + // not enough samples - assume no speech + return false; + } + + if (freq_thold > 0.0f) { + high_pass_filter(pcmf32, freq_thold, sample_rate); + } + + float energy_all = 0.0f; + float energy_last = 0.0f; + + for (size_t i = 0; i < n_samples; i++) { + energy_all += fabsf(pcmf32[i]); + + if (i >= n_samples - n_samples_last) { + energy_last += fabsf(pcmf32[i]); + } + } + + energy_all /= n_samples; + energy_last /= n_samples_last; + + if (verbose) { + fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold); + } + + if (energy_last > vad_thold*energy_all) { + return false; + } + + return true; +} + +std::string command_transcribe(whisper_context * ctx, const whisper_full_params & wparams, const std::vector & pcmf32, float & prob, int64_t & t_ms) { + const auto t_start = std::chrono::high_resolution_clock::now(); + + prob = 0.0f; + t_ms = 0; + + if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) { + return ""; + } + + int prob_n = 0; + std::string result; + + const int n_segments = whisper_full_n_segments(ctx); + for (int i = 0; i < n_segments; ++i) { + const char * text = whisper_full_get_segment_text(ctx, i); + + result += text; + + 
const int n_tokens = whisper_full_n_tokens(ctx, i); + for (int j = 0; j < n_tokens; ++j) { + const auto token = whisper_full_get_token_data(ctx, i, j); + + prob += token.p; + ++prob_n; + } + } + + if (prob_n > 0) { + prob /= prob_n; + } + + const auto t_end = std::chrono::high_resolution_clock::now(); + t_ms = std::chrono::duration_cast(t_end - t_start).count(); + + return result; +} + +void command_get_audio(int ms, int sample_rate, std::vector & audio) { + const int64_t n_samples = (ms * sample_rate) / 1000; + + int64_t n_take = 0; + if (g_pcmf32.size() < n_samples) { + n_take = g_pcmf32.size(); + } else { + n_take = n_samples; + } + + audio.resize(n_take); + std::copy(g_pcmf32.end() - n_take, g_pcmf32.end(), audio.begin()); +} + +void command_main(size_t index) { + command_set_status("loading data ..."); + + struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY); + + wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency()); + wparams.offset_ms = 0; + wparams.translate = false; + wparams.no_context = true; + wparams.single_segment = true; + wparams.print_realtime = false; + wparams.print_progress = false; + wparams.print_timestamps = true; + wparams.print_special = false; + + wparams.max_tokens = 32; + wparams.audio_ctx = 768; // partial encoder context for better performance + + wparams.language = "en"; + + printf("command: using %d threads\n", wparams.n_threads); + + bool is_running = true; + bool have_prompt = false; + bool ask_prompt = true; + bool print_energy = false; + + float prob0 = 0.0f; + float prob = 0.0f; + + std::vector pcmf32_cur; + std::vector pcmf32_prompt; + + const std::string k_prompt = "Ok Whisper, start listening for commands."; + + // whisper context + auto & ctx = g_contexts[index]; + + const int32_t vad_ms = 2000; + const int32_t prompt_ms = 5000; + const int32_t command_ms = 4000; + + const float vad_thold = 0.1f; + const float freq_thold = -1.0f; + + while (g_running) { + // delay + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + if (ask_prompt) { + fprintf(stdout, "\n"); + fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m"); + fprintf(stdout, "\n"); + + { + char txt[1024]; + snprintf(txt, sizeof(txt), "Say the following phrase: '%s'", k_prompt.c_str()); + command_set_status(txt); + } + + ask_prompt = false; + } + + int64_t t_ms = 0; + + { + command_get_audio(vad_ms, WHISPER_SAMPLE_RATE, pcmf32_cur); + + if (command_vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, vad_thold, freq_thold, print_energy)) { + fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__); + command_set_status("Speech detected! 
Processing ..."); + + if (!have_prompt) { + command_get_audio(prompt_ms, WHISPER_SAMPLE_RATE, pcmf32_cur); + + const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob0, t_ms)); + + fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms); + + const float sim = similarity(txt, k_prompt); + + if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) { + fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__); + ask_prompt = true; + } else { + fprintf(stdout, "\n"); + fprintf(stdout, "%s: The prompt has been recognized!\n", __func__); + fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__); + fprintf(stdout, "\n"); + + { + char txt[1024]; + snprintf(txt, sizeof(txt), "Success! Waiting for voice commands ..."); + command_set_status(txt); + } + + // save the audio for the prompt + pcmf32_prompt = pcmf32_cur; + have_prompt = true; + } + } else { + command_get_audio(command_ms, WHISPER_SAMPLE_RATE, pcmf32_cur); + + // prepend the prompt audio + pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end()); + + const auto txt = ::trim(::command_transcribe(ctx, wparams, pcmf32_cur, prob, t_ms)); + + prob = 100.0f*(prob - prob0); + + fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str()); + + // find the prompt in the text + float best_sim = 0.0f; + size_t best_len = 0; + for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) { + const auto prompt = txt.substr(0, n); + + const float sim = similarity(prompt, k_prompt); + + //fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim); + + if (sim > best_sim) { + best_sim = sim; + best_len = n; + } + } + + const std::string command = ::trim(txt.substr(best_len)); + + fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms); + fprintf(stdout, "\n"); + + { + char txt[1024]; + snprintf(txt, sizeof(txt), "Command '%s', (t = %d ms)", command.c_str(), (int) t_ms); + command_set_status(txt); + } + { + std::lock_guard lock(g_mutex); + g_transcribed = command; + } + } + + g_pcmf32.clear(); + } + } + } + + if (index < g_contexts.size()) { + whisper_free(g_contexts[index]); + g_contexts[index] = nullptr; + } +} + +EMSCRIPTEN_BINDINGS(command) { + emscripten::function("init", emscripten::optional_override([](const std::string & path_model) { + for (size_t i = 0; i < g_contexts.size(); ++i) { + if (g_contexts[i] == nullptr) { + g_contexts[i] = whisper_init(path_model.c_str()); + if (g_contexts[i] != nullptr) { + g_running = true; + if (g_worker.joinable()) { + g_worker.join(); + } + g_worker = std::thread([i]() { + command_main(i); + }); + + return i + 1; + } else { + return (size_t) 0; + } + } + } + + return (size_t) 0; + })); + + emscripten::function("free", emscripten::optional_override([](size_t index) { + if (g_running) { + g_running = false; + } + })); + + emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) { + --index; + + if (index >= g_contexts.size()) { + return -1; + } + + if (g_contexts[index] == nullptr) { + return -2; + } + + { + std::lock_guard lock(g_mutex); + const int n = audio["length"].as(); + + emscripten::val heap = emscripten::val::module_property("HEAPU8"); + emscripten::val memory = heap["buffer"]; + + g_pcmf32.resize(n); + + emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast(g_pcmf32.data()), n); + 
memoryView.call("set", audio); + } + + return 0; + })); + + emscripten::function("get_transcribed", emscripten::optional_override([]() { + std::string transcribed; + + { + std::lock_guard lock(g_mutex); + transcribed = std::move(g_transcribed); + } + + return transcribed; + })); + + emscripten::function("get_status", emscripten::optional_override([]() { + std::string status; + + { + std::lock_guard lock(g_mutex); + status = g_status_forced.empty() ? g_status : g_status_forced; + } + + return status; + })); + + emscripten::function("set_status", emscripten::optional_override([](const std::string & status) { + { + std::lock_guard lock(g_mutex); + g_status_forced = status; + } + })); +} diff --git a/examples/command.wasm/index-tmpl.html b/examples/command.wasm/index-tmpl.html new file mode 100644 index 00000000..08670a1e --- /dev/null +++ b/examples/command.wasm/index-tmpl.html @@ -0,0 +1,386 @@ + + + + command : Voice assistant example using Whisper + WebAssembly + + + + +
+ command : Voice assistant example using Whisper + WebAssembly

+ You can find more about this project on GitHub.

+ +
+ Select the model you would like to use, click the "Start" button and follow the instructions.

+ +
+ Whisper model:
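
Selecting a model ultimately calls the `init(path_model)` function exported from `emscripten.cpp` above, which creates the whisper context and starts the `command_main()` worker thread. A minimal sketch of that call follows — the `whisper.bin` filename is an assumption, and the real page must first download the model and place it in the Emscripten filesystem (presumably via the shared `helpers.js`):

```javascript
// Illustrative glue (not the actual page script): create a whisper instance
// once the selected model file exists in the module's in-memory filesystem.
// Module.init() returns a 1-based instance index, or 0 on failure.
let instance = 0;

function onModelReady() {
    if (!instance) {
        instance = Module.init('whisper.bin'); // hypothetical model path
    }

    if (instance) {
        console.log('whisper initialized, instance =', instance);
    } else {
        console.error('failed to initialize the whisper context');
    }
}
```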
+ +
+ +
+ + + +
+ +
+ +
+ Status: not started
[The recognized voice commands will be displayed here]
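
The "Status" line and the placeholder above map onto the `get_status()` and `get_transcribed()` bindings, while captured microphone samples are handed to the module through `set_audio()`. A rough sketch of that loop is shown below — the element IDs, the `instance` value returned by `init()`, and the 16 kHz Float32Array capture are assumptions, not the actual page code:

```javascript
// Illustrative sketch: feed audio to the module and mirror its state in the page.
// `samples` is assumed to be 16 kHz mono PCM in a Float32Array
// (whisper.cpp operates on WHISPER_SAMPLE_RATE = 16000 Hz audio).
function pushAudio(samples) {
    const ret = Module.set_audio(instance, samples); // copied into g_pcmf32
    if (ret !== 0) {
        console.warn('set_audio failed with code', ret);
    }
}

// Poll roughly 10 times per second and update the two elements above.
setInterval(() => {
    document.getElementById('state-status').innerText = Module.get_status();

    const command = Module.get_transcribed(); // empty string when nothing new
    if (command) {
        document.getElementById('state-transcribed').innerText = command;
    }
}, 100);
```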
+
+ +
+ Debug output:
+
+ Troubleshooting

+ The page does some heavy computations, so make sure that:

  • you use a modern web browser (e.g. Chrome, Firefox)
  • you use a fast desktop or laptop computer (i.e. not a mobile phone)
  • your browser supports WASM Fixed-width SIMD
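
Because the example is linked with `-s USE_PTHREADS=1` (see the CMake flags earlier in this diff), current browsers typically also require the page to be cross-origin isolated so that `SharedArrayBuffer` is available. A quick, illustrative console check for these requirements (not part of the page itself):

```javascript
// Paste into the browser's developer console on the deployed page.
console.log('WebAssembly available :', typeof WebAssembly !== 'undefined');
console.log('SharedArrayBuffer     :', typeof SharedArrayBuffer !== 'undefined');
console.log('Cross-origin isolated :', self.crossOriginIsolated === true);
console.log('Logical CPU cores     :', navigator.hardwareConcurrency);
```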
+ | Build time: @GIT_DATE@ | Commit hash: @GIT_SHA1@ | Commit subject: @GIT_COMMIT_SUBJECT@ | Source Code |
+
+
+
+
+
+
+
diff --git a/examples/command/README.md b/examples/command/README.md
index 3ef73684..de8b61ca 100644
--- a/examples/command/README.md
+++ b/examples/command/README.md
@@ -13,6 +13,8 @@ More info is available in [issue #171](https://github.com/ggerganov/whisper.cpp/
 
 https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
 
+Web version: https://github.com/ggerganov/whisper.cpp/examples/command.wasm
+
 ## Building
 
 The `command` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
diff --git a/examples/command/command.cpp b/examples/command/command.cpp
index 2e47be0c..9cc6dce9 100644
--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
@@ -535,7 +535,7 @@ int main(int argc, char ** argv) {
 
     bool is_running  = true;
     bool have_prompt = false;
-    bool ask_prompt  = true;
+    bool ask_prompt  = true;
 
     float prob0 = 0.0f;
     float prob  = 0.0f;
diff --git a/examples/stream.wasm/index-tmpl.html b/examples/stream.wasm/index-tmpl.html
index cd72b6fd..2033d961 100644
--- a/examples/stream.wasm/index-tmpl.html
+++ b/examples/stream.wasm/index-tmpl.html
@@ -100,12 +100,6 @@