// whisper.cpp/examples/talk-llama/talk-llama.cpp

// Talk with AI
//

#include "common-sdl.h"
#include "common.h"
#include "whisper.h"
#include "llama.h"

#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <exception>
#include <fstream>
#include <iterator>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>

std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
    auto * model = llama_get_model(ctx);

    // upper limit for the number of tokens
    int n_tokens = text.length() + add_bos;
    std::vector<llama_token> result(n_tokens);
    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, false);
    if (n_tokens < 0) {
        result.resize(-n_tokens);
        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, false);
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
    }
    return result;
}

// Convenience wrapper around the model-level llama_token_to_piece().
//
// ctx   - context whose model is used for the conversion
// token - token to convert
//
// Returns the text piece of `token` as a std::string.
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
    const auto * mdl = llama_get_model(ctx);

    // start with a small buffer; most pieces fit into 8 bytes
    std::vector<char> piece(8, 0);

    const int n_tokens = llama_token_to_piece(mdl, token, piece.data(), piece.size(), false);
    if (n_tokens >= 0) {
        piece.resize(n_tokens);
    } else {
        // a negative result is treated as the negated required size: grow and convert again
        piece.resize(-n_tokens);
        const int check = llama_token_to_piece(mdl, token, piece.data(), piece.size(), false);
        GGML_ASSERT(check == -n_tokens);
    }

    return std::string(piece.begin(), piece.end());
}

// command-line parameters
// Command-line parameters of the talk-llama example.
// The defaults below are the values shown in the usage text.
struct whisper_params {
    // std::thread::hardware_concurrency() may return 0 when the value is not computable,
    // so clamp the default to at least one thread
    int32_t n_threads  = std::max(1, std::min(4, (int32_t) std::thread::hardware_concurrency()));
    int32_t voice_ms   = 10000; // voice duration in milliseconds
    int32_t capture_id = -1;    // capture device ID
    int32_t max_tokens = 32;    // maximum number of tokens per audio chunk
    int32_t audio_ctx  = 0;     // audio context size (0 - all)
    int32_t n_gpu_layers = 999; // number of layers to store in VRAM

    float vad_thold  = 0.6f;    // voice activity detection threshold
    float freq_thold = 100.0f;  // high-pass frequency cutoff

    bool translate      = false; // translate from source language to english
    bool print_special  = false; // print special tokens
    bool print_energy   = false; // print sound energy (for debugging)
    bool no_timestamps  = true;  // when true, whisper does not print timestamps
    bool verbose_prompt = false; // print prompt at start
    bool use_gpu        = true;  // cleared by --no-gpu
    bool flash_attn     = false; // flash attention

    std::string person      = "Georgi";                             // person name, substituted into the prompt
    std::string bot_name    = "LLaMA";                              // bot name, substituted into the prompt
    std::string wake_cmd    = "";                                   // wake-up command to listen for (empty - disabled)
    std::string heard_ok    = "";                                   // said by TTS before generating reply (empty - disabled)
    std::string language    = "en";                                 // spoken language
    std::string model_wsp   = "models/ggml-base.en.bin";            // whisper model file
    std::string model_llama = "models/ggml-llama-7B.bin";           // llama model file
    std::string speak       = "./examples/talk-llama/speak";        // command for TTS
    std::string speak_file  = "./examples/talk-llama/to_speak.txt"; // file to pass to TTS
    std::string prompt      = "";                                   // custom prompt loaded via --prompt-file (empty - built-in prompt)
    std::string fname_out;                                          // text output file name
    std::string path_session = "";       // path to file for saving/loading model eval state
};

// Forward declaration: whisper_params_parse() below calls this before its definition appears.
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);

// Parses the command line into `params`.
//
// argc, argv - the program arguments as received by main()
// params     - receives the parsed values; fields without a matching option keep their value
//
// Returns true on success. Returns false (after printing the error and the usage text to
// stderr) when an argument is unknown, when an option that needs a value is the last
// argument, when a numeric value cannot be parsed, or when the prompt file cannot be opened.
// "-h"/"--help" prints the usage text and exits the process with status 0.
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        const std::string arg = argv[i];

        // Returns the value that follows the current option and advances `i`.
        // Throws instead of reading argv[argc] when the option is the last argument.
        const auto next = [&]() -> const char * {
            if (i + 1 >= argc) {
                throw std::invalid_argument("missing value");
            }
            return argv[++i];
        };

        try {
            if (arg == "-h" || arg == "--help") {
                whisper_print_usage(argc, argv, params);
                exit(0);
            }
            else if (arg == "-t"   || arg == "--threads")        { params.n_threads      = std::stoi(next()); }
            else if (arg == "-vms" || arg == "--voice-ms")       { params.voice_ms       = std::stoi(next()); }
            else if (arg == "-c"   || arg == "--capture")        { params.capture_id     = std::stoi(next()); }
            else if (arg == "-mt"  || arg == "--max-tokens")     { params.max_tokens     = std::stoi(next()); }
            else if (arg == "-ac"  || arg == "--audio-ctx")      { params.audio_ctx      = std::stoi(next()); }
            else if (arg == "-ngl" || arg == "--n-gpu-layers")   { params.n_gpu_layers   = std::stoi(next()); }
            else if (arg == "-vth" || arg == "--vad-thold")      { params.vad_thold      = std::stof(next()); }
            else if (arg == "-fth" || arg == "--freq-thold")     { params.freq_thold     = std::stof(next()); }
            else if (arg == "-tr"  || arg == "--translate")      { params.translate      = true; }
            else if (arg == "-ps"  || arg == "--print-special")  { params.print_special  = true; }
            else if (arg == "-pe"  || arg == "--print-energy")   { params.print_energy   = true; }
            else if (arg == "-vp"  || arg == "--verbose-prompt") { params.verbose_prompt = true; }
            else if (arg == "-ng"  || arg == "--no-gpu")         { params.use_gpu        = false; }
            else if (arg == "-fa"  || arg == "--flash-attn")     { params.flash_attn     = true; }
            else if (arg == "-p"   || arg == "--person")         { params.person         = next(); }
            else if (arg == "-bn"  || arg == "--bot-name")       { params.bot_name       = next(); }
            else if (arg == "--session")                         { params.path_session   = next(); }
            else if (arg == "-w"   || arg == "--wake-command")   { params.wake_cmd       = next(); }
            else if (arg == "-ho"  || arg == "--heard-ok")       { params.heard_ok       = next(); }
            else if (arg == "-l"   || arg == "--language")       { params.language       = next(); }
            else if (arg == "-mw"  || arg == "--model-whisper")  { params.model_wsp      = next(); }
            else if (arg == "-ml"  || arg == "--model-llama")    { params.model_llama    = next(); }
            else if (arg == "-s"   || arg == "--speak")          { params.speak          = next(); }
            else if (arg == "-sf"  || arg == "--speak-file")     { params.speak_file     = next(); }
            else if (arg == "--prompt-file")                     {
                const char * fname = next();

                std::ifstream file(fname);
                if (!file) {
                    fprintf(stderr, "error: failed to open prompt file '%s'\n", fname);
                    return false;
                }

                std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), std::back_inserter(params.prompt));

                // drop a single trailing newline; the file may be empty, so check before calling back()
                if (!params.prompt.empty() && params.prompt.back() == '\n') {
                    params.prompt.pop_back();
                }
            }
            else if (arg == "-f"   || arg == "--file")           { params.fname_out      = next(); }
            else {
                fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
                whisper_print_usage(argc, argv, params);
                return false;
            }
        } catch (const std::exception & err) {
            // raised by next() (missing value) or by std::stoi/std::stof (malformed or out-of-range number)
            fprintf(stderr, "error: invalid or missing value for argument '%s' (%s)\n", arg.c_str(), err.what());
            whisper_print_usage(argc, argv, params);
            return false;
        }
    }

    return true;
}

void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
    fprintf(stderr, "\n");
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h,       --help           [default] show this help message and exit\n");
    fprintf(stderr, "  -t N,     --threads N      [%-7d] number of threads to use during computation\n", params.n_threads);
    fprintf(stderr, "  -vms N,   --voice-ms N     [%-7d] voice duration in milliseconds\n",              params.voice_ms);
    fprintf(stderr, "  -c ID,    --capture ID     [%-7d] capture device ID\n",                           params.capture_id);
    fprintf(stderr, "  -mt N,    --max-tokens N   [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
    fprintf(stderr, "  -ac N,    --audio-ctx N    [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
    fprintf(stderr, "  -ngl N,   --n-gpu-layers N [%-7d] number of layers to store in VRAM\n",           params.n_gpu_layers);
    fprintf(stderr, "  -vth N,   --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
    fprintf(stderr, "  -fth N,   --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
    fprintf(stderr, "  -tr,      --translate      [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
    fprintf(stderr, "  -ps,      --print-special  [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
    fprintf(stderr, "  -pe,      --print-energy   [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
    fprintf(stderr, "  -vp,      --verbose-prompt [%-7s] print prompt at start\n",                       params.verbose_prompt ? "true" : "false");
    fprintf(stderr, "  -ng,      --no-gpu         [%-7s] disable GPU\n",                                 params.use_gpu ? "false" : "true");
    fprintf(stderr, "  -fa,      --flash-attn     [%-7s] flash attention\n",                             params.flash_attn ? "true" : "false");
    fprintf(stderr, "  -p NAME,  --person NAME    [%-7s] person name (for prompt selection)\n",          params.person.c_str());
    fprintf(stderr, "  -bn NAME, --bot-name NAME  [%-7s] bot name (to display)\n",                       params.bot_name.c_str());
    fprintf(stderr, "  -w TEXT,  --wake-command T [%-7s] wake-up command to listen for\n",               params.wake_cmd.c_str());
    fprintf(stderr, "  -ho TEXT, --heard-ok TEXT  [%-7s] said by TTS before generating reply\n",         params.heard_ok.c_str());
    fprintf(stderr, "  -l LANG,  --language LANG  [%-7s] spoken language\n",                             params.language.c_str());
    fprintf(stderr, "  -mw FILE, --model-whisper  [%-7s] whisper model file\n",                          params.model_wsp.c_str());
    fprintf(stderr, "  -ml FILE, --model-llama    [%-7s] llama model file\n",                            params.model_llama.c_str());
    fprintf(stderr, "  -s FILE,  --speak TEXT     [%-7s] command for TTS\n",                             params.speak.c_str());
    fprintf(stderr, "  -sf FILE, --speak-file     [%-7s] file to pass to TTS\n",                         params.speak_file.c_str());
    fprintf(stderr, "  --prompt-file FNAME        [%-7s] file with custom prompt to start dialog\n",     "");
    fprintf(stderr, "  --session FNAME                   file to cache model state in (may be large!) (default: none)\n");
    fprintf(stderr, "  -f FNAME, --file FNAME     [%-7s] text output file name\n",                       params.fname_out.c_str());
    fprintf(stderr, "\n");
}

// Runs Whisper on an audio buffer and returns the recognized text.
//
// ctx         - whisper context used for the inference
// params      - command-line parameters (threads, language, translate, max tokens, ...)
// pcmf32      - audio samples passed to whisper_full()
// prompt_text - text that is tokenized and passed to Whisper as the initial prompt
// prob        - out: average of the per-token `p` values over all segments (0 if no tokens)
// t_ms        - out: wall-clock duration of this call in milliseconds (stays 0 on failure)
//
// Returns the concatenated text of all segments, or "" when whisper_full() fails.
std::string transcribe(
        whisper_context * ctx,
        const whisper_params & params,
        const std::vector<float> & pcmf32,
        const std::string prompt_text,
        float & prob,
        int64_t & t_ms) {
    const auto t_start = std::chrono::high_resolution_clock::now();

    prob = 0.0f;
    t_ms = 0;

    // tokenize the prompt
    // whisper_tokenize() signals failure with a negative value (documented in whisper.h as the
    // negated number of tokens that would have been produced). Feeding that value straight to
    // resize() would turn it into a huge unsigned size, so retry once with the requested size
    // and fall back to an empty prompt if that fails as well.
    std::vector<whisper_token> prompt_tokens(1024);

    int n_prompt = whisper_tokenize(ctx, prompt_text.c_str(), prompt_tokens.data(), prompt_tokens.size());
    if (n_prompt < 0) {
        prompt_tokens.resize(-n_prompt);
        n_prompt = whisper_tokenize(ctx, prompt_text.c_str(), prompt_tokens.data(), prompt_tokens.size());
    }
    prompt_tokens.resize(n_prompt > 0 ? n_prompt : 0);

    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    wparams.print_progress   = false;
    wparams.print_special    = params.print_special;
    wparams.print_realtime   = false;
    wparams.print_timestamps = !params.no_timestamps;
    wparams.translate        = params.translate;
    wparams.no_context       = true;
    wparams.single_segment   = true;
    wparams.max_tokens       = params.max_tokens;
    wparams.language         = params.language.c_str();
    wparams.n_threads        = params.n_threads;

    wparams.prompt_tokens    = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
    wparams.prompt_n_tokens  = prompt_tokens.empty() ? 0       : prompt_tokens.size();

    wparams.audio_ctx        = params.audio_ctx;

    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
        return "";
    }

    // collect the text and accumulate the token probabilities
    int prob_n = 0;
    std::string result;

    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);

        result += text;

        const int n_tokens = whisper_full_n_tokens(ctx, i);
        for (int j = 0; j < n_tokens; ++j) {
            const auto token = whisper_full_get_token_data(ctx, i, j);

            prob += token.p;
            ++prob_n;
        }
    }

    // turn the sum into an average; guard against division by zero when nothing was decoded
    if (prob_n > 0) {
        prob /= prob_n;
    }

    const auto t_end = std::chrono::high_resolution_clock::now();
    t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();

    return result;
}

// Splits `txt` into whitespace-separated words, in order of appearance.
// Returns an empty vector when `txt` is empty or contains only whitespace.
std::vector<std::string> get_words(const std::string &txt) {
    std::vector<std::string> tokens;

    std::istringstream stream(txt);
    for (std::string piece; stream >> piece; ) {
        tokens.emplace_back(piece);
    }

    return tokens;
}

// Prompt template handed to Whisper for every transcription.
// Placeholder: {1} is replaced with the bot name in main().
const std::string k_prompt_whisper = R"(A conversation with a person called {1}.)";

// Default prompt template for LLaMA, used by main() when no custom prompt was loaded.
// Placeholders replaced in main() before tokenization:
//   {0} - person name
//   {1} - bot name
//   {2} - current local time, formatted as "%H:%M"
//   {3} - current year
//   {4} - chat symbol (":")
// The template ends with "{0}{4}", i.e. right where the person's next line begins.
const std::string k_prompt_llama = R"(Text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
{1} is helpful, kind, honest, friendly, good at writing and never fails to answer {0}’s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {0} and {1} say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{1} responds with short and concise answers.

{0}{4} Hello, {1}!
{1}{4} Hello {0}! How may I help you today?
{0}{4} What time is it?
{1}{4} It is {2} o'clock.
{0}{4} What year is it?
{1}{4} We are in {3}.
{0}{4} What is a cat?
{1}{4} A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{0}{4} Name a color.
{1}{4} Blue
{0}{4})";

// Program entry point: connects microphone capture, Whisper transcription, LLaMA text
// generation and the external TTS command into an interactive voice chat loop.
//
// Flow:
//   1. parse the command line and validate the language
//   2. load the Whisper and LLaMA models and create their contexts
//   3. build the initial LLaMA prompt (placeholders {0}..{4}), optionally load a saved
//      session file, and decode the prompt
//   4. loop: wait for voice activity, transcribe it, feed the text to LLaMA and sample a
//      reply until the antiprompt ("<person><chat_symb>") shows up, then pass the reply
//      to the TTS command
//
// Returns 0 on a normal shutdown and 1 on any initialization or decode failure.
int main(int argc, char ** argv) {
    whisper_params params;

    if (whisper_params_parse(argc, argv, params) == false) {
        return 1;
    }

    if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
        whisper_print_usage(argc, argv, params);
        // an invalid option is an error: report it through the exit status
        return 1;
    }

    // whisper init

    struct whisper_context_params cparams = whisper_context_default_params();

    cparams.use_gpu    = params.use_gpu;
    cparams.flash_attn = params.flash_attn;

    struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams);
    if (!ctx_wsp) {
        fprintf(stderr, "No whisper.cpp model specified. Please provide using -mw <modelfile>\n");
        return 1;
    }

    // llama init

    llama_backend_init();

    auto lmparams = llama_model_default_params();
    if (!params.use_gpu) {
        lmparams.n_gpu_layers = 0;
    } else {
        lmparams.n_gpu_layers = params.n_gpu_layers;
    }

    struct llama_model * model_llama = llama_load_model_from_file(params.model_llama.c_str(), lmparams);
    if (!model_llama) {
        fprintf(stderr, "No llama.cpp model specified. Please provide using -ml <modelfile>\n");
        return 1;
    }

    llama_context_params lcparams = llama_context_default_params();

    // tune these to your liking
    lcparams.n_ctx      = 2048;
    lcparams.seed       = 1;
    lcparams.n_threads  = params.n_threads;
    lcparams.flash_attn = params.flash_attn;

    struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams);
    if (!ctx_llama) {
        // every call below dereferences the context, so stop here instead of crashing later
        fprintf(stderr, "%s: error: failed to create the llama context\n", __func__);
        return 1;
    }

    // print some info about the processing
    {
        fprintf(stderr, "\n");

        if (!whisper_is_multilingual(ctx_wsp)) {
            if (params.language != "en" || params.translate) {
                params.language = "en";
                params.translate = false;
                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
            }
        }
        fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
                __func__,
                params.n_threads,
                params.language.c_str(),
                params.translate ? "translate" : "transcribe",
                params.no_timestamps ? 0 : 1);

        fprintf(stderr, "\n");
    }

    // init audio

    audio_async audio(30*1000);
    if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
        return 1;
    }

    audio.resume();

    bool is_running  = true;
    bool force_speak = false;

    float prob0 = 0.0f;

    const std::string chat_symb = ":";

    std::vector<float> pcmf32_cur;

    const std::string prompt_whisper = ::replace(k_prompt_whisper, "{1}", params.bot_name);

    // construct the initial prompt for LLaMA inference
    std::string prompt_llama = params.prompt.empty() ? k_prompt_llama : params.prompt;

    // need to have leading ' '
    prompt_llama.insert(0, 1, ' ');

    prompt_llama = ::replace(prompt_llama, "{0}", params.person);
    prompt_llama = ::replace(prompt_llama, "{1}", params.bot_name);

    // formats the current local time with the given strftime() format
    // returns "" if the local time is unavailable or the formatting fails
    const auto format_now = [](const char * fmt) {
        const time_t t = time(0);
        const struct tm * now = localtime(&t);
        char buf[128];
        if (now == nullptr || strftime(buf, sizeof(buf), fmt, now) == 0) {
            return std::string();
        }
        return std::string(buf);
    };

    prompt_llama = ::replace(prompt_llama, "{2}", format_now("%H:%M")); // time string
    prompt_llama = ::replace(prompt_llama, "{3}", format_now("%Y"));    // year string

    prompt_llama = ::replace(prompt_llama, "{4}", chat_symb);

    llama_batch batch = llama_batch_init(llama_n_ctx(ctx_llama), 0, 1);

    // init session
    std::string path_session = params.path_session;
    std::vector<llama_token> session_tokens;
    auto embd_inp = ::llama_tokenize(ctx_llama, prompt_llama, true);

    if (!path_session.empty()) {
        fprintf(stderr, "%s: attempting to load saved session from %s\n", __func__, path_session.c_str());

        // fopen to check for existing session
        FILE * fp = std::fopen(path_session.c_str(), "rb");
        if (fp != NULL) {
            std::fclose(fp);

            session_tokens.resize(llama_n_ctx(ctx_llama));
            size_t n_token_count_out = 0;
            // pass size(), not capacity(): elements beyond size() are not valid storage and
            // would be overwritten by the resize() below
            if (!llama_load_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size(), &n_token_count_out)) {
                fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
                return 1;
            }
            session_tokens.resize(n_token_count_out);

            // the session may hold more tokens than the prompt: never write past embd_inp
            const size_t n_copy = std::min(session_tokens.size(), embd_inp.size());
            for (size_t i = 0; i < n_copy; i++) {
                embd_inp[i] = session_tokens[i];
            }

            fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
        } else {
            fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
        }
    }

    // evaluate the initial prompt

    printf("\n");
    printf("%s : initializing - please wait ...\n", __func__);

    // the batch was allocated for llama_n_ctx() tokens: a longer prompt would overflow it
    if (embd_inp.size() > (size_t) llama_n_ctx(ctx_llama)) {
        fprintf(stderr, "%s: error: prompt is too long (%zu tokens, max %d)\n",
                __func__, embd_inp.size(), (int) llama_n_ctx(ctx_llama));
        return 1;
    }

    // prepare batch
    {
        batch.n_tokens = embd_inp.size();

        for (int i = 0; i < batch.n_tokens; i++) {
            batch.token[i]     = embd_inp[i];
            batch.pos[i]       = i;
            batch.n_seq_id[i]  = 1;
            batch.seq_id[i][0] = 0;
            batch.logits[i]    = i == batch.n_tokens - 1; // logits are only needed for the last token
        }
    }

    if (llama_decode(ctx_llama, batch)) {
        fprintf(stderr, "%s : failed to decode\n", __func__);
        return 1;
    }

    if (params.verbose_prompt) {
        fprintf(stdout, "\n");
        fprintf(stdout, "%s", prompt_llama.c_str());
        fflush(stdout);
    }

    // debug message about similarity of saved session, if applicable
    size_t n_matching_session_tokens = 0;
    if (session_tokens.size()) {
        for (llama_token id : session_tokens) {
            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
                break;
            }
            n_matching_session_tokens++;
        }
        if (n_matching_session_tokens >= embd_inp.size()) {
            fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
            fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
                __func__, n_matching_session_tokens, embd_inp.size());
        } else {
            fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
                __func__, n_matching_session_tokens, embd_inp.size());
        }
    }

    // HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
    // if we loaded a session with at least 75% similarity. It's currently just used to speed up the
    // initial prompt so it doesn't need to be an exact match.
    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);

    printf("%s : done! start speaking in the microphone\n", __func__);

    // show wake command if enabled
    const std::string wake_cmd = params.wake_cmd;
    const int wake_cmd_length = get_words(wake_cmd).size();
    const bool use_wake_cmd = wake_cmd_length > 0;

    if (use_wake_cmd) {
        printf("%s : the wake-up command is: '%s%s%s'\n", __func__, "\033[1m", wake_cmd.c_str(), "\033[0m");
    }

    printf("\n");
    printf("%s%s", params.person.c_str(), chat_symb.c_str());
    fflush(stdout);

    // clear audio buffer
    audio.clear();

    // text inference variables
    const int voice_id = 2;
    const int n_keep   = embd_inp.size();
    const int n_ctx    = llama_n_ctx(ctx_llama);

    int n_past = n_keep;
    int n_prev = 64; // TODO arg
    int n_session_consumed = !path_session.empty() && session_tokens.size() > 0 ? session_tokens.size() : 0;

    std::vector<llama_token> embd;

    // reverse prompts for detecting when it's time to stop speaking
    std::vector<std::string> antiprompts = {
        params.person + chat_symb,
    };

    // main loop
    while (is_running) {
        // handle Ctrl + C
        is_running = sdl_poll_events();

        if (!is_running) {
            break;
        }

        // delay
        std::this_thread::sleep_for(std::chrono::milliseconds(100));

        int64_t t_ms = 0;

        {
            audio.get(2000, pcmf32_cur);

            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
                //fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);

                audio.get(params.voice_ms, pcmf32_cur);

                std::string all_heard;

                if (!force_speak) {
                    all_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prompt_whisper, prob0, t_ms));
                }

                const auto words = get_words(all_heard);

                // the first wake_cmd_length words are compared against the wake-up command,
                // the remaining words form the actual request
                std::string wake_cmd_heard;
                std::string text_heard;

                for (int i = 0; i < (int) words.size(); ++i) {
                    if (i < wake_cmd_length) {
                        wake_cmd_heard += words[i] + " ";
                    } else {
                        text_heard += words[i] + " ";
                    }
                }

                // check if audio starts with the wake-up command if enabled
                if (use_wake_cmd) {
                    const float sim = similarity(wake_cmd_heard, wake_cmd);

                    if ((sim < 0.7f) || (text_heard.empty())) {
                        audio.clear();
                        continue;
                    }
                }

                // optionally give audio feedback that the current text is being processed
                if (!params.heard_ok.empty()) {
                    speak_with_file(params.speak, params.heard_ok, params.speak_file, voice_id);
                }

                // remove text between square brackets using regex
                {
                    std::regex re("\\[.*?\\]");
                    text_heard = std::regex_replace(text_heard, re, "");
                }

                // remove text between parentheses using regex
                {
                    std::regex re("\\(.*?\\)");
                    text_heard = std::regex_replace(text_heard, re, "");
                }

                // remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
                text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");

                // take first line
                text_heard = text_heard.substr(0, text_heard.find_first_of('\n'));

                // remove leading and trailing whitespace
                text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
                text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");

                const std::vector<llama_token> tokens = llama_tokenize(ctx_llama, text_heard.c_str(), false);

                if (text_heard.empty() || tokens.empty() || force_speak) {
                    //fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
                    audio.clear();

                    continue;
                }

                force_speak = false;

                text_heard.insert(0, 1, ' ');
                text_heard += "\n" + params.bot_name + chat_symb;
                fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
                fflush(stdout);

                embd = ::llama_tokenize(ctx_llama, text_heard, false);

                // Append the new input tokens to the session_tokens vector
                if (!path_session.empty()) {
                    session_tokens.insert(session_tokens.end(), tokens.begin(), tokens.end());
                }

                // text inference
                bool done = false;
                std::string text_to_speak;
                while (true) {
                    // predict
                    if (embd.size() > 0) {
                        if (n_past + (int) embd.size() > n_ctx) {
                            n_past = n_keep;

                            // re-insert the last n_prev tokens of the history at the start of embd
                            // (clamped so that a short history cannot move the iterator before begin())
                            const int n_reuse = std::min(n_prev, (int) embd_inp.size());
                            embd.insert(embd.begin(), embd_inp.end() - n_reuse, embd_inp.end());
                            // stop saving session if we run out of context
                            path_session = "";
                            //printf("\n---\n");
                            //printf("resetting: '");
                            //for (int i = 0; i < (int) embd.size(); i++) {
                            //    printf("%s", llama_token_to_piece(ctx_llama, embd[i]));
                            //}
                            //printf("'\n");
                            //printf("\n---\n");
                        }

                        // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
                        // REVIEW
                        if (n_session_consumed < (int) session_tokens.size()) {
                            size_t i = 0;
                            for ( ; i < embd.size(); i++) {
                                if (embd[i] != session_tokens[n_session_consumed]) {
                                    session_tokens.resize(n_session_consumed);
                                    break;
                                }

                                n_past++;
                                n_session_consumed++;

                                if (n_session_consumed >= (int) session_tokens.size()) {
                                    i++;
                                    break;
                                }
                            }
                            if (i > 0) {
                                embd.erase(embd.begin(), embd.begin() + i);
                            }
                        }

                        if (embd.size() > 0 && !path_session.empty()) {
                            session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
                            n_session_consumed = session_tokens.size();
                        }

                        // prepare batch
                        {
                            batch.n_tokens = embd.size();

                            for (int i = 0; i < batch.n_tokens; i++) {
                                batch.token[i]     = embd[i];
                                batch.pos[i]       = n_past + i;
                                batch.n_seq_id[i]  = 1;
                                batch.seq_id[i][0] = 0;
                                batch.logits[i]    = i == batch.n_tokens - 1;
                            }
                        }

                        if (llama_decode(ctx_llama, batch)) {
                            fprintf(stderr, "%s : failed to decode\n", __func__);
                            return 1;
                        }
                    }


                    embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
                    n_past += embd.size();

                    embd.clear();

                    if (done) break;

                    {
                        // out of user input, sample next token
                        const float top_k          = 5;
                        const float top_p          = 0.80f;
                        const float temp           = 0.30f;
                        const float repeat_penalty = 1.1764f;

                        const int repeat_last_n    = 256;

                        if (!path_session.empty() && need_to_save_session) {
                            need_to_save_session = false;
                            llama_save_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
                        }

                        llama_token id = 0;

                        {
                            auto logits = llama_get_logits(ctx_llama);
                            auto n_vocab = llama_n_vocab(model_llama);

                            logits[llama_token_eos(model_llama)] = 0;

                            std::vector<llama_token_data> candidates;
                            candidates.reserve(n_vocab);
                            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
                                candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
                            }

                            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

                            // apply repeat penalty
                            const float nl_logit = logits[llama_token_nl(model_llama)];

                            // same window as before (starting at n_past - repeat_last_n), but clamped to
                            // the tokens that actually exist in embd_inp so the sampler never reads
                            // beyond the end of the history
                            const int n_hist    = (int) embd_inp.size();
                            const int pen_first = std::min(std::max(0, n_past - repeat_last_n), n_hist);
                            const int pen_count = std::min(repeat_last_n, n_hist - pen_first);

                            llama_sample_repetition_penalties(ctx_llama, &candidates_p,
                                    embd_inp.data() + pen_first,
                                    pen_count, repeat_penalty, 0.0, 0.0f);

                            logits[llama_token_nl(model_llama)] = nl_logit;

                            if (temp <= 0) {
                                // Greedy sampling
                                id = llama_sample_token_greedy(ctx_llama, &candidates_p);
                            } else {
                                // Temperature sampling
                                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
                                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
                                llama_sample_temp (ctx_llama, &candidates_p, temp);
                                id = llama_sample_token(ctx_llama, &candidates_p);
                            }
                        }

                        if (id != llama_token_eos(model_llama)) {
                            // add it to the context
                            embd.push_back(id);

                            text_to_speak += llama_token_to_piece(ctx_llama, id);

                            printf("%s", llama_token_to_piece(ctx_llama, id).c_str());
                            fflush(stdout);
                        }
                    }

                    {
                        // rebuild the tail of the output (up to 16 history tokens + the new token)
                        // and check whether it ends with one of the antiprompts
                        std::string last_output;
                        for (int i = std::max(0, (int) embd_inp.size() - 16); i < (int) embd_inp.size(); i++) {
                            last_output += llama_token_to_piece(ctx_llama, embd_inp[i]);
                        }
                        // embd is empty when EOS was sampled above: there is no new token to append
                        if (!embd.empty()) {
                            last_output += llama_token_to_piece(ctx_llama, embd[0]);
                        }

                        for (std::string & antiprompt : antiprompts) {
                            if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
                                done = true;
                                text_to_speak = ::replace(text_to_speak, antiprompt, "");
                                fflush(stdout);
                                need_to_save_session = true;
                                break;
                            }
                        }
                    }

                    is_running = sdl_poll_events();

                    if (!is_running) {
                        break;
                    }
                }

                speak_with_file(params.speak, text_to_speak, params.speak_file, voice_id);

                audio.clear();
            }
        }
    }

    audio.pause();

    whisper_print_timings(ctx_wsp);
    whisper_free(ctx_wsp);

    llama_print_timings(ctx_llama);

    // release the llama resources in reverse order of creation
    llama_batch_free(batch);
    llama_free(ctx_llama);
    llama_free_model(model_llama);
    llama_backend_free();

    return 0;
}
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								// Talk with AI
 								//
-												Revert "ggml : do not use _GNU_SOURCE gratuitously (#1027)"

This reverts commit 3f7a03ebe3b65be0792849e300a122f6a050e3f8.

											
										
										
											2023-07-02 18:53:52 +00:00
+								#include "common-sdl.h"
-												build : do not use _GNU_SOURCE gratuitously (#1129)

* Do not use _GNU_SOURCE gratuitously.

What is needed to build whisper.cpp and examples is availability of
stuff defined in The Open Group Base Specifications Issue 6
(https://pubs.opengroup.org/onlinepubs/009695399/) known also as
Single Unix Specification v3 (SUSv3) or POSIX.1-2001 + XSI extensions,
plus some stuff from BSD that is not specified in POSIX.1.

Well, that was true until NUMA support was added recently in ggml,
so enable GNU libc extensions for Linux builds to cover that.

There is no need to penalize musl libc which simply follows standards.

Not having feature test macros in source code gives greater flexibility
to those wanting to reuse it in 3rd party app, as they can build it with
minimal FTM (_XOPEN_SOURCE=600) or other FTM depending on their needs.

It builds without issues in Alpine (musl libc), Ubuntu (glibc), MSYS2.

* examples : include SDL headers before other headers

Avoid macOS build error when _DARWIN_C_SOURCE is not defined, brought by
SDL2 relying on Darwin extension memset_pattern4/8/16 (from string.h).

* make : enable BSD extensions for DragonFlyBSD to expose RLIMIT_MEMLOCK

* make : use BSD-specific FTMs to enable alloca on BSDs

* make : fix OpenBSD build by exposing newer POSIX definitions

* cmake : follow recent FTM improvements from Makefile
											
										
										
											2023-09-07 09:36:14 +00:00
+								#include "common.h"
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								#include "whisper.h"
 								#include "llama.h"
 								#include <cassert>
 								#include <cstdio>
 								#include <fstream>
 								#include <regex>
 								#include <string>
 								#include <thread>
 								#include <vector>
 								#include <regex>
-												talk-llama : optional wake-up command and audio confirmation (#1765)

* talk-llama: add optional wake-word detection from command

* talk-llama: add optional audio confirmation before generating answer

* talk-llama: fix small formatting issue in output

* talk-llama.cpp: fix Windows build
											
										
										
											2024-01-16 13:52:01 +00:00
+								#include <sstream>
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
 								std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								    auto * model = llama_get_model(ctx);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								    // upper limit for the number of tokens
 								    int n_tokens = text.length() + add_bos;
 								    std::vector<llama_token> result(n_tokens);
 								    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, false);
 								    if (n_tokens < 0) {
 								        result.resize(-n_tokens);
 								        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, false);
 								        GGML_ASSERT(check == -n_tokens);
 								    } else {
 								        result.resize(n_tokens);
 								    }
 								    return result;
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								}
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
 								    std::vector<char> result(8, 0);
-												talk-llama : sync llama.cpp

											
										
										
											2024-05-12 17:12:46 +00:00
+								    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								    if (n_tokens < 0) {
 								        result.resize(-n_tokens);
-												talk-llama : sync llama.cpp

											
										
										
											2024-05-12 17:12:46 +00:00
+								        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								        GGML_ASSERT(check == -n_tokens);
 								    } else {
 								        result.resize(n_tokens);
 								    }
 								    return std::string(result.data(), result.size());
 								}
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								// command-line parameters
 								struct whisper_params {
 								    int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
 								    int32_t voice_ms   = 10000;
 								    int32_t capture_id = -1;
 								    int32_t max_tokens = 32;
 								    int32_t audio_ctx  = 0;
-												talk-llama : enable GPU by default

											
										
										
											2023-11-15 19:32:25 +00:00
+								    int32_t n_gpu_layers = 999;
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								    float vad_thold  = 0.6f;
 								    float freq_thold = 100.0f;
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								    bool translate      = false;
 								    bool print_special  = false;
 								    bool print_energy   = false;
 								    bool no_timestamps  = true;
-												talk-llama : add alpaca support (#668)


											
										
										
											2023-03-29 20:01:14 +00:00
+								    bool verbose_prompt = false;
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : update java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
+								    bool use_gpu        = true;
-												whisper : use flash attention (#2152)

* whisper : use flash attention in the encoder

* whisper : add kv_pad

* whisper : remove extra backend instance (huh?)

* whisper : use FA for cross-attention

* whisper : use FA for self-attention

* whisper : simplify encoder FA

* whisper : add flash_attn runtime parameter

* scripts : add bench log

* scripts : add M1 Pro bench log
											
										
										
											2024-05-15 06:38:19 +00:00
+								    bool flash_attn     = false;
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
 								    std::string person      = "Georgi";
-												talk-llama : add optional CLI arg to set the bot name (#1764)


											
										
										
											2024-01-13 18:51:35 +00:00
+								    std::string bot_name    = "LLaMA";
-												talk-llama : optional wake-up command and audio confirmation (#1765)

* talk-llama: add optional wake-word detection from command

* talk-llama: add optional audio confirmation before generating answer

* talk-llama: fix small formatting issue in output

* talk-llama.cpp: fix Windows build
											
										
										
											2024-01-16 13:52:01 +00:00
+								    std::string wake_cmd    = "";
 								    std::string heard_ok    = "";
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								    std::string language    = "en";
 								    std::string model_wsp   = "models/ggml-base.en.bin";
 								    std::string model_llama = "models/ggml-llama-7B.bin";
-												`speak` scripts for Windows

											
										
										
											2023-06-01 12:45:00 +00:00
+								    std::string speak       = "./examples/talk-llama/speak";
-												talk, talk-llama : pass text_to_speak as a file (#1865)

* talk-llama: pass file instead of arg

it is too hard to quote text in a portable way

* talk-llama: pass heard_ok as a file

* talk-llama: let eleven-labs.py accept options

Options: -v voice, -s savefile, -p (--play)

* talk-llama: check installed commands in "speak"

Pass "-q" to eleven-labs.py to skip checking whether elevenlabs is installed

* talk-llama: pass voice_id again

in order to sync talk with talk-llama

* talk: sync with talk-llama

Passing text_to_speak as a file is safer and more portable
cf. https://stackoverflow.com/a/59036879/45375

* talk and talk-llama: get all installed voices in speak.ps1

* talk and talk-llama: get voices from api

* talk and talk-llama: add more options to eleven-labs.py

and remove DEFAULT_VOICE because it is deprecated (https://www.reddit.com/r/ElevenLabs/comments/1830abt/what_happened_to_bella/)

```
usage: eleven-labs.py [-q] [-l] [-h] [-n NAME | -v NUMBER] [-f KEY=VAL] [-s FILE | -p] [TEXTFILE]

options:
  -q, --quick           skip checking the required library

action:
  TEXTFILE              read the text file (default: stdin)
  -l, --list            show the list of voices and exit
  -h, --help            show this help and exit

voice selection:
  -n NAME, --name NAME  get a voice object by name (default: Arnold)
  -v NUMBER, --voice NUMBER
                        get a voice object by number (see --list)
  -f KEY=VAL, --filter KEY=VAL
                        filter voices by labels (default: "use case=narration")
                        this option can be used multiple times
                        filtering will be disabled if the first -f has no "=" (e.g. -f "any")

output:
  -s FILE, --save FILE  save the TTS to a file (default: audio.mp3)
  -p, --play            play the TTS with ffplay
```

* examples: add speak_with_file()

as suggested in the review

* talk and talk-llama: ignore to_speak.txt
											
										
										
											2024-02-24 07:24:47 +00:00
+								    std::string speak_file  = "./examples/talk-llama/to_speak.txt";
-												talk-llama : add alpaca support (#668)


											
										
										
											2023-03-29 20:01:14 +00:00
+								    std::string prompt      = "";
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								    std::string fname_out;
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
+								    std::string path_session = "";       // path to file for saving/loading model eval state
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								};
 								void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
 								bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
 								    for (int i = 1; i < argc; i++) {
 								        std::string arg = argv[i];
 								        if (arg == "-h" || arg == "--help") {
 								            whisper_print_usage(argc, argv, params);
 								            exit(0);
 								        }
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : update java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
+								        else if (arg == "-t"   || arg == "--threads")        { params.n_threads      = std::stoi(argv[++i]); }
 								        else if (arg == "-vms" || arg == "--voice-ms")       { params.voice_ms       = std::stoi(argv[++i]); }
 								        else if (arg == "-c"   || arg == "--capture")        { params.capture_id     = std::stoi(argv[++i]); }
 								        else if (arg == "-mt"  || arg == "--max-tokens")     { params.max_tokens     = std::stoi(argv[++i]); }
 								        else if (arg == "-ac"  || arg == "--audio-ctx")      { params.audio_ctx      = std::stoi(argv[++i]); }
-												talk-llama : add n_gpu_layers parameter (#1475)


											
										
										
											2023-11-13 08:04:16 +00:00
+								        else if (arg == "-ngl" || arg == "--n-gpu-layers")   { params.n_gpu_layers   = std::stoi(argv[++i]); }
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : update java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
+								        else if (arg == "-vth" || arg == "--vad-thold")      { params.vad_thold      = std::stof(argv[++i]); }
 								        else if (arg == "-fth" || arg == "--freq-thold")     { params.freq_thold     = std::stof(argv[++i]); }
 								        else if (arg == "-tr"  || arg == "--translate")      { params.translate      = true; }
 								        else if (arg == "-ps"  || arg == "--print-special")  { params.print_special  = true; }
 								        else if (arg == "-pe"  || arg == "--print-energy")   { params.print_energy   = true; }
 								        else if (arg == "-vp"  || arg == "--verbose-prompt") { params.verbose_prompt = true; }
 								        else if (arg == "-ng"  || arg == "--no-gpu")         { params.use_gpu        = false; }
-												whisper : use flash attention (#2152)

* whisper : use flash attention in the encoder

* whisper : add kv_pad

* whisper : remove extra backend instance (huh?)

* whisper : use FA for cross-attention

* whisper : use FA for self-attention

* whisper : simplify encoder FA

* whisper : add flash_attn runtime parameter

* scripts : add bench log

* scripts : add M1 Pro bench log
											
										
										
											2024-05-15 06:38:19 +00:00
+								        else if (arg == "-fa"  || arg == "--flash-attn")     { params.flash_attn     = true; }
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : update java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
+								        else if (arg == "-p"   || arg == "--person")         { params.person         = argv[++i]; }
-												talk-llama : add optional CLI arg to set the bot name (#1764)


											
										
										
											2024-01-13 18:51:35 +00:00
+								        else if (arg == "-bn"   || arg == "--bot-name")      { params.bot_name       = argv[++i]; }
 								        else if (arg == "--session")                         { params.path_session   = argv[++i]; }
-												talk-llama : optional wake-up command and audio confirmation (#1765)

* talk-llama: add optional wake-word detection from command

* talk-llama: add optional audio confirmation before generating answer

* talk-llama: fix small formatting issue in output

* talk-llama.cpp: fix Windows build
											
										
										
											2024-01-16 13:52:01 +00:00
+								        else if (arg == "-w"   || arg == "--wake-command")   { params.wake_cmd       = argv[++i]; }
 								        else if (arg == "-ho"  || arg == "--heard-ok")       { params.heard_ok       = argv[++i]; }
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : update java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
+								        else if (arg == "-l"   || arg == "--language")       { params.language       = argv[++i]; }
 								        else if (arg == "-mw"  || arg == "--model-whisper")  { params.model_wsp      = argv[++i]; }
 								        else if (arg == "-ml"  || arg == "--model-llama")    { params.model_llama    = argv[++i]; }
 								        else if (arg == "-s"   || arg == "--speak")          { params.speak          = argv[++i]; }
-												talk, talk-llama : pass text_to_speak as a file (#1865)

* talk-llama: pass file instead of arg

it is too hard to quote text in a portable way

* talk-llama: pass heard_ok as a file

* talk-llama: let eleven-labs.py accept options

Options: -v voice, -s savefile, -p (--play)

* talk-llama: check installed commands in "speak"

Pass "-q" to eleven-labs.py to skip checking whether elevenlabs is installed

* talk-llama: pass voice_id again

in order to sync talk with talk-llama

* talk: sync with talk-llama

Passing text_to_speak as a file is safer and more portable
cf. https://stackoverflow.com/a/59036879/45375

* talk and talk-llama: get all installed voices in speak.ps1

* talk and talk-llama: get voices from api

* talk and talk-llama: add more options to eleven-labs.py

and remove DEFAULT_VOICE because it is deprecated (https://www.reddit.com/r/ElevenLabs/comments/1830abt/what_happened_to_bella/)

```
usage: eleven-labs.py [-q] [-l] [-h] [-n NAME | -v NUMBER] [-f KEY=VAL] [-s FILE | -p] [TEXTFILE]

options:
  -q, --quick           skip checking the required library

action:
  TEXTFILE              read the text file (default: stdin)
  -l, --list            show the list of voices and exit
  -h, --help            show this help and exit

voice selection:
  -n NAME, --name NAME  get a voice object by name (default: Arnold)
  -v NUMBER, --voice NUMBER
                        get a voice object by number (see --list)
  -f KEY=VAL, --filter KEY=VAL
                        filter voices by labels (default: "use case=narration")
                        this option can be used multiple times
                        filtering will be disabled if the first -f has no "=" (e.g. -f "any")

output:
  -s FILE, --save FILE  save the TTS to a file (default: audio.mp3)
  -p, --play            play the TTS with ffplay
```

* examples: add speak_with_file()

as suggested in the review

* talk and talk-llama: ignore to_speak.txt
											
										
										
											2024-02-24 07:24:47 +00:00
+								        else if (arg == "-sf"  || arg == "--speak-file")     { params.speak_file     = argv[++i]; }
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : update java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
+								        else if (arg == "--prompt-file")                     {
-												talk-llama : add alpaca support (#668)


											
										
										
											2023-03-29 20:01:14 +00:00
+								            std::ifstream file(argv[++i]);
 								            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
 								            if (params.prompt.back() == '\n') {
 								                params.prompt.pop_back();
 								            }
 								        }
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								        else if (arg == "-f"   || arg == "--file")          { params.fname_out     = argv[++i]; }
 								        else {
 								            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
 								            whisper_print_usage(argc, argv, params);
 								            exit(0);
 								        }
 								    }
 								    return true;
 								}
 								void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
 								    fprintf(stderr, "\n");
 								    fprintf(stderr, "usage: %s [options]\n", argv[0]);
 								    fprintf(stderr, "\n");
 								    fprintf(stderr, "options:\n");
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : update java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
+								    fprintf(stderr, "  -h,       --help           [default] show this help message and exit\n");
 								    fprintf(stderr, "  -t N,     --threads N      [%-7d] number of threads to use during computation\n", params.n_threads);
 								    fprintf(stderr, "  -vms N,   --voice-ms N     [%-7d] voice duration in milliseconds\n",              params.voice_ms);
 								    fprintf(stderr, "  -c ID,    --capture ID     [%-7d] capture device ID\n",                           params.capture_id);
 								    fprintf(stderr, "  -mt N,    --max-tokens N   [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
 								    fprintf(stderr, "  -ac N,    --audio-ctx N    [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
-												talk-llama : enable GPU by default

											
										
										
											2023-11-15 19:32:25 +00:00
+								    fprintf(stderr, "  -ngl N,   --n-gpu-layers N [%-7d] number of layers to store in VRAM\n",           params.n_gpu_layers);
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : updata java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
+								    fprintf(stderr, "  -vth N,   --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
 								    fprintf(stderr, "  -fth N,   --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
 								    fprintf(stderr, "  -tr,      --translate      [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
 								    fprintf(stderr, "  -ps,      --print-special  [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
 								    fprintf(stderr, "  -pe,      --print-energy   [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
 								    fprintf(stderr, "  -vp,      --verbose-prompt [%-7s] print prompt at start\n",                       params.verbose_prompt ? "true" : "false");
 								    fprintf(stderr, "  -ng,      --no-gpu         [%-7s] disable GPU\n",                                 params.use_gpu ? "false" : "true");
-												whisper : use flash attention (#2152)

* whisper : use flash attention in the encoder

* whisper : add kv_pad

* whisper : remove extra backend instance (huh?)

* whisper : use FA for cross-attention

* whisper : use FA for self-attention

* whisper : simplify encoder FA

* whisper : add flash_attn runtime parameter

* scripts : add bench log

* scripts : add M1 Pro bench log
											
										
										
											2024-05-15 06:38:19 +00:00
+								    fprintf(stderr, "  -fa,      --flash-attn     [%-7s] flash attention\n",                             params.flash_attn ? "true" : "false");
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : updata java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
+								    fprintf(stderr, "  -p NAME,  --person NAME    [%-7s] person name (for prompt selection)\n",          params.person.c_str());
-												talk-llama : add optional CLI arg to set the bot name (#1764)


											
										
										
											2024-01-13 18:51:35 +00:00
+								    fprintf(stderr, "  -bn NAME, --bot-name NAME  [%-7s] bot name (to display)\n",                       params.bot_name.c_str());
-												talk-llama : optional wake-up command and audio confirmation (#1765)

* talk-llama: add optional wake-word detection from command

* talk-llama: add optional audio confirmation before generating answer

* talk-llama: fix small formatting issue in output

* talk-llama.cpp: fix Windows build
											
										
										
											2024-01-16 13:52:01 +00:00
+								    fprintf(stderr, "  -w TEXT,  --wake-command T [%-7s] wake-up command to listen for\n",               params.wake_cmd.c_str());
 								    fprintf(stderr, "  -ho TEXT, --heard-ok TEXT  [%-7s] said by TTS before generating reply\n",         params.heard_ok.c_str());
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : updata java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
+								    fprintf(stderr, "  -l LANG,  --language LANG  [%-7s] spoken language\n",                             params.language.c_str());
 								    fprintf(stderr, "  -mw FILE, --model-whisper  [%-7s] whisper model file\n",                          params.model_wsp.c_str());
 								    fprintf(stderr, "  -ml FILE, --model-llama    [%-7s] llama model file\n",                            params.model_llama.c_str());
 								    fprintf(stderr, "  -s FILE,  --speak TEXT     [%-7s] command for TTS\n",                             params.speak.c_str());
-												talk, talk-llama : pass text_to_speak as a file (#1865)

* talk-llama: pass file instead of arg

it is too hard to quote text in a portable way

* talk-llama: pass heard_ok as a file

* talk-llama: let eleven-labs.py accept options

Options: -v voice, -s savefile, -p (--play)

* talk-llama: check installed commands in "speak"

Pass "-q" to eleven-labs.py to skip checking whether elevenlabs is installed

* talk-llama: pass voice_id again

in order to sync talk with talk-llama

* talk: sync with talk-llama

Passing text_to_speak as a file is safer and more portable
cf. https://stackoverflow.com/a/59036879/45375

* talk and talk-llama: get all installed voices in speak.ps1

* talk and talk-llama: get voices from api

* talk and talk-llama: add more options to eleven-labs.py

and remove DEFAULT_VOICE because it is deprecated (https://www.reddit.com/r/ElevenLabs/comments/1830abt/what_happened_to_bella/)

```
usage: eleven-labs.py [-q] [-l] [-h] [-n NAME | -v NUMBER] [-f KEY=VAL] [-s FILE | -p] [TEXTFILE]

options:
  -q, --quick           skip checking the required library

action:
  TEXTFILE              read the text file (default: stdin)
  -l, --list            show the list of voices and exit
  -h, --help            show this help and exit

voice selection:
  -n NAME, --name NAME  get a voice object by name (default: Arnold)
  -v NUMBER, --voice NUMBER
                        get a voice object by number (see --list)
  -f KEY=VAL, --filter KEY=VAL
                        filter voices by labels (default: "use case=narration")
                        this option can be used multiple times
                        filtering will be disabled if the first -f has no "=" (e.g. -f "any")

output:
  -s FILE, --save FILE  save the TTS to a file (default: audio.mp3)
  -p, --play            play the TTS with ffplay
```

* examples: add speak_with_file()

as suggested in the review

* talk and talk-llama: ignore to_speak.txt
											
										
										
											2024-02-24 07:24:47 +00:00
+								    fprintf(stderr, "  -sf FILE, --speak-file     [%-7s] file to pass to TTS\n",                         params.speak_file.c_str());
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : updata java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
+								    fprintf(stderr, "  --prompt-file FNAME        [%-7s] file with custom prompt to start dialog\n",     "");
 								    fprintf(stderr, "  --session FNAME                   file to cache model state in (may be large!) (default: none)\n");
 								    fprintf(stderr, "  -f FNAME, --file FNAME     [%-7s] text output file name\n",                       params.fname_out.c_str());
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								    fprintf(stderr, "\n");
 								}
 								// Run whisper on an audio buffer and return the recognized text.
 								//
 								//   ctx         - whisper context used for tokenization and inference
 								//   params      - CLI parameters copied into the whisper_full_params below
 								//   pcmf32      - audio samples handed to whisper_full()
 								//   prompt_text - text that is tokenized and supplied as the initial prompt
 								//   prob        - out: mean of token.p over every token of every segment
 								//                 (stays 0 when there are no tokens)
 								//   t_ms        - out: wall-clock duration of this call in milliseconds
 								//                 (stays 0 when whisper_full() fails)
 								//
 								// Returns the concatenation of all segment texts, or "" if whisper_full() fails.
 								std::string transcribe(
 								        whisper_context * ctx,
 								        const whisper_params & params,
 								        const std::vector<float> & pcmf32,
 								        const std::string prompt_text,
 								        float & prob,
 								        int64_t & t_ms) {
 								    const auto t_start = std::chrono::high_resolution_clock::now();
 								    prob = 0.0f;
 								    t_ms = 0;
 								    // whisper_tokenize() signals failure with a negative count. Passing that
 								    // straight to resize() would convert it to a huge size_t, so check it first.
 								    std::vector<whisper_token> prompt_tokens(1024);
 								    int n_prompt = whisper_tokenize(ctx, prompt_text.c_str(), prompt_tokens.data(), static_cast<int>(prompt_tokens.size()));
 								    if (n_prompt < 0) {
 								        // whisper.h documents the magnitude as the token count that was needed;
 								        // retry once with a buffer of that size
 								        prompt_tokens.resize(static_cast<size_t>(-n_prompt));
 								        n_prompt = whisper_tokenize(ctx, prompt_text.c_str(), prompt_tokens.data(), static_cast<int>(prompt_tokens.size()));
 								    }
 								    // if tokenization still fails, continue without a prompt instead of crashing
 								    prompt_tokens.resize(n_prompt > 0 ? static_cast<size_t>(n_prompt) : 0);
 								    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
 								    wparams.print_progress   = false;
 								    wparams.print_special    = params.print_special;
 								    wparams.print_realtime   = false;
 								    wparams.print_timestamps = !params.no_timestamps;
 								    wparams.translate        = params.translate;
 								    wparams.no_context       = true;
 								    wparams.single_segment   = true;
 								    wparams.max_tokens       = params.max_tokens;
 								    wparams.language         = params.language.c_str();
 								    wparams.n_threads        = params.n_threads;
 								    wparams.prompt_tokens    = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
 								    wparams.prompt_n_tokens  = prompt_tokens.empty() ? 0       : static_cast<int>(prompt_tokens.size());
 								    wparams.audio_ctx        = params.audio_ctx;
 								    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
 								        return "";
 								    }
 								    // accumulate the text and the per-token probabilities of all segments
 								    int prob_n = 0;
 								    std::string result;
 								    const int n_segments = whisper_full_n_segments(ctx);
 								    for (int i = 0; i < n_segments; ++i) {
 								        const char * text = whisper_full_get_segment_text(ctx, i);
 								        result += text;
 								        const int n_tokens = whisper_full_n_tokens(ctx, i);
 								        for (int j = 0; j < n_tokens; ++j) {
 								            const auto token = whisper_full_get_token_data(ctx, i, j);
 								            prob += token.p;
 								            ++prob_n;
 								        }
 								    }
 								    // turn the sum into a mean; guard against division by zero
 								    if (prob_n > 0) {
 								        prob /= prob_n;
 								    }
 								    const auto t_end = std::chrono::high_resolution_clock::now();
 								    t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
 								    return result;
 								}
-												talk-llama : optional wake-up command and audio confirmation (#1765)

* talk-llama: add optional wake-word detection from command

* talk-llama: add optional audio confirmation before generating answer

* talk-llama: fix small formatting issue in output

* talk-llama.cpp: fix Windows build
											
										
										
											2024-01-16 13:52:01 +00:00
+								std::vector<std::string> get_words(const std::string &txt) {
 								    std::vector<std::string> words;
 								    std::istringstream iss(txt);
 								    std::string word;
 								    while (iss >> word) {
 								        words.push_back(word);
 								    }
 								    return words;
 								}
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								const std::string k_prompt_whisper = R"(A conversation with a person called {1}.)";
-												talk-llama : add alpaca support (#668)


											
										
										
											2023-03-29 20:01:14 +00:00
+								const std::string k_prompt_llama = R"(Text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								{1} is helpful, kind, honest, friendly, good at writing and never fails to answer {0}’s requests immediately and with details and precision.
 								There are no annotations like (30 seconds passed...) or (to himself), just what {0} and {1} say aloud to each other.
 								The transcript only includes text, it does not include markup like HTML and Markdown.
 								{1} responds with short and concise answers.
 								{0}{4} Hello, {1}!
 								{1}{4} Hello {0}! How may I help you today?
 								{0}{4} What time is it?
 								{1}{4} It is {2} o'clock.
 								{0}{4} What year is it?
 								{1}{4} We are in {3}.
 								{0}{4} What is a cat?
 								{1}{4} A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
 								{0}{4} Name a color.
 								{1}{4} Blue
 								{0}{4})";
 								int main(int argc, char ** argv) {
 								    whisper_params params;
 								    if (whisper_params_parse(argc, argv, params) == false) {
 								        return 1;
 								    }
-												talk-llama : add language auto detect (#1467)

* Add '-l auto' to talk-llama example

* Update examples/talk-llama/talk-llama.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-09 17:21:44 +00:00
+								    if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
 								        whisper_print_usage(argc, argv, params);
 								        exit(0);
 								    }
 								    // whisper init
-												examples : initialize context params properly (#1852)

											
										
										
											2024-02-11 14:39:12 +00:00
+								    struct whisper_context_params cparams = whisper_context_default_params();
-												whisper : use flash attention (#2152)

* whisper : use flash attention in the encoder

* whisper : add kv_pad

* whisper : remove extra backend instance (huh?)

* whisper : use FA for cross-attention

* whisper : use FA for self-attention

* whisper : simplify encoder FA

* whisper : add flash_attn runtime parameter

* scripts : add bench log

* scripts : add M1 Pro bench log
											
										
										
											2024-05-15 06:38:19 +00:00
 								    cparams.use_gpu    = params.use_gpu;
 								    cparams.flash_attn = params.flash_attn;
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : updata java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
 								    struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams);
-												talk-llama : reject runs without required arguments (#2153)

* Extended talk-llama example to reject runs without required arguments.

Print warning and exit if models are not specified on the command line.

* Update examples/talk-llama/talk-llama.cpp

* Update examples/talk-llama/talk-llama.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-05-14 18:32:41 +00:00
+								    if (!ctx_wsp) {
 								        fprintf(stderr, "No whisper.cpp model specified. Please provide using -mw <modelfile>\n");
 								        return 1;
 								    }
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
 								    // llama init
-												talk-llama : sync llama.cpp

											
										
										
											2024-02-20 10:09:57 +00:00
+								    llama_backend_init();
-												talk-llama : sync latest llama.cpp (close #922, close #954)

											
										
										
											2023-05-23 11:04:39 +00:00
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								    auto lmparams = llama_model_default_params();
-												examples : fix n_gpu_layers usage in talk-llama (#1441)


											
										
										
											2023-11-07 01:36:23 +00:00
+								    if (!params.use_gpu) {
-												talk-llama : fix n_gpu_layers usage again (#1442)


											
										
										
											2023-11-07 08:51:27 +00:00
+								        lmparams.n_gpu_layers = 0;
-												talk-llama : add n_gpu_layers parameter (#1475)


											
										
										
											2023-11-13 08:04:16 +00:00
+								    } else {
 								        lmparams.n_gpu_layers = params.n_gpu_layers;
-												examples : fix n_gpu_layers usage in talk-llama (#1441)


											
										
										
											2023-11-07 01:36:23 +00:00
+								    }
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								    struct llama_model * model_llama = llama_load_model_from_file(params.model_llama.c_str(), lmparams);
-												talk-llama : reject runs without required arguments (#2153)

* Extended talk-llama example to reject runs without required arguments.

Print warning and exit if models are not specified on the command line.

* Update examples/talk-llama/talk-llama.cpp

* Update examples/talk-llama/talk-llama.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-05-14 18:32:41 +00:00
+								    if (!model_llama) {
 								        fprintf(stderr, "No llama.cpp model specified. Please provide using -ml <modelfile>\n");
 								        return 1;
 								    }
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								    llama_context_params lcparams = llama_context_default_params();
 								    // tune these to your liking
 								    lcparams.n_ctx      = 2048;
 								    lcparams.seed       = 1;
 								    lcparams.n_threads  = params.n_threads;
-												whisper : use flash attention (#2152)

* whisper : use flash attention in the encoder

* whisper : add kv_pad

* whisper : remove extra backend instance (huh?)

* whisper : use FA for cross-attention

* whisper : use FA for self-attention

* whisper : simplify encoder FA

* whisper : add flash_attn runtime parameter

* scripts : add bench log

* scripts : add M1 Pro bench log
											
										
										
											2024-05-15 06:38:19 +00:00
+								    lcparams.flash_attn = params.flash_attn;
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								    struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
 								    // print some info about the processing
 								    {
 								        fprintf(stderr, "\n");
 								        if (!whisper_is_multilingual(ctx_wsp)) {
 								            if (params.language != "en" || params.translate) {
 								                params.language = "en";
 								                params.translate = false;
 								                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
 								            }
 								        }
 								        fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
 								                __func__,
 								                params.n_threads,
 								                params.language.c_str(),
 								                params.translate ? "translate" : "transcribe",
 								                params.no_timestamps ? 0 : 1);
 								        fprintf(stderr, "\n");
 								    }
 								    // init audio
 								    audio_async audio(30*1000);
 								    if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
 								        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
 								        return 1;
 								    }
 								    audio.resume();
 								    bool is_running  = true;
 								    bool force_speak = false;
 								    float prob0 = 0.0f;
 								    const std::string chat_symb = ":";
 								    std::vector<float> pcmf32_cur;
 								    std::vector<float> pcmf32_prompt;
-												talk-llama : add optional CLI arg to set the bot name (#1764)


											
										
										
											2024-01-13 18:51:35 +00:00
+								    const std::string prompt_whisper = ::replace(k_prompt_whisper, "{1}", params.bot_name);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
 								    // construct the initial prompt for LLaMA inference
-												talk-llama : add alpaca support (#668)


											
										
										
											2023-03-29 20:01:14 +00:00
+								    std::string prompt_llama = params.prompt.empty() ? k_prompt_llama : params.prompt;
 								    // need to have leading ' '
 								    prompt_llama.insert(0, 1, ' ');
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
 								    prompt_llama = ::replace(prompt_llama, "{0}", params.person);
-												talk-llama : add optional CLI arg to set the bot name (#1764)


											
										
										
											2024-01-13 18:51:35 +00:00
+								    prompt_llama = ::replace(prompt_llama, "{1}", params.bot_name);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
 								    {
 								        // get time string
 								        std::string time_str;
 								        {
 								            time_t t = time(0);
 								            struct tm * now = localtime(&t);
 								            char buf[128];
 								            strftime(buf, sizeof(buf), "%H:%M", now);
 								            time_str = buf;
 								        }
 								        prompt_llama = ::replace(prompt_llama, "{2}", time_str);
 								    }
 								    {
 								        // get year string
 								        std::string year_str;
 								        {
 								            time_t t = time(0);
 								            struct tm * now = localtime(&t);
 								            char buf[128];
 								            strftime(buf, sizeof(buf), "%Y", now);
 								            year_str = buf;
 								        }
 								        prompt_llama = ::replace(prompt_llama, "{3}", year_str);
 								    }
 								    prompt_llama = ::replace(prompt_llama, "{4}", chat_symb);
-												talk-llama : use llama_decode instead of llama_eval

											
										
										
											2024-03-08 10:04:43 +00:00
+								    llama_batch batch = llama_batch_init(llama_n_ctx(ctx_llama), 0, 1);
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
+								    // init session
 								    std::string path_session = params.path_session;
 								    std::vector<llama_token> session_tokens;
-												talk-llama : fix session prompt load (#854)


											
										
										
											2023-05-02 17:05:27 +00:00
+								    auto embd_inp = ::llama_tokenize(ctx_llama, prompt_llama, true);
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
 								    if (!path_session.empty()) {
 								        fprintf(stderr, "%s: attempting to load saved session from %s\n", __func__, path_session.c_str());
 								        // fopen to check for existing session
 								        FILE * fp = std::fopen(path_session.c_str(), "rb");
 								        if (fp != NULL) {
 								            std::fclose(fp);
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								            session_tokens.resize(llama_n_ctx(ctx_llama));
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
+								            size_t n_token_count_out = 0;
 								            if (!llama_load_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
 								                fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
 								                return 1;
 								            }
 								            session_tokens.resize(n_token_count_out);
-												talk-llama : fix session prompt load (#854)


											
										
										
											2023-05-02 17:05:27 +00:00
+								            for (size_t i = 0; i < session_tokens.size(); i++) {
 								                embd_inp[i] = session_tokens[i];
 								            }
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
 								            fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
 								        } else {
 								            fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
 								        }
 								    }
-												talk-llama : fix session prompt load (#854)


											
										
										
											2023-05-02 17:05:27 +00:00
+								    // evaluate the initial prompt
 								    printf("\n");
 								    printf("%s : initializing - please wait ...\n", __func__);
-												talk-llama : use llama_decode instead of llama_eval

											
										
										
											2024-03-08 10:04:43 +00:00
+								    // prepare batch
 								    {
 								        batch.n_tokens = embd_inp.size();
 								        for (int i = 0; i < batch.n_tokens; i++) {
 								            batch.token[i]     = embd_inp[i];
 								            batch.pos[i]       = i;
 								            batch.n_seq_id[i]  = 1;
 								            batch.seq_id[i][0] = 0;
 								            batch.logits[i]    = i == batch.n_tokens - 1;
 								        }
 								    }
 								    if (llama_decode(ctx_llama, batch)) {
 								        fprintf(stderr, "%s : failed to decode\n", __func__);
-												talk-llama : fix session prompt load (#854)


											
										
										
											2023-05-02 17:05:27 +00:00
+								        return 1;
 								    }
 								    if (params.verbose_prompt) {
 								        fprintf(stdout, "\n");
 								        fprintf(stdout, "%s", prompt_llama.c_str());
 								        fflush(stdout);
 								    }
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
+								     // debug message about similarity of saved session, if applicable
 								    size_t n_matching_session_tokens = 0;
 								    if (session_tokens.size()) {
 								        for (llama_token id : session_tokens) {
 								            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
 								                break;
 								            }
 								            n_matching_session_tokens++;
 								        }
 								        if (n_matching_session_tokens >= embd_inp.size()) {
 								            fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
 								        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
 								            fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
 								                __func__, n_matching_session_tokens, embd_inp.size());
 								        } else {
 								            fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
 								                __func__, n_matching_session_tokens, embd_inp.size());
 								        }
 								    }
 								    // HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
 								    // if we loaded a session with at least 75% similarity. It's currently just used to speed up the
 								    // initial prompt so it doesn't need to be an exact match.
 								    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								    printf("%s : done! start speaking in the microphone\n", __func__);
-												talk-llama : optional wake-up command and audio confirmation (#1765)

* talk-llama: add optional wake-word detection from command

* talk-llama: add optional audio confirmation before generating answer

* talk-llama: fix small formatting issue in output

* talk-llama.cpp: fix Windows build
											
										
										
											2024-01-16 13:52:01 +00:00
 								    // show wake command if enabled
 								    const std::string wake_cmd = params.wake_cmd;
 								    const int wake_cmd_length = get_words(wake_cmd).size();
 								    const bool use_wake_cmd = wake_cmd_length > 0;
 								    if (use_wake_cmd) {
 								        printf("%s : the wake-up command is: '%s%s%s'\n", __func__, "\033[1m", wake_cmd.c_str(), "\033[0m");
 								    }
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								    printf("\n");
 								    printf("%s%s", params.person.c_str(), chat_symb.c_str());
 								    fflush(stdout);
 								    // clear audio buffer
 								    audio.clear();
 								    // text inference variables
 								    const int voice_id = 2;
 								    const int n_keep   = embd_inp.size();
 								    const int n_ctx    = llama_n_ctx(ctx_llama);
 								    int n_past = n_keep;
 								    int n_prev = 64; // TODO arg
-												talk-llama : fix session prompt load (#854)


											
										
										
											2023-05-02 17:05:27 +00:00
+								    int n_session_consumed = !path_session.empty() && session_tokens.size() > 0 ? session_tokens.size() : 0;
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
 								    std::vector<llama_token> embd;
 								    // reverse prompts for detecting when it's time to stop speaking
 								    std::vector<std::string> antiprompts = {
 								        params.person + chat_symb,
 								    };
 								    // main loop
 								    while (is_running) {
 								        // handle Ctrl + C
 								        is_running = sdl_poll_events();
 								        if (!is_running) {
 								            break;
 								        }
 								        // delay
 								        std::this_thread::sleep_for(std::chrono::milliseconds(100));
 								        int64_t t_ms = 0;
 								        {
 								            audio.get(2000, pcmf32_cur);
 								            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
 								                //fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
 								                audio.get(params.voice_ms, pcmf32_cur);
-												talk-llama : optional wake-up command and audio confirmation (#1765)

* talk-llama: add optional wake-word detection from command

* talk-llama: add optional audio confirmation before generating answer

* talk-llama: fix small formatting issue in output

* talk-llama.cpp: fix Windows build
											
										
										
											2024-01-16 13:52:01 +00:00
+								                std::string all_heard;
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
 								                if (!force_speak) {
-												talk-llama : optional wake-up command and audio confirmation (#1765)

* talk-llama: add optional wake-word detection from command

* talk-llama: add optional audio confirmation before generating answer

* talk-llama: fix small formatting issue in output

* talk-llama.cpp: fix Windows build
											
										
										
											2024-01-16 13:52:01 +00:00
+								                    all_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prompt_whisper, prob0, t_ms));
 								                }
 								                const auto words = get_words(all_heard);
 								                std::string wake_cmd_heard;
 								                std::string text_heard;
 								                for (int i = 0; i < (int) words.size(); ++i) {
 								                    if (i < wake_cmd_length) {
 								                        wake_cmd_heard += words[i] + " ";
 								                    } else {
 								                        text_heard += words[i] + " ";
 								                    }
 								                }
 								                // check if audio starts with the wake-up command if enabled
 								                if (use_wake_cmd) {
 								                    const float sim = similarity(wake_cmd_heard, wake_cmd);
 								                    if ((sim < 0.7f) || (text_heard.empty())) {
 								                        audio.clear();
 								                        continue;
 								                    }
 								                }
 								                // optionally give audio feedback that the current text is being processed
 								                if (!params.heard_ok.empty()) {
-												talk, talk-llama : pass text_to_speak as a file (#1865)

* talk-llama: pass file instead of arg

it is too hard to quote text in a portable way

* talk-llama: pass heard_ok as a file

* talk-llama: let eleven-labs.py accept options

Options: -v voice, -s savefile, -p (--play)

* talk-llama: check installed commands in "speak"

Pass "-q" to eleven-labs.py to skip checking whether elevenlabs is installed

* talk-llama: pass voice_id again

in order to sync talk with talk-llama

* talk: sync with talk-llama

Passing text_to_speak as a file is safer and more portable
cf. https://stackoverflow.com/a/59036879/45375

* talk and talk-llama: get all installed voices in speak.ps1

* talk and talk-llama: get voices from api

* talk and talk-llama: add more options to eleven-labs.py

and remove DEFAULT_VOICE because it is deprecated (https://www.reddit.com/r/ElevenLabs/comments/1830abt/what_happened_to_bella/)

```
usage: eleven-labs.py [-q] [-l] [-h] [-n NAME | -v NUMBER] [-f KEY=VAL] [-s FILE | -p] [TEXTFILE]

options:
  -q, --quick           skip checking the required library

action:
  TEXTFILE              read the text file (default: stdin)
  -l, --list            show the list of voices and exit
  -h, --help            show this help and exit

voice selection:
  -n NAME, --name NAME  get a voice object by name (default: Arnold)
  -v NUMBER, --voice NUMBER
                        get a voice object by number (see --list)
  -f KEY=VAL, --filter KEY=VAL
                        filter voices by labels (default: "use case=narration")
                        this option can be used multiple times
                        filtering will be disabled if the first -f has no "=" (e.g. -f "any")

output:
  -s FILE, --save FILE  save the TTS to a file (default: audio.mp3)
  -p, --play            play the TTS with ffplay
```

* examples: add speak_with_file()

as suggested in the review

* talk and talk-llama: ignore to_speak.txt
											
										
										
											2024-02-24 07:24:47 +00:00
+								                    speak_with_file(params.speak, params.heard_ok, params.speak_file, voice_id);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                }
 								                // remove text between brackets using regex
 								                {
 								                    std::regex re("\\[.*?\\]");
 								                    text_heard = std::regex_replace(text_heard, re, "");
 								                }
 								                // remove text between brackets using regex
 								                {
 								                    std::regex re("\\(.*?\\)");
 								                    text_heard = std::regex_replace(text_heard, re, "");
 								                }
 								                // remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
 								                text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
 								                // take first line
 								                text_heard = text_heard.substr(0, text_heard.find_first_of('\n'));
 								                // remove leading and trailing whitespace
 								                text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
 								                text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
 								                const std::vector<llama_token> tokens = llama_tokenize(ctx_llama, text_heard.c_str(), false);
 								                if (text_heard.empty() || tokens.empty() || force_speak) {
 								                    //fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
 								                    audio.clear();
 								                    continue;
 								                }
 								                force_speak = false;
 								                text_heard.insert(0, 1, ' ');
-												talk-llama : add optional CLI arg to set the bot name (#1764)


											
										
										
											2024-01-13 18:51:35 +00:00
+								                text_heard += "\n" + params.bot_name + chat_symb;
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
 								                fflush(stdout);
 								                embd = ::llama_tokenize(ctx_llama, text_heard, false);
-												talk-llama : fix session prompt load (#854)


											
										
										
											2023-05-02 17:05:27 +00:00
+								                // Append the new input tokens to the session_tokens vector
 								                if (!path_session.empty()) {
 								                    session_tokens.insert(session_tokens.end(), tokens.begin(), tokens.end());
 								                }
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                // text inference
 								                bool done = false;
 								                std::string text_to_speak;
 								                while (true) {
 								                    // predict
 								                    if (embd.size() > 0) {
 								                        if (n_past + (int) embd.size() > n_ctx) {
 								                            n_past = n_keep;
 								                            // insert n_left/2 tokens at the start of embd from last_n_tokens
 								                            embd.insert(embd.begin(), embd_inp.begin() + embd_inp.size() - n_prev, embd_inp.end());
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
+								                            // stop saving session if we run out of context
 								                            path_session = "";
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                            //printf("\n---\n");
 								                            //printf("resetting: '");
 								                            //for (int i = 0; i < (int) embd.size(); i++) {
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								                            //    printf("%s", llama_token_to_piece(ctx_llama, embd[i]));
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                            //}
 								                            //printf("'\n");
 								                            //printf("\n---\n");
 								                        }
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
+								                        // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
 								                        // REVIEW
 								                        if (n_session_consumed < (int) session_tokens.size()) {
 								                            size_t i = 0;
 								                            for ( ; i < embd.size(); i++) {
 								                                if (embd[i] != session_tokens[n_session_consumed]) {
 								                                    session_tokens.resize(n_session_consumed);
 								                                    break;
 								                                }
 								                                n_past++;
 								                                n_session_consumed++;
 								                                if (n_session_consumed >= (int) session_tokens.size()) {
 								                                    i++;
 								                                    break;
 								                                }
 								                            }
 								                            if (i > 0) {
 								                                embd.erase(embd.begin(), embd.begin() + i);
 								                            }
 								                        }
-												talk-llama : fix session prompt load (#854)


											
										
										
											2023-05-02 17:05:27 +00:00
+								                        if (embd.size() > 0 && !path_session.empty()) {
 								                            session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
 								                            n_session_consumed = session_tokens.size();
 								                        }
-												talk-llama : use llama_decode instead of llama_eval

											
										
										
											2024-03-08 10:04:43 +00:00
+								                        // prepare batch
 								                        {
 								                            batch.n_tokens = embd.size();
 								                            for (int i = 0; i < batch.n_tokens; i++) {
 								                                batch.token[i]     = embd[i];
 								                                batch.pos[i]       = n_past + i;
 								                                batch.n_seq_id[i]  = 1;
 								                                batch.seq_id[i][0] = 0;
 								                                batch.logits[i]    = i == batch.n_tokens - 1;
 								                            }
 								                        }
 								                        if (llama_decode(ctx_llama, batch)) {
 								                            fprintf(stderr, "%s : failed to decode\n", __func__);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                            return 1;
 								                        }
 								                    }
 								                    embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
 								                    n_past += embd.size();
-												talk-llama : fix build + sync latest llama.cpp

											
										
										
											2023-05-14 15:46:19 +00:00
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                    embd.clear();
 								                    if (done) break;
 								                    {
 								                        // out of user input, sample next token
 								                        const float top_k          = 5;
 								                        const float top_p          = 0.80f;
 								                        const float temp           = 0.30f;
 								                        const float repeat_penalty = 1.1764f;
 								                        const int repeat_last_n    = 256;
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
+								                        if (!path_session.empty() && need_to_save_session) {
 								                            need_to_save_session = false;
 								                            llama_save_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
-												talk-llama : fix build + sync latest llama.cpp

											
										
										
											2023-05-14 15:46:19 +00:00
+								                        }
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                        llama_token id = 0;
 								                        {
 								                            auto logits = llama_get_logits(ctx_llama);
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								                            auto n_vocab = llama_n_vocab(model_llama);
-												whisper : add integer quantization support (#540)

* whisper : add integer quantization support

* examples : add common-ggml + prepare to add "quantize" tool

* whisper : quantization tool ready

* whisper : fix F32 support

* whisper : try to fix shared lib linkage

* wasm : update quantized models to Q5

* bench.wasm : remove "medium" button

* bench.wasm : fix custom model button

* ggml : add Q5_0 and Q5_1 WASM SIMD

* wasm : add quantized models to all WASM examples

* wasm : bump DB version number to 2

* talk-llama : update example to latest llama.cpp

* node : increase test timeout to 10s

* readme : add information for model quantization

* wasm : add links to other examples
											
										
										
											2023-04-30 15:51:57 +00:00
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								                            logits[llama_token_eos(model_llama)] = 0;
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
-												whisper : add integer quantization support (#540)

* whisper : add integer quantization support

* examples : add common-ggml + prepare to add "quantize" tool

* whisper : quantization tool ready

* whisper : fix F32 support

* whisper : try to fix shared lib linkage

* wasm : update quantized models to Q5

* bench.wasm : remove "medium" button

* bench.wasm : fix custom model button

* ggml : add Q5_0 and Q5_1 WASM SIMD

* wasm : add quantized models to all WASM examples

* wasm : bump DB version number to 2

* talk-llama : update example to latest llama.cpp

* node : increase test timeout to 10s

* readme : add information for model quantization

* wasm : add links to other examples
											
										
										
											2023-04-30 15:51:57 +00:00
+								                            std::vector<llama_token_data> candidates;
 								                            candidates.reserve(n_vocab);
 								                            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
 								                                candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
 								                            }
 								                            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 								                            // apply repeat penalty
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								                            const float nl_logit = logits[llama_token_nl(model_llama)];
-												whisper : add integer quantization support (#540)

* whisper : add integer quantization support

* examples : add common-ggml + prepare to add "quantize" tool

* whisper : quantization tool ready

* whisper : fix F32 support

* whisper : try to fix shared lib linkage

* wasm : update quantized models to Q5

* bench.wasm : remove "medium" button

* bench.wasm : fix custom model button

* ggml : add Q5_0 and Q5_1 WASM SIMD

* wasm : add quantized models to all WASM examples

* wasm : bump DB version number to 2

* talk-llama : update example to latest llama.cpp

* node : increase test timeout to 10s

* readme : add information for model quantization

* wasm : add links to other examples
											
										
										
											2023-04-30 15:51:57 +00:00
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								                            llama_sample_repetition_penalties(ctx_llama, &candidates_p,
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                                    embd_inp.data() + std::max(0, n_past - repeat_last_n),
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								                                    repeat_last_n, repeat_penalty, 0.0, 0.0f);
-												whisper : add integer quantization support (#540)

* whisper : add integer quantization support

* examples : add common-ggml + prepare to add "quantize" tool

* whisper : quantization tool ready

* whisper : fix F32 support

* whisper : try to fix shared lib linkage

* wasm : update quantized models to Q5

* bench.wasm : remove "medium" button

* bench.wasm : fix custom model button

* ggml : add Q5_0 and Q5_1 WASM SIMD

* wasm : add quantized models to all WASM examples

* wasm : bump DB version number to 2

* talk-llama : update example to latest llama.cpp

* node : increase test timeout to 10s

* readme : add information for model quantization

* wasm : add links to other examples
											
										
										
											2023-04-30 15:51:57 +00:00
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								                            logits[llama_token_nl(model_llama)] = nl_logit;
-												whisper : add integer quantization support (#540)

* whisper : add integer quantization support

* examples : add common-ggml + prepare to add "quantize" tool

* whisper : quantization tool ready

* whisper : fix F32 support

* whisper : try to fix shared lib linkage

* wasm : update quantized models to Q5

* bench.wasm : remove "medium" button

* bench.wasm : fix custom model button

* ggml : add Q5_0 and Q5_1 WASM SIMD

* wasm : add quantized models to all WASM examples

* wasm : bump DB version number to 2

* talk-llama : update example to latest llama.cpp

* node : increase test timeout to 10s

* readme : add information for model quantization

* wasm : add links to other examples
											
										
										
											2023-04-30 15:51:57 +00:00
 								                            if (temp <= 0) {
 								                                // Greedy sampling
 								                                id = llama_sample_token_greedy(ctx_llama, &candidates_p);
 								                            } else {
 								                                // Temperature sampling
-												talk-llama : fix build + sync latest llama.cpp

											
										
										
											2023-05-14 15:46:19 +00:00
+								                                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
 								                                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								                                llama_sample_temp (ctx_llama, &candidates_p, temp);
-												whisper : add integer quantization support (#540)

* whisper : add integer quantization support

* examples : add common-ggml + prepare to add "quantize" tool

* whisper : quantization tool ready

* whisper : fix F32 support

* whisper : try to fix shared lib linkage

* wasm : update quantized models to Q5

* bench.wasm : remove "medium" button

* bench.wasm : fix custom model button

* ggml : add Q5_0 and Q5_1 WASM SIMD

* wasm : add quantized models to all WASM examples

* wasm : bump DB version number to 2

* talk-llama : update example to latest llama.cpp

* node : increase test timeout to 10s

* readme : add information for model quantization

* wasm : add links to other examples
											
										
										
											2023-04-30 15:51:57 +00:00
+								                                id = llama_sample_token(ctx_llama, &candidates_p);
 								                            }
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                        }
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								                        if (id != llama_token_eos(model_llama)) {
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                            // add it to the context
 								                            embd.push_back(id);
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								                            text_to_speak += llama_token_to_piece(ctx_llama, id);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								                            printf("%s", llama_token_to_piece(ctx_llama, id).c_str());
-												talk-llama : stream response (#1121)

											
										
										
											2024-02-06 17:56:12 +00:00
+								                            fflush(stdout);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                        }
 								                    }
 								                    {
 								                        std::string last_output;
 								                        for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								                            last_output += llama_token_to_piece(ctx_llama, embd_inp[i]);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                        }
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								                        last_output += llama_token_to_piece(ctx_llama, embd[0]);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
 								                        for (std::string & antiprompt : antiprompts) {
 								                            if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
 								                                done = true;
 								                                text_to_speak = ::replace(text_to_speak, antiprompt, "");
 								                                fflush(stdout);
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
+								                                need_to_save_session = true;
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                                break;
 								                            }
 								                        }
 								                    }
 								                    is_running = sdl_poll_events();
 								                    if (!is_running) {
 								                        break;
 								                    }
 								                }
-												talk, talk-llama : pass text_to_speak as a file (#1865)

* talk-llama: pass file instead of arg

it is too hard to quote text in a portable way

* talk-llama: pass heard_ok as a file

* talk-llama: let eleven-labs.py accept options

Options: -v voice, -s savefile, -p (--play)

* talk-llama: check installed commands in "speak"

Pass "-q" to eleven-labs.py to skip checking whether elevenlabs is installed

* talk-llama: pass voice_id again

in order to sync talk with talk-llama

* talk: sync with talk-llama

Passing text_to_speak as a file is safer and more portable
cf. https://stackoverflow.com/a/59036879/45375

* talk and talk-llama: get all installed voices in speak.ps1

* talk and talk-llama: get voices from api

* talk and talk-llama: add more options to eleven-labs.py

and remove DEFAULT_VOICE because it is deprecated (https://www.reddit.com/r/ElevenLabs/comments/1830abt/what_happened_to_bella/)

```
usage: eleven-labs.py [-q] [-l] [-h] [-n NAME | -v NUMBER] [-f KEY=VAL] [-s FILE | -p] [TEXTFILE]

options:
  -q, --quick           skip checking the required library

action:
  TEXTFILE              read the text file (default: stdin)
  -l, --list            show the list of voices and exit
  -h, --help            show this help and exit

voice selection:
  -n NAME, --name NAME  get a voice object by name (default: Arnold)
  -v NUMBER, --voice NUMBER
                        get a voice object by number (see --list)
  -f KEY=VAL, --filter KEY=VAL
                        filter voices by labels (default: "use case=narration")
                        this option can be used multiple times
                        filtering will be disabled if the first -f has no "=" (e.g. -f "any")

output:
  -s FILE, --save FILE  save the TTS to a file (default: audio.mp3)
  -p, --play            play the TTS with ffplay
```

* examples: add speak_with_file()

as suggested in the review

* talk and talk-llama: ignore to_speak.txt
											
										
										
											2024-02-24 07:24:47 +00:00
+								                speak_with_file(params.speak, text_to_speak, params.speak_file, voice_id);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
 								                audio.clear();
 								            }
 								        }
 								    }
 								    audio.pause();
 								    whisper_print_timings(ctx_wsp);
 								    whisper_free(ctx_wsp);
 								    llama_print_timings(ctx_llama);
 								    llama_free(ctx_llama);
 								    return 0;
 								}