whisper.cpp/examples/talk-llama/talk-llama.cpp

// Talk with AI
//

#include "common-sdl.h"
#include "common.h"
#include "whisper.h"
#include "llama.h"

#include <cassert>
#include <cstdio>
#include <fstream>
#include <regex>
#include <string>
#include <thread>
#include <vector>
#include <regex>

std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
    auto * model = llama_get_model(ctx);

    // upper limit for the number of tokens
    int n_tokens = text.length() + add_bos;
    std::vector<llama_token> result(n_tokens);
    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, false);
    if (n_tokens < 0) {
        result.resize(-n_tokens);
        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, false);
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
    }
    return result;
}

// Convenience wrapper around the C API: returns the text piece of a single token
// as a std::string.
//
// Starts with an 8-byte buffer. A negative return value is treated as the negated
// number of bytes required; the buffer is then resized and the call is repeated.
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
    const auto * mdl = llama_get_model(ctx);

    std::vector<char> buf(8, 0);

    const int n_chars = llama_token_to_piece(mdl, token, buf.data(), buf.size());
    if (n_chars >= 0) {
        // piece fits - keep only the bytes that were written
        buf.resize(n_chars);
    } else {
        // piece is longer than the buffer - grow to the reported size and retry
        buf.resize(-n_chars);
        const int n_retry = llama_token_to_piece(mdl, token, buf.data(), buf.size());
        GGML_ASSERT(n_retry == -n_chars);
    }

    return std::string(buf.begin(), buf.end());
}

// command-line parameters
//
// Holds every option of the example together with its default value. The defaults
// are also what whisper_print_usage() shows in its "[...]" columns.
struct whisper_params {
    // std::thread::hardware_concurrency() may return 0 when the value is not computable,
    // so clamp the default to the range [1, 4]
    int32_t n_threads  = std::max(1, std::min(4, (int32_t) std::thread::hardware_concurrency()));
    int32_t voice_ms   = 10000; // voice duration in milliseconds
    int32_t capture_id = -1;    // capture device ID
    int32_t max_tokens = 32;    // maximum number of tokens per audio chunk
    int32_t audio_ctx  = 0;     // audio context size (0 - all)
    int32_t n_gpu_layers = 999; // number of layers to store in VRAM

    float vad_thold  = 0.6f;    // voice activity detection threshold
    float freq_thold = 100.0f;  // high-pass frequency cutoff

    bool speed_up       = false; // speed up audio by x2 (reduced accuracy)
    bool translate      = false; // translate from source language to english
    bool print_special  = false; // print special tokens
    bool print_energy   = false; // print sound energy (for debugging)
    bool no_timestamps  = true;  // inverted into wparams.print_timestamps by transcribe()
    bool verbose_prompt = false; // print prompt at start
    bool use_gpu        = true;  // cleared by -ng / --no-gpu

    std::string person      = "Georgi";                      // person name (for prompt selection)
    std::string language    = "en";                          // spoken language
    std::string model_wsp   = "models/ggml-base.en.bin";     // whisper model file
    std::string model_llama = "models/ggml-llama-7B.bin";    // llama model file
    std::string speak       = "./examples/talk-llama/speak"; // command for TTS
    std::string prompt      = "";                            // custom prompt, filled from --prompt-file
    std::string fname_out;                                   // text output file name
    std::string path_session = "";       // path to file for saving/loading model eval state
};

// forward declaration: whisper_params_parse() below prints the usage text before
// whisper_print_usage() is defined
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);

bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

        if (arg == "-h" || arg == "--help") {
            whisper_print_usage(argc, argv, params);
            exit(0);
        }
        else if (arg == "-t"   || arg == "--threads")        { params.n_threads      = std::stoi(argv[++i]); }
        else if (arg == "-vms" || arg == "--voice-ms")       { params.voice_ms       = std::stoi(argv[++i]); }
        else if (arg == "-c"   || arg == "--capture")        { params.capture_id     = std::stoi(argv[++i]); }
        else if (arg == "-mt"  || arg == "--max-tokens")     { params.max_tokens     = std::stoi(argv[++i]); }
        else if (arg == "-ac"  || arg == "--audio-ctx")      { params.audio_ctx      = std::stoi(argv[++i]); }
        else if (arg == "-ngl" || arg == "--n-gpu-layers")   { params.n_gpu_layers   = std::stoi(argv[++i]); }
        else if (arg == "-vth" || arg == "--vad-thold")      { params.vad_thold      = std::stof(argv[++i]); }
        else if (arg == "-fth" || arg == "--freq-thold")     { params.freq_thold     = std::stof(argv[++i]); }
        else if (arg == "-su"  || arg == "--speed-up")       { params.speed_up       = true; }
        else if (arg == "-tr"  || arg == "--translate")      { params.translate      = true; }
        else if (arg == "-ps"  || arg == "--print-special")  { params.print_special  = true; }
        else if (arg == "-pe"  || arg == "--print-energy")   { params.print_energy   = true; }
        else if (arg == "-vp"  || arg == "--verbose-prompt") { params.verbose_prompt = true; }
        else if (arg == "-ng"  || arg == "--no-gpu")         { params.use_gpu        = false; }
        else if (arg == "-p"   || arg == "--person")         { params.person         = argv[++i]; }
        else if (arg == "--session")                         { params.path_session   = argv[++i];}
        else if (arg == "-l"   || arg == "--language")       { params.language       = argv[++i]; }
        else if (arg == "-mw"  || arg == "--model-whisper")  { params.model_wsp      = argv[++i]; }
        else if (arg == "-ml"  || arg == "--model-llama")    { params.model_llama    = argv[++i]; }
        else if (arg == "-s"   || arg == "--speak")          { params.speak          = argv[++i]; }
        else if (arg == "--prompt-file")                     {
            std::ifstream file(argv[++i]);
            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
            if (params.prompt.back() == '\n') {
                params.prompt.pop_back();
            }
        }
        else if (arg == "-f"   || arg == "--file")          { params.fname_out     = argv[++i]; }
        else if (arg == "-ng"  || arg == "--no-gpu")        { params.use_gpu       = false; }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
            exit(0);
        }
    }

    return true;
}

// Write the help text to stderr. Each option line shows, in square brackets, the
// value currently stored in `params` for that option. Only argv[0] is used from
// the argument vector (as the program name).
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
    FILE * const out = stderr;

    // textual form of a boolean option
    const auto flag = [](bool enabled) -> const char * { return enabled ? "true" : "false"; };

    const char * prog = argv[0];

    fprintf(out, "\n");
    fprintf(out, "usage: %s [options]\n", prog);
    fprintf(out, "\n");
    fprintf(out, "options:\n");
    fprintf(out, "  -h,       --help           [default] show this help message and exit\n");

    // integer options
    fprintf(out, "  -t N,     --threads N      [%-7d] number of threads to use during computation\n", params.n_threads);
    fprintf(out, "  -vms N,   --voice-ms N     [%-7d] voice duration in milliseconds\n",              params.voice_ms);
    fprintf(out, "  -c ID,    --capture ID     [%-7d] capture device ID\n",                           params.capture_id);
    fprintf(out, "  -mt N,    --max-tokens N   [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
    fprintf(out, "  -ac N,    --audio-ctx N    [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
    fprintf(out, "  -ngl N,   --n-gpu-layers N [%-7d] number of layers to store in VRAM\n",           params.n_gpu_layers);

    // floating point options
    fprintf(out, "  -vth N,   --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
    fprintf(out, "  -fth N,   --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);

    // switches
    fprintf(out, "  -su,      --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     flag(params.speed_up));
    fprintf(out, "  -tr,      --translate      [%-7s] translate from source language to english\n",   flag(params.translate));
    fprintf(out, "  -ps,      --print-special  [%-7s] print special tokens\n",                        flag(params.print_special));
    fprintf(out, "  -pe,      --print-energy   [%-7s] print sound energy (for debugging)\n",          flag(params.print_energy));
    fprintf(out, "  -vp,      --verbose-prompt [%-7s] print prompt at start\n",                       flag(params.verbose_prompt));
    // the switch disables the GPU, so the displayed value is the inverse of use_gpu
    fprintf(out, "  -ng,      --no-gpu         [%-7s] disable GPU\n",                                 flag(!params.use_gpu));

    // string options
    fprintf(out, "  -p NAME,  --person NAME    [%-7s] person name (for prompt selection)\n",          params.person.c_str());
    fprintf(out, "  -l LANG,  --language LANG  [%-7s] spoken language\n",                             params.language.c_str());
    fprintf(out, "  -mw FILE, --model-whisper  [%-7s] whisper model file\n",                          params.model_wsp.c_str());
    fprintf(out, "  -ml FILE, --model-llama    [%-7s] llama model file\n",                            params.model_llama.c_str());
    fprintf(out, "  -s FILE,  --speak TEXT     [%-7s] command for TTS\n",                             params.speak.c_str());
    fprintf(out, "  --prompt-file FNAME        [%-7s] file with custom prompt to start dialog\n",     "");
    fprintf(out, "  --session FNAME                   file to cache model state in (may be large!) (default: none)\n");
    fprintf(out, "  -f FNAME, --file FNAME     [%-7s] text output file name\n",                       params.fname_out.c_str());
    fprintf(out, "\n");
}

// Run Whisper on one chunk of audio and return the recognized text.
//
//   ctx         - whisper context used for inference
//   params      - command-line parameters (threads, language, max tokens, ...)
//   pcmf32      - audio samples to transcribe
//   prompt_text - text that is tokenized and passed to Whisper as the initial prompt
//   prob        - out: mean probability over all tokens of all segments (0 if there are none)
//   t_ms        - out: wall-clock duration of the call in milliseconds (stays 0 on failure)
//
// Returns the concatenated text of all segments, or an empty string when
// whisper_full() fails.
std::string transcribe(
        whisper_context * ctx,
        const whisper_params & params,
        const std::vector<float> & pcmf32,
        const std::string prompt_text,
        float & prob,
        int64_t & t_ms) {
    const auto t_start = std::chrono::high_resolution_clock::now();

    prob = 0.0f;
    t_ms = 0;

    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    // tokenize the prompt into a fixed-size buffer
    std::vector<whisper_token> prompt_tokens(1024);

    const int n_prompt_tokens = whisper_tokenize(ctx, prompt_text.c_str(), prompt_tokens.data(), prompt_tokens.size());
    if (n_prompt_tokens < 0) {
        // a negative count must not reach resize(): converted to size_t it would
        // request an enormous allocation - continue without a prompt instead
        fprintf(stderr, "%s: failed to tokenize the prompt, continuing without it\n", __func__);
        prompt_tokens.clear();
    } else {
        prompt_tokens.resize(n_prompt_tokens);
    }

    wparams.print_progress   = false;
    wparams.print_special    = params.print_special;
    wparams.print_realtime   = false;
    wparams.print_timestamps = !params.no_timestamps;
    wparams.translate        = params.translate;
    wparams.no_context       = true;
    wparams.single_segment   = true;
    wparams.max_tokens       = params.max_tokens;
    wparams.language         = params.language.c_str();
    wparams.n_threads        = params.n_threads;

    wparams.prompt_tokens    = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
    wparams.prompt_n_tokens  = prompt_tokens.empty() ? 0       : prompt_tokens.size();

    wparams.audio_ctx        = params.audio_ctx;
    wparams.speed_up         = params.speed_up;

    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
        return "";
    }

    // collect the text of every segment and accumulate the token probabilities
    int prob_n = 0;
    std::string result;

    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);

        result += text;

        const int n_tokens = whisper_full_n_tokens(ctx, i);
        for (int j = 0; j < n_tokens; ++j) {
            const auto token = whisper_full_get_token_data(ctx, i, j);

            prob += token.p;
            ++prob_n;
        }
    }

    // turn the sum into a mean; with no tokens prob stays 0
    if (prob_n > 0) {
        prob /= prob_n;
    }

    const auto t_end = std::chrono::high_resolution_clock::now();
    t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();

    return result;
}

// Prompt template passed to Whisper by transcribe(); main() replaces {1} with the bot name.
const std::string k_prompt_whisper = R"(A conversation with a person called {1}.)";

// Default prompt template for LLaMA, used by main() when no custom prompt was
// loaded (params.prompt is empty). main() substitutes the placeholders:
//   {0} - person name (params.person)
//   {1} - bot name
//   {2} - current local time, formatted with "%H:%M"
//   {3} - current year, formatted with "%Y"
//   {4} - chat symbol that follows a speaker name
const std::string k_prompt_llama = R"(Text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
{1} is helpful, kind, honest, friendly, good at writing and never fails to answer {0}’s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {0} and {1} say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{1} responds with short and concise answers.

{0}{4} Hello, {1}!
{1}{4} Hello {0}! How may I help you today?
{0}{4} What time is it?
{1}{4} It is {2} o'clock.
{0}{4} What year is it?
{1}{4} We are in {3}.
{0}{4} What is a cat?
{1}{4} A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{0}{4} Name a color.
{1}{4} Blue
{0}{4})";

int main(int argc, char ** argv) {
    whisper_params params;

    if (whisper_params_parse(argc, argv, params) == false) {
        return 1;
    }

    if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
        whisper_print_usage(argc, argv, params);
        exit(0);
    }

    // whisper init

    struct whisper_context_params cparams;
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams);

    // llama init

    llama_backend_init(true);

    auto lmparams = llama_model_default_params();
    if (!params.use_gpu) {
        lmparams.n_gpu_layers = 0;
    } else {
        lmparams.n_gpu_layers = params.n_gpu_layers;
    }

    struct llama_model * model_llama = llama_load_model_from_file(params.model_llama.c_str(), lmparams);

    llama_context_params lcparams = llama_context_default_params();

    // tune these to your liking
    lcparams.n_ctx      = 2048;
    lcparams.seed       = 1;
    lcparams.f16_kv     = true;
    lcparams.n_threads  = params.n_threads;

    struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams);

    // print some info about the processing
    {
        fprintf(stderr, "\n");

        if (!whisper_is_multilingual(ctx_wsp)) {
            if (params.language != "en" || params.translate) {
                params.language = "en";
                params.translate = false;
                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
            }
        }
        fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
                __func__,
                params.n_threads,
                params.language.c_str(),
                params.translate ? "translate" : "transcribe",
                params.no_timestamps ? 0 : 1);

        fprintf(stderr, "\n");
    }

    // init audio

    audio_async audio(30*1000);
    if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
        return 1;
    }

    audio.resume();

    bool is_running  = true;
    bool force_speak = false;

    float prob0 = 0.0f;

    const std::string chat_symb = ":";
    const std::string bot_name  = "LLaMA";

    std::vector<float> pcmf32_cur;
    std::vector<float> pcmf32_prompt;

    const std::string prompt_whisper = ::replace(k_prompt_whisper, "{1}", bot_name);

    // construct the initial prompt for LLaMA inference
    std::string prompt_llama = params.prompt.empty() ? k_prompt_llama : params.prompt;

    // need to have leading ' '
    prompt_llama.insert(0, 1, ' ');

    prompt_llama = ::replace(prompt_llama, "{0}", params.person);
    prompt_llama = ::replace(prompt_llama, "{1}", bot_name);

    {
        // get time string
        std::string time_str;
        {
            time_t t = time(0);
            struct tm * now = localtime(&t);
            char buf[128];
            strftime(buf, sizeof(buf), "%H:%M", now);
            time_str = buf;
        }
        prompt_llama = ::replace(prompt_llama, "{2}", time_str);
    }

    {
        // get year string
        std::string year_str;
        {
            time_t t = time(0);
            struct tm * now = localtime(&t);
            char buf[128];
            strftime(buf, sizeof(buf), "%Y", now);
            year_str = buf;
        }
        prompt_llama = ::replace(prompt_llama, "{3}", year_str);
    }

    prompt_llama = ::replace(prompt_llama, "{4}", chat_symb);

    // init session
    std::string path_session = params.path_session;
    std::vector<llama_token> session_tokens;
    auto embd_inp = ::llama_tokenize(ctx_llama, prompt_llama, true);

    if (!path_session.empty()) {
        fprintf(stderr, "%s: attempting to load saved session from %s\n", __func__, path_session.c_str());

        // fopen to check for existing session
        FILE * fp = std::fopen(path_session.c_str(), "rb");
        if (fp != NULL) {
            std::fclose(fp);

            session_tokens.resize(llama_n_ctx(ctx_llama));
            size_t n_token_count_out = 0;
            if (!llama_load_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
                fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
                return 1;
            }
            session_tokens.resize(n_token_count_out);
            for (size_t i = 0; i < session_tokens.size(); i++) {
                embd_inp[i] = session_tokens[i];
            }

            fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
        } else {
            fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
        }
    }

    // evaluate the initial prompt

    printf("\n");
    printf("%s : initializing - please wait ...\n", __func__);

    if (llama_eval(ctx_llama, embd_inp.data(), embd_inp.size(), 0)) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return 1;
    }

    if (params.verbose_prompt) {
        fprintf(stdout, "\n");
        fprintf(stdout, "%s", prompt_llama.c_str());
        fflush(stdout);
    }

     // debug message about similarity of saved session, if applicable
    size_t n_matching_session_tokens = 0;
    if (session_tokens.size()) {
        for (llama_token id : session_tokens) {
            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
                break;
            }
            n_matching_session_tokens++;
        }
        if (n_matching_session_tokens >= embd_inp.size()) {
            fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
            fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
                __func__, n_matching_session_tokens, embd_inp.size());
        } else {
            fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
                __func__, n_matching_session_tokens, embd_inp.size());
        }
    }

    // HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
    // if we loaded a session with at least 75% similarity. It's currently just used to speed up the
    // initial prompt so it doesn't need to be an exact match.
    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);

    printf("%s : done! start speaking in the microphone\n", __func__);
    printf("\n");
    printf("%s%s", params.person.c_str(), chat_symb.c_str());
    fflush(stdout);

    // clear audio buffer
    audio.clear();

    // text inference variables
    const int voice_id = 2;
    const int n_keep   = embd_inp.size();
    const int n_ctx    = llama_n_ctx(ctx_llama);

    int n_past = n_keep;
    int n_prev = 64; // TODO arg
    int n_session_consumed = !path_session.empty() && session_tokens.size() > 0 ? session_tokens.size() : 0;

    std::vector<llama_token> embd;

    // reverse prompts for detecting when it's time to stop speaking
    std::vector<std::string> antiprompts = {
        params.person + chat_symb,
    };

    // main loop
    while (is_running) {
        // handle Ctrl + C
        is_running = sdl_poll_events();

        if (!is_running) {
            break;
        }

        // delay
        std::this_thread::sleep_for(std::chrono::milliseconds(100));

        int64_t t_ms = 0;

        {
            audio.get(2000, pcmf32_cur);

            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
                //fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);

                audio.get(params.voice_ms, pcmf32_cur);

                std::string text_heard;

                if (!force_speak) {
                    text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prompt_whisper, prob0, t_ms));
                }

                // remove text between brackets using regex
                {
                    std::regex re("\\[.*?\\]");
                    text_heard = std::regex_replace(text_heard, re, "");
                }

                // remove text between brackets using regex
                {
                    std::regex re("\\(.*?\\)");
                    text_heard = std::regex_replace(text_heard, re, "");
                }

                // remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
                text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");

                // take first line
                text_heard = text_heard.substr(0, text_heard.find_first_of('\n'));

                // remove leading and trailing whitespace
                text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
                text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");

                const std::vector<llama_token> tokens = llama_tokenize(ctx_llama, text_heard.c_str(), false);

                if (text_heard.empty() || tokens.empty() || force_speak) {
                    //fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
                    audio.clear();

                    continue;
                }

                force_speak = false;

                text_heard.insert(0, 1, ' ');
                text_heard += "\n" + bot_name + chat_symb;
                fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
                fflush(stdout);

                embd = ::llama_tokenize(ctx_llama, text_heard, false);

                // Append the new input tokens to the session_tokens vector
                if (!path_session.empty()) {
                    session_tokens.insert(session_tokens.end(), tokens.begin(), tokens.end());
                }

                // text inference
                bool done = false;
                std::string text_to_speak;
                while (true) {
                    // predict
                    if (embd.size() > 0) {
                        if (n_past + (int) embd.size() > n_ctx) {
                            n_past = n_keep;

                            // insert n_left/2 tokens at the start of embd from last_n_tokens
                            embd.insert(embd.begin(), embd_inp.begin() + embd_inp.size() - n_prev, embd_inp.end());
                            // stop saving session if we run out of context
                            path_session = "";
                            //printf("\n---\n");
                            //printf("resetting: '");
                            //for (int i = 0; i < (int) embd.size(); i++) {
                            //    printf("%s", llama_token_to_piece(ctx_llama, embd[i]));
                            //}
                            //printf("'\n");
                            //printf("\n---\n");
                        }

                        // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
                        // REVIEW
                        if (n_session_consumed < (int) session_tokens.size()) {
                            size_t i = 0;
                            for ( ; i < embd.size(); i++) {
                                if (embd[i] != session_tokens[n_session_consumed]) {
                                    session_tokens.resize(n_session_consumed);
                                    break;
                                }

                                n_past++;
                                n_session_consumed++;

                                if (n_session_consumed >= (int) session_tokens.size()) {
                                    i++;
                                    break;
                                }
                            }
                            if (i > 0) {
                                embd.erase(embd.begin(), embd.begin() + i);
                            }
                        }

                        if (embd.size() > 0 && !path_session.empty()) {
                            session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
                            n_session_consumed = session_tokens.size();
                        }

                        if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past)) {
                            fprintf(stderr, "%s : failed to eval\n", __func__);
                            return 1;
                        }
                    }


                    embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
                    n_past += embd.size();

                    embd.clear();

                    if (done) break;

                    {
                        // out of user input, sample next token
                        const float top_k          = 5;
                        const float top_p          = 0.80f;
                        const float temp           = 0.30f;
                        const float repeat_penalty = 1.1764f;

                        const int repeat_last_n    = 256;

                        if (!path_session.empty() && need_to_save_session) {
                            need_to_save_session = false;
                            llama_save_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
                        }

                        llama_token id = 0;

                        {
                            auto logits = llama_get_logits(ctx_llama);
                            auto n_vocab = llama_n_vocab(model_llama);

                            logits[llama_token_eos(model_llama)] = 0;

                            std::vector<llama_token_data> candidates;
                            candidates.reserve(n_vocab);
                            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
                                candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
                            }

                            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

                            // apply repeat penalty
                            const float nl_logit = logits[llama_token_nl(model_llama)];

                            llama_sample_repetition_penalties(ctx_llama, &candidates_p,
                                    embd_inp.data() + std::max(0, n_past - repeat_last_n),
                                    repeat_last_n, repeat_penalty, 0.0, 0.0f);

                            logits[llama_token_nl(model_llama)] = nl_logit;

                            if (temp <= 0) {
                                // Greedy sampling
                                id = llama_sample_token_greedy(ctx_llama, &candidates_p);
                            } else {
                                // Temperature sampling
                                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
                                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
                                llama_sample_temp (ctx_llama, &candidates_p, temp);
                                id = llama_sample_token(ctx_llama, &candidates_p);
                            }
                        }

                        if (id != llama_token_eos(model_llama)) {
                            // add it to the context
                            embd.push_back(id);

                            text_to_speak += llama_token_to_piece(ctx_llama, id);

                            printf("%s", llama_token_to_piece(ctx_llama, id).c_str());
                        }
                    }

                    {
                        std::string last_output;
                        for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
                            last_output += llama_token_to_piece(ctx_llama, embd_inp[i]);
                        }
                        last_output += llama_token_to_piece(ctx_llama, embd[0]);

                        for (std::string & antiprompt : antiprompts) {
                            if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
                                done = true;
                                text_to_speak = ::replace(text_to_speak, antiprompt, "");
                                fflush(stdout);
                                need_to_save_session = true;
                                break;
                            }
                        }
                    }

                    is_running = sdl_poll_events();

                    if (!is_running) {
                        break;
                    }
                }

                text_to_speak = ::replace(text_to_speak, "'", "'\"'\"'");
                int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + text_to_speak + "'").c_str());
                if (ret != 0) {
                    fprintf(stderr, "%s: failed to speak\n", __func__);
                }

                audio.clear();
            }
        }
    }

    audio.pause();

    whisper_print_timings(ctx_wsp);
    whisper_free(ctx_wsp);

    llama_print_timings(ctx_llama);
    llama_free(ctx_llama);

    return 0;
}
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								// Talk with AI
 								//
-												Revert "ggml : do not use _GNU_SOURCE gratuitously (#1027)"

This reverts commit 3f7a03ebe3b65be0792849e300a122f6a050e3f8.

											
										
										
											2023-07-02 18:53:52 +00:00
+								#include "common-sdl.h"
-												build : do not use _GNU_SOURCE gratuitously (#1129)

* Do not use _GNU_SOURCE gratuitously.

What is needed to build whisper.cpp and examples is availability of
stuff defined in The Open Group Base Specifications Issue 6
(https://pubs.opengroup.org/onlinepubs/009695399/) known also as
Single Unix Specification v3 (SUSv3) or POSIX.1-2001 + XSI extensions,
plus some stuff from BSD that is not specified in POSIX.1.

Well, that was true until NUMA support was added recently in ggml,
so enable GNU libc extensions for Linux builds to cover that.

There is no need to penalize musl libc which simply follows standards.

Not having feature test macros in source code gives greater flexibility
to those wanting to reuse it in 3rd party app, as they can build it with
minimal FTM (_XOPEN_SOURCE=600) or other FTM depending on their needs.

It builds without issues in Alpine (musl libc), Ubuntu (glibc), MSYS2.

* examples : include SDL headers before other headers

Avoid macOS build error when _DARWIN_C_SOURCE is not defined, brought by
SDL2 relying on Darwin extension memset_pattern4/8/16 (from string.h).

* make : enable BSD extensions for DragonFlyBSD to expose RLIMIT_MEMLOCK

* make : use BSD-specific FTMs to enable alloca on BSDs

* make : fix OpenBSD build by exposing newer POSIX definitions

* cmake : follow recent FTM improvements from Makefile
											
										
										
											2023-09-07 09:36:14 +00:00
+								#include "common.h"
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								#include "whisper.h"
 								#include "llama.h"
 								#include <cassert>
 								#include <cstdio>
 								#include <fstream>
 								#include <regex>
 								#include <string>
 								#include <thread>
 								#include <vector>
 								#include <regex>
 								std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* exampels : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								    auto * model = llama_get_model(ctx);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* exampels : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								    // upper limit for the number of tokens
 								    int n_tokens = text.length() + add_bos;
 								    std::vector<llama_token> result(n_tokens);
 								    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, false);
 								    if (n_tokens < 0) {
 								        result.resize(-n_tokens);
 								        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, false);
 								        GGML_ASSERT(check == -n_tokens);
 								    } else {
 								        result.resize(n_tokens);
 								    }
 								    return result;
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								}
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
 								    std::vector<char> result(8, 0);
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* exampels : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								    if (n_tokens < 0) {
 								        result.resize(-n_tokens);
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* exampels : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								        GGML_ASSERT(check == -n_tokens);
 								    } else {
 								        result.resize(n_tokens);
 								    }
 								    return std::string(result.data(), result.size());
 								}
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								// command-line parameters
 								struct whisper_params {
 								    int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
 								    int32_t voice_ms   = 10000;
 								    int32_t capture_id = -1;
 								    int32_t max_tokens = 32;
 								    int32_t audio_ctx  = 0;
-												talk-llama : enable GPU by default

											
										
										
											2023-11-15 19:32:25 +00:00
+								    int32_t n_gpu_layers = 999;
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								    float vad_thold  = 0.6f;
 								    float freq_thold = 100.0f;
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								    bool speed_up       = false;
 								    bool translate      = false;
 								    bool print_special  = false;
 								    bool print_energy   = false;
 								    bool no_timestamps  = true;
-												talk-llama : add alpaca support (#668)


											
										
										
											2023-03-29 20:01:14 +00:00
+								    bool verbose_prompt = false;
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : updata java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
+								    bool use_gpu        = true;
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
 								    std::string person      = "Georgi";
 								    std::string language    = "en";
 								    std::string model_wsp   = "models/ggml-base.en.bin";
 								    std::string model_llama = "models/ggml-llama-7B.bin";
-												`speak` scripts for Windows

											
										
										
											2023-06-01 12:45:00 +00:00
+								    std::string speak       = "./examples/talk-llama/speak";
-												talk-llama : add alpaca support (#668)


											
										
										
											2023-03-29 20:01:14 +00:00
+								    std::string prompt      = "";
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								    std::string fname_out;
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
+								    std::string path_session = "";       // path to file for saving/loading model eval state
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								};
 								void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
 								bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
 								    for (int i = 1; i < argc; i++) {
 								        std::string arg = argv[i];
 								        if (arg == "-h" || arg == "--help") {
 								            whisper_print_usage(argc, argv, params);
 								            exit(0);
 								        }
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : updata java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
+								        else if (arg == "-t"   || arg == "--threads")        { params.n_threads      = std::stoi(argv[++i]); }
 								        else if (arg == "-vms" || arg == "--voice-ms")       { params.voice_ms       = std::stoi(argv[++i]); }
 								        else if (arg == "-c"   || arg == "--capture")        { params.capture_id     = std::stoi(argv[++i]); }
 								        else if (arg == "-mt"  || arg == "--max-tokens")     { params.max_tokens     = std::stoi(argv[++i]); }
 								        else if (arg == "-ac"  || arg == "--audio-ctx")      { params.audio_ctx      = std::stoi(argv[++i]); }
-												talk-llama : add n_gpu_layers parameter (#1475)


											
										
										
											2023-11-13 08:04:16 +00:00
+								        else if (arg == "-ngl" || arg == "--n-gpu-layers")   { params.n_gpu_layers   = std::stoi(argv[++i]); }
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : updata java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
+								        else if (arg == "-vth" || arg == "--vad-thold")      { params.vad_thold      = std::stof(argv[++i]); }
 								        else if (arg == "-fth" || arg == "--freq-thold")     { params.freq_thold     = std::stof(argv[++i]); }
 								        else if (arg == "-su"  || arg == "--speed-up")       { params.speed_up       = true; }
 								        else if (arg == "-tr"  || arg == "--translate")      { params.translate      = true; }
 								        else if (arg == "-ps"  || arg == "--print-special")  { params.print_special  = true; }
 								        else if (arg == "-pe"  || arg == "--print-energy")   { params.print_energy   = true; }
 								        else if (arg == "-vp"  || arg == "--verbose-prompt") { params.verbose_prompt = true; }
 								        else if (arg == "-ng"  || arg == "--no-gpu")         { params.use_gpu        = false; }
 								        else if (arg == "-p"   || arg == "--person")         { params.person         = argv[++i]; }
 								        else if (arg == "--session")                         { params.path_session   = argv[++i];}
 								        else if (arg == "-l"   || arg == "--language")       { params.language       = argv[++i]; }
 								        else if (arg == "-mw"  || arg == "--model-whisper")  { params.model_wsp      = argv[++i]; }
 								        else if (arg == "-ml"  || arg == "--model-llama")    { params.model_llama    = argv[++i]; }
 								        else if (arg == "-s"   || arg == "--speak")          { params.speak          = argv[++i]; }
 								        else if (arg == "--prompt-file")                     {
-												talk-llama : add alpaca support (#668)


											
										
										
											2023-03-29 20:01:14 +00:00
+								            std::ifstream file(argv[++i]);
 								            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
 								            if (params.prompt.back() == '\n') {
 								                params.prompt.pop_back();
 								            }
 								        }
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								        else if (arg == "-f"   || arg == "--file")          { params.fname_out     = argv[++i]; }
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : updata java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
+								        else if (arg == "-ng"  || arg == "--no-gpu")        { params.use_gpu       = false; }
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								        else {
 								            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
 								            whisper_print_usage(argc, argv, params);
 								            exit(0);
 								        }
 								    }
 								    return true;
 								}
 								void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
 								    fprintf(stderr, "\n");
 								    fprintf(stderr, "usage: %s [options]\n", argv[0]);
 								    fprintf(stderr, "\n");
 								    fprintf(stderr, "options:\n");
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : updata java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
+								    fprintf(stderr, "  -h,       --help           [default] show this help message and exit\n");
 								    fprintf(stderr, "  -t N,     --threads N      [%-7d] number of threads to use during computation\n", params.n_threads);
 								    fprintf(stderr, "  -vms N,   --voice-ms N     [%-7d] voice duration in milliseconds\n",              params.voice_ms);
 								    fprintf(stderr, "  -c ID,    --capture ID     [%-7d] capture device ID\n",                           params.capture_id);
 								    fprintf(stderr, "  -mt N,    --max-tokens N   [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
 								    fprintf(stderr, "  -ac N,    --audio-ctx N    [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
-												talk-llama : enable GPU by default

											
										
										
											2023-11-15 19:32:25 +00:00
+								    fprintf(stderr, "  -ngl N,   --n-gpu-layers N [%-7d] number of layers to store in VRAM\n",           params.n_gpu_layers);
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : updata java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
+								    fprintf(stderr, "  -vth N,   --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
 								    fprintf(stderr, "  -fth N,   --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
 								    fprintf(stderr, "  -su,      --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
 								    fprintf(stderr, "  -tr,      --translate      [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
 								    fprintf(stderr, "  -ps,      --print-special  [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
 								    fprintf(stderr, "  -pe,      --print-energy   [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
 								    fprintf(stderr, "  -vp,      --verbose-prompt [%-7s] print prompt at start\n",                       params.verbose_prompt ? "true" : "false");
 								    fprintf(stderr, "  -ng,      --no-gpu         [%-7s] disable GPU\n",                                 params.use_gpu ? "false" : "true");
 								    fprintf(stderr, "  -p NAME,  --person NAME    [%-7s] person name (for prompt selection)\n",          params.person.c_str());
 								    fprintf(stderr, "  -l LANG,  --language LANG  [%-7s] spoken language\n",                             params.language.c_str());
 								    fprintf(stderr, "  -mw FILE, --model-whisper  [%-7s] whisper model file\n",                          params.model_wsp.c_str());
 								    fprintf(stderr, "  -ml FILE, --model-llama    [%-7s] llama model file\n",                            params.model_llama.c_str());
 								    fprintf(stderr, "  -s FILE,  --speak TEXT     [%-7s] command for TTS\n",                             params.speak.c_str());
 								    fprintf(stderr, "  --prompt-file FNAME        [%-7s] file with custom prompt to start dialog\n",     "");
 								    fprintf(stderr, "  --session FNAME                   file to cache model state in (may be large!) (default: none)\n");
 								    fprintf(stderr, "  -f FNAME, --file FNAME     [%-7s] text output file name\n",                       params.fname_out.c_str());
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								    fprintf(stderr, "\n");
 								}
 								// Transcribes the audio samples in `pcmf32` with whisper and returns the recognized text.
 								//
 								// Decoding uses the greedy sampling defaults with `no_context` and `single_segment` enabled;
 								// `prompt_text` is tokenized and handed to whisper as prompt tokens.
 								//
 								//   prob - set to the mean of the per-token value `p` over all tokens of all segments (0 if none)
 								//   t_ms - set to the wall-clock duration of the call in milliseconds
 								//
 								// Returns the concatenated text of all segments. If whisper_full() fails, an empty string is
 								// returned and `prob` / `t_ms` are left at 0.
 								std::string transcribe(
 								        whisper_context * ctx,
 								        const whisper_params & params,
 								        const std::vector<float> & pcmf32,
 								        const std::string prompt_text,
 								        float & prob,
 								        int64_t & t_ms) {
 								    const auto t_start = std::chrono::high_resolution_clock::now();
 								    prob = 0.0f;
 								    t_ms = 0;
 								    // tokenize the prompt into a fixed-size scratch buffer
 								    std::vector<whisper_token> prompt_tokens(1024);
 								    const int n_prompt_tokens = whisper_tokenize(ctx, prompt_text.c_str(), prompt_tokens.data(), (int) prompt_tokens.size());
 								    if (n_prompt_tokens < 0) {
 								        // a negative result signals failure (e.g. the prompt does not fit into the buffer);
 								        // feeding it to resize() would wrap around to a huge size, so run without a prompt instead
 								        fprintf(stderr, "%s: failed to tokenize the prompt - continuing without it\n", __func__);
 								        prompt_tokens.clear();
 								    } else {
 								        prompt_tokens.resize(n_prompt_tokens);
 								    }
 								    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
 								    wparams.print_progress   = false;
 								    wparams.print_special    = params.print_special;
 								    wparams.print_realtime   = false;
 								    wparams.print_timestamps = !params.no_timestamps;
 								    wparams.translate        = params.translate;
 								    wparams.no_context       = true;
 								    wparams.single_segment   = true;
 								    wparams.max_tokens       = params.max_tokens;
 								    wparams.language         = params.language.c_str();
 								    wparams.n_threads        = params.n_threads;
 								    wparams.prompt_tokens    = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
 								    wparams.prompt_n_tokens  = (int) prompt_tokens.size();
 								    wparams.audio_ctx        = params.audio_ctx;
 								    wparams.speed_up         = params.speed_up;
 								    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
 								        return "";
 								    }
 								    // gather the text and accumulate the token values for the average
 								    int prob_n = 0;
 								    std::string result;
 								    const int n_segments = whisper_full_n_segments(ctx);
 								    for (int i = 0; i < n_segments; ++i) {
 								        result += whisper_full_get_segment_text(ctx, i);
 								        const int n_tokens = whisper_full_n_tokens(ctx, i);
 								        for (int j = 0; j < n_tokens; ++j) {
 								            const auto token = whisper_full_get_token_data(ctx, i, j);
 								            prob += token.p;
 								            ++prob_n;
 								        }
 								    }
 								    // avoid dividing by zero when no tokens were produced
 								    if (prob_n > 0) {
 								        prob /= prob_n;
 								    }
 								    const auto t_end = std::chrono::high_resolution_clock::now();
 								    t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
 								    return result;
 								}
 								// Prompt template; `{1}` is a placeholder that is presumably replaced with a name before use
 								// (the substitution is not visible in this part of the file - confirm at the call site).
 								const std::string k_prompt_whisper = "A conversation with a person called {1}.";
-												talk-llama : add alpaca support (#668)


											
										
										
											2023-03-29 20:01:14 +00:00
+								const std::string k_prompt_llama = R"(Text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								{1} is helpful, kind, honest, friendly, good at writing and never fails to answer {0}’s requests immediately and with details and precision.
 								There are no annotations like (30 seconds passed...) or (to himself), just what {0} and {1} say aloud to each other.
 								The transcript only includes text, it does not include markup like HTML and Markdown.
 								{1} responds with short and concise answers.
 								{0}{4} Hello, {1}!
 								{1}{4} Hello {0}! How may I help you today?
 								{0}{4} What time is it?
 								{1}{4} It is {2} o'clock.
 								{0}{4} What year is it?
 								{1}{4} We are in {3}.
 								{0}{4} What is a cat?
 								{1}{4} A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
 								{0}{4} Name a color.
 								{1}{4} Blue
 								{0}{4})";
 								int main(int argc, char ** argv) {
 								    whisper_params params;
 								    if (whisper_params_parse(argc, argv, params) == false) {
 								        return 1;
 								    }
-												talk-llama : add language auto detect (#1467)

* Add '-l auto' to talk-llama example

* Update examples/talk-llama/talk-llama.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-09 17:21:44 +00:00
+								    if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
 								        whisper_print_usage(argc, argv, params);
 								        exit(0);
 								    }
 								    // whisper init
-												whisper : add context param to disable gpu (#1293)

* whisper : check state->ctx_metal not null

* whisper : add whisper_context_params { use_gpu }

* whisper : new API with params & deprecate old API

* examples : use no-gpu param && whisper_init_from_file_with_params

* whisper.objc : enable metal & disable on simulator

* whisper.swiftui, metal : enable metal & support load default.metallib

* whisper.android : use new API

* bindings : use new API

* addon.node : fix build & test

* bindings : updata java binding

* bindings : add missing whisper_context_default_params_by_ref WHISPER_API for java

* metal : use SWIFTPM_MODULE_BUNDLE for GGML_SWIFT and reuse library load

* metal : move bundle var into block

* metal : use SWIFT_PACKAGE instead of GGML_SWIFT

* style : minor updates

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2023-11-06 09:04:24 +00:00
+								    struct whisper_context_params cparams;
 								    cparams.use_gpu = params.use_gpu;
 								    struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
 								    // llama init
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								    llama_backend_init(true);
-												talk-llama : sync latest llama.cpp (close #922, close #954)

											
										
										
											2023-05-23 11:04:39 +00:00
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								    auto lmparams = llama_model_default_params();
-												examples : fix n_gpu_layers usage in talk-llama (#1441)


											
										
										
											2023-11-07 01:36:23 +00:00
+								    if (!params.use_gpu) {
-												talk-llama : fix n_gpu_layers usage again (#1442)


											
										
										
											2023-11-07 08:51:27 +00:00
+								        lmparams.n_gpu_layers = 0;
-												talk-llama : add n_gpu_layers parameter (#1475)


											
										
										
											2023-11-13 08:04:16 +00:00
+								    } else {
 								        lmparams.n_gpu_layers = params.n_gpu_layers;
-												examples : fix n_gpu_layers usage in talk-llama (#1441)


											
										
										
											2023-11-07 01:36:23 +00:00
+								    }
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								    struct llama_model * model_llama = llama_load_model_from_file(params.model_llama.c_str(), lmparams);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								    llama_context_params lcparams = llama_context_default_params();
 								    // tune these to your liking
 								    lcparams.n_ctx      = 2048;
 								    lcparams.seed       = 1;
 								    lcparams.f16_kv     = true;
 								    lcparams.n_threads  = params.n_threads;
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								    struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
 								    // print some info about the processing
 								    {
 								        fprintf(stderr, "\n");
 								        if (!whisper_is_multilingual(ctx_wsp)) {
 								            if (params.language != "en" || params.translate) {
 								                params.language = "en";
 								                params.translate = false;
 								                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
 								            }
 								        }
 								        fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
 								                __func__,
 								                params.n_threads,
 								                params.language.c_str(),
 								                params.translate ? "translate" : "transcribe",
 								                params.no_timestamps ? 0 : 1);
 								        fprintf(stderr, "\n");
 								    }
 								    // init audio
 								    audio_async audio(30*1000);
 								    if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
 								        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
 								        return 1;
 								    }
 								    audio.resume();
 								    bool is_running  = true;
 								    bool force_speak = false;
 								    float prob0 = 0.0f;
 								    const std::string chat_symb = ":";
 								    const std::string bot_name  = "LLaMA";
 								    std::vector<float> pcmf32_cur;
 								    std::vector<float> pcmf32_prompt;
 								    const std::string prompt_whisper = ::replace(k_prompt_whisper, "{1}", bot_name);
 								    // construct the initial prompt for LLaMA inference
-												talk-llama : add alpaca support (#668)


											
										
										
											2023-03-29 20:01:14 +00:00
+								    std::string prompt_llama = params.prompt.empty() ? k_prompt_llama : params.prompt;
 								    // need to have leading ' '
 								    prompt_llama.insert(0, 1, ' ');
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
 								    prompt_llama = ::replace(prompt_llama, "{0}", params.person);
 								    prompt_llama = ::replace(prompt_llama, "{1}", bot_name);
 								    {
 								        // get time string
 								        std::string time_str;
 								        {
 								            time_t t = time(0);
 								            struct tm * now = localtime(&t);
 								            char buf[128];
 								            strftime(buf, sizeof(buf), "%H:%M", now);
 								            time_str = buf;
 								        }
 								        prompt_llama = ::replace(prompt_llama, "{2}", time_str);
 								    }
 								    {
 								        // get year string
 								        std::string year_str;
 								        {
 								            time_t t = time(0);
 								            struct tm * now = localtime(&t);
 								            char buf[128];
 								            strftime(buf, sizeof(buf), "%Y", now);
 								            year_str = buf;
 								        }
 								        prompt_llama = ::replace(prompt_llama, "{3}", year_str);
 								    }
 								    prompt_llama = ::replace(prompt_llama, "{4}", chat_symb);
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
+								    // init session
 								    std::string path_session = params.path_session;
 								    std::vector<llama_token> session_tokens;
-												talk-llama : fix session prompt load (#854)


											
										
										
											2023-05-02 17:05:27 +00:00
+								    auto embd_inp = ::llama_tokenize(ctx_llama, prompt_llama, true);
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
 								    if (!path_session.empty()) {
 								        fprintf(stderr, "%s: attempting to load saved session from %s\n", __func__, path_session.c_str());
 								        // fopen to check for existing session
 								        FILE * fp = std::fopen(path_session.c_str(), "rb");
 								        if (fp != NULL) {
 								            std::fclose(fp);
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								            session_tokens.resize(llama_n_ctx(ctx_llama));
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
+								            size_t n_token_count_out = 0;
 								            if (!llama_load_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
 								                fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
 								                return 1;
 								            }
 								            session_tokens.resize(n_token_count_out);
-												talk-llama : fix session prompt load (#854)


											
										
										
											2023-05-02 17:05:27 +00:00
+								            for (size_t i = 0; i < session_tokens.size(); i++) {
 								                embd_inp[i] = session_tokens[i];
 								            }
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
 								            fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
 								        } else {
 								            fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
 								        }
 								    }
-												talk-llama : fix session prompt load (#854)


											
										
										
											2023-05-02 17:05:27 +00:00
+								    // evaluate the initial prompt
 								    printf("\n");
 								    printf("%s : initializing - please wait ...\n", __func__);
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								    if (llama_eval(ctx_llama, embd_inp.data(), embd_inp.size(), 0)) {
-												talk-llama : fix session prompt load (#854)


											
										
										
											2023-05-02 17:05:27 +00:00
+								        fprintf(stderr, "%s : failed to eval\n", __func__);
 								        return 1;
 								    }
 								    if (params.verbose_prompt) {
 								        fprintf(stdout, "\n");
 								        fprintf(stdout, "%s", prompt_llama.c_str());
 								        fflush(stdout);
 								    }
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
+								     // debug message about similarity of saved session, if applicable
 								    size_t n_matching_session_tokens = 0;
 								    if (session_tokens.size()) {
 								        for (llama_token id : session_tokens) {
 								            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
 								                break;
 								            }
 								            n_matching_session_tokens++;
 								        }
 								        if (n_matching_session_tokens >= embd_inp.size()) {
 								            fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
 								        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
 								            fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
 								                __func__, n_matching_session_tokens, embd_inp.size());
 								        } else {
 								            fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
 								                __func__, n_matching_session_tokens, embd_inp.size());
 								        }
 								    }
 								    // HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
 								    // if we loaded a session with at least 75% similarity. It's currently just used to speed up the
 								    // initial prompt so it doesn't need to be an exact match.
 								    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								    printf("%s : done! start speaking in the microphone\n", __func__);
 								    printf("\n");
 								    printf("%s%s", params.person.c_str(), chat_symb.c_str());
 								    fflush(stdout);
 								    // clear audio buffer
 								    audio.clear();
 								    // text inference variables
 								    const int voice_id = 2;
 								    const int n_keep   = embd_inp.size();
 								    const int n_ctx    = llama_n_ctx(ctx_llama);
 								    int n_past = n_keep;
 								    int n_prev = 64; // TODO arg
-												talk-llama : fix session prompt load (#854)


											
										
										
											2023-05-02 17:05:27 +00:00
+								    int n_session_consumed = !path_session.empty() && session_tokens.size() > 0 ? session_tokens.size() : 0;
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
 								    std::vector<llama_token> embd;
 								    // reverse prompts for detecting when it's time to stop speaking
 								    std::vector<std::string> antiprompts = {
 								        params.person + chat_symb,
 								    };
 								    // main loop
 								    while (is_running) {
 								        // handle Ctrl + C
 								        is_running = sdl_poll_events();
 								        if (!is_running) {
 								            break;
 								        }
 								        // delay
 								        std::this_thread::sleep_for(std::chrono::milliseconds(100));
 								        int64_t t_ms = 0;
 								        {
 								            audio.get(2000, pcmf32_cur);
 								            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
 								                //fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
 								                audio.get(params.voice_ms, pcmf32_cur);
 								                std::string text_heard;
 								                if (!force_speak) {
 								                    text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prompt_whisper, prob0, t_ms));
 								                }
 								                // remove text between brackets using regex
 								                {
 								                    std::regex re("\\[.*?\\]");
 								                    text_heard = std::regex_replace(text_heard, re, "");
 								                }
 								                // remove text between brackets using regex
 								                {
 								                    std::regex re("\\(.*?\\)");
 								                    text_heard = std::regex_replace(text_heard, re, "");
 								                }
 								                // remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
 								                text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
 								                // take first line
 								                text_heard = text_heard.substr(0, text_heard.find_first_of('\n'));
 								                // remove leading and trailing whitespace
 								                text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
 								                text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
 								                const std::vector<llama_token> tokens = llama_tokenize(ctx_llama, text_heard.c_str(), false);
 								                if (text_heard.empty() || tokens.empty() || force_speak) {
 								                    //fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
 								                    audio.clear();
 								                    continue;
 								                }
 								                force_speak = false;
 								                text_heard.insert(0, 1, ' ');
 								                text_heard += "\n" + bot_name + chat_symb;
 								                fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
 								                fflush(stdout);
 								                embd = ::llama_tokenize(ctx_llama, text_heard, false);
-												talk-llama : fix session prompt load (#854)


											
										
										
											2023-05-02 17:05:27 +00:00
+								                // Append the new input tokens to the session_tokens vector
 								                if (!path_session.empty()) {
 								                    session_tokens.insert(session_tokens.end(), tokens.begin(), tokens.end());
 								                }
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                // text inference
 								                bool done = false;
 								                std::string text_to_speak;
 								                while (true) {
 								                    // predict
 								                    if (embd.size() > 0) {
 								                        if (n_past + (int) embd.size() > n_ctx) {
 								                            n_past = n_keep;
 								                            // insert n_left/2 tokens at the start of embd from last_n_tokens
 								                            embd.insert(embd.begin(), embd_inp.begin() + embd_inp.size() - n_prev, embd_inp.end());
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
+								                            // stop saving session if we run out of context
 								                            path_session = "";
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                            //printf("\n---\n");
 								                            //printf("resetting: '");
 								                            //for (int i = 0; i < (int) embd.size(); i++) {
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								                            //    printf("%s", llama_token_to_piece(ctx_llama, embd[i]));
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                            //}
 								                            //printf("'\n");
 								                            //printf("\n---\n");
 								                        }
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
+								                        // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
 								                        // REVIEW
 								                        if (n_session_consumed < (int) session_tokens.size()) {
 								                            size_t i = 0;
 								                            for ( ; i < embd.size(); i++) {
 								                                if (embd[i] != session_tokens[n_session_consumed]) {
 								                                    session_tokens.resize(n_session_consumed);
 								                                    break;
 								                                }
 								                                n_past++;
 								                                n_session_consumed++;
 								                                if (n_session_consumed >= (int) session_tokens.size()) {
 								                                    i++;
 								                                    break;
 								                                }
 								                            }
 								                            if (i > 0) {
 								                                embd.erase(embd.begin(), embd.begin() + i);
 								                            }
 								                        }
-												talk-llama : fix session prompt load (#854)


											
										
										
											2023-05-02 17:05:27 +00:00
+								                        if (embd.size() > 0 && !path_session.empty()) {
 								                            session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
 								                            n_session_consumed = session_tokens.size();
 								                        }
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								                        if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past)) {
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                            fprintf(stderr, "%s : failed to eval\n", __func__);
 								                            return 1;
 								                        }
 								                    }
 								                    embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
 								                    n_past += embd.size();
-												talk-llama : fix build + sync latest llama.cpp

											
										
										
											2023-05-14 15:46:19 +00:00
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                    embd.clear();
 								                    if (done) break;
 								                    {
 								                        // out of user input, sample next token
 								                        const float top_k          = 5;
 								                        const float top_p          = 0.80f;
 								                        const float temp           = 0.30f;
 								                        const float repeat_penalty = 1.1764f;
 								                        const int repeat_last_n    = 256;
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
+								                        if (!path_session.empty() && need_to_save_session) {
 								                            need_to_save_session = false;
 								                            llama_save_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
-												talk-llama : fix build + sync latest llama.cpp

											
										
										
											2023-05-14 15:46:19 +00:00
+								                        }
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                        llama_token id = 0;
 								                        {
 								                            auto logits = llama_get_logits(ctx_llama);
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								                            auto n_vocab = llama_n_vocab(model_llama);
-												whisper : add integer quantization support (#540)

* whisper : add integer quantization support

* examples : add common-ggml + prepare to add "quantize" tool

* whisper : quantization tool ready

* whisper : fix F32 support

* whisper : try to fix shared lib linkage

* wasm : update quantized models to Q5

* bench.wasm : remove "medium" button

* bench.wasm : fix custom model button

* ggml : add Q5_0 and Q5_1 WASM SIMD

* wasm : add quantized models to all WASM examples

* wasm : bump DB version number to 2

* talk-llama : update example to latest llama.cpp

* node : increase test timeout to 10s

* readme : add information for model quantization

* wasm : add links to other examples
											
										
										
											2023-04-30 15:51:57 +00:00
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								                            logits[llama_token_eos(model_llama)] = 0;
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
-												whisper : add integer quantization support (#540)

* whisper : add integer quantization support

* examples : add common-ggml + prepare to add "quantize" tool

* whisper : quantization tool ready

* whisper : fix F32 support

* whisper : try to fix shared lib linkage

* wasm : update quantized models to Q5

* bench.wasm : remove "medium" button

* bench.wasm : fix custom model button

* ggml : add Q5_0 and Q5_1 WASM SIMD

* wasm : add quantized models to all WASM examples

* wasm : bump DB version number to 2

* talk-llama : update example to latest llama.cpp

* node : increase test timeout to 10s

* readme : add information for model quantization

* wasm : add links to other examples
											
										
										
											2023-04-30 15:51:57 +00:00
+								                            std::vector<llama_token_data> candidates;
 								                            candidates.reserve(n_vocab);
 								                            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
 								                                candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
 								                            }
 								                            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 								                            // apply repeat penalty
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								                            const float nl_logit = logits[llama_token_nl(model_llama)];
-												whisper : add integer quantization support (#540)

* whisper : add integer quantization support

* examples : add common-ggml + prepare to add "quantize" tool

* whisper : quantization tool ready

* whisper : fix F32 support

* whisper : try to fix shared lib linkage

* wasm : update quantized models to Q5

* bench.wasm : remove "medium" button

* bench.wasm : fix custom model button

* ggml : add Q5_0 and Q5_1 WASM SIMD

* wasm : add quantized models to all WASM examples

* wasm : bump DB version number to 2

* talk-llama : update example to latest llama.cpp

* node : increase test timeout to 10s

* readme : add information for model quantization

* wasm : add links to other examples
											
										
										
											2023-04-30 15:51:57 +00:00
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								                            llama_sample_repetition_penalties(ctx_llama, &candidates_p,
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                                    embd_inp.data() + std::max(0, n_past - repeat_last_n),
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								                                    repeat_last_n, repeat_penalty, 0.0, 0.0f);
-												whisper : add integer quantization support (#540)

* whisper : add integer quantization support

* examples : add common-ggml + prepare to add "quantize" tool

* whisper : quantization tool ready

* whisper : fix F32 support

* whisper : try to fix shared lib linkage

* wasm : update quantized models to Q5

* bench.wasm : remove "medium" button

* bench.wasm : fix custom model button

* ggml : add Q5_0 and Q5_1 WASM SIMD

* wasm : add quantized models to all WASM examples

* wasm : bump DB version number to 2

* talk-llama : update example to latest llama.cpp

* node : increase test timeout to 10s

* readme : add information for model quantization

* wasm : add links to other examples
											
										
										
											2023-04-30 15:51:57 +00:00
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								                            logits[llama_token_nl(model_llama)] = nl_logit;
-												whisper : add integer quantization support (#540)

* whisper : add integer quantization support

* examples : add common-ggml + prepare to add "quantize" tool

* whisper : quantization tool ready

* whisper : fix F32 support

* whisper : try to fix shared lib linkage

* wasm : update quantized models to Q5

* bench.wasm : remove "medium" button

* bench.wasm : fix custom model button

* ggml : add Q5_0 and Q5_1 WASM SIMD

* wasm : add quantized models to all WASM examples

* wasm : bump DB version number to 2

* talk-llama : update example to latest llama.cpp

* node : increase test timeout to 10s

* readme : add information for model quantization

* wasm : add links to other examples
											
										
										
											2023-04-30 15:51:57 +00:00
 								                            if (temp <= 0) {
 								                                // Greedy sampling
 								                                id = llama_sample_token_greedy(ctx_llama, &candidates_p);
 								                            } else {
 								                                // Temperature sampling
-												talk-llama : fix build + sync latest llama.cpp

											
										
										
											2023-05-14 15:46:19 +00:00
+								                                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
 								                                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								                                llama_sample_temp (ctx_llama, &candidates_p, temp);
-												whisper : add integer quantization support (#540)

* whisper : add integer quantization support

* examples : add common-ggml + prepare to add "quantize" tool

* whisper : quantization tool ready

* whisper : fix F32 support

* whisper : try to fix shared lib linkage

* wasm : update quantized models to Q5

* bench.wasm : remove "medium" button

* bench.wasm : fix custom model button

* ggml : add Q5_0 and Q5_1 WASM SIMD

* wasm : add quantized models to all WASM examples

* wasm : bump DB version number to 2

* talk-llama : update example to latest llama.cpp

* node : increase test timeout to 10s

* readme : add information for model quantization

* wasm : add links to other examples
											
										
										
											2023-04-30 15:51:57 +00:00
+								                                id = llama_sample_token(ctx_llama, &candidates_p);
 								                            }
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                        }
-												sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.)

* metal : allow env metal variable to override resource path (#1415)

* Allow env variable to override resource path

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : restore common / main from `master`

* sync : restore whisper from `master`

* talk-llama : update to latest llama.cpp

* ruby : fix build

* ggml : fix 32-bit ARM build

* ggml : fix MIN / MAX macro collisions + update ios bindings

* ggml : fix ifdefs and MIN / MAX again

* examples : fix Obj-C and Swift examples

* ggml : fix 32-bit ARM compatibility

* ggml : one more attempt to fix 32-bit ARM compat

* whisper : fix support for larger graphs

---------

Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
											
										
										
											2023-11-03 19:35:05 +00:00
+								                        if (id != llama_token_eos(model_llama)) {
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                            // add it to the context
 								                            embd.push_back(id);
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								                            text_to_speak += llama_token_to_piece(ctx_llama, id);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								                            printf("%s", llama_token_to_piece(ctx_llama, id).c_str());
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                        }
 								                    }
 								                    {
 								                        std::string last_output;
 								                        for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								                            last_output += llama_token_to_piece(ctx_llama, embd_inp[i]);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                        }
-												talk-llama : update to latest llama.cpp

											
										
										
											2023-09-15 17:06:31 +00:00
+								                        last_output += llama_token_to_piece(ctx_llama, embd[0]);
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
 								                        for (std::string & antiprompt : antiprompts) {
 								                            if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
 								                                done = true;
 								                                text_to_speak = ::replace(text_to_speak, antiprompt, "");
 								                                fflush(stdout);
-												talk-llama : add --session support (#845)

* feat: adding session support

* readme: adding --session info in examples/talk-llama

* llama: adding session fixes

* readme: updating session doc

* talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction

* talk-llama: adding missing function which updates session_tokens
											
										
										
											2023-05-01 17:18:10 +00:00
+								                                need_to_save_session = true;
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
+								                                break;
 								                            }
 								                        }
 								                    }
 								                    is_running = sdl_poll_events();
 								                    if (!is_running) {
 								                        break;
 								                    }
 								                }
-												talk-llama : improve quote and backtick handling (#1364)

* ISSUE-1329: replace " with ' so it doesn't try to execute code in backticks.

* Typo

* Update to keep possessives in the output

Closes the ' then puts a ' in quotes then reopens the ' to escape the ' characters.
											
										
										
											2023-11-16 08:34:05 +00:00
+								                text_to_speak = ::replace(text_to_speak, "'", "'\"'\"'");
 								                int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + text_to_speak + "'").c_str());
-												examples : fix build + compile warnings (close #1256)

											
										
										
											2023-09-07 09:33:12 +00:00
+								                if (ret != 0) {
 								                    fprintf(stderr, "%s: failed to speak\n", __func__);
 								                }
-												talk-llama : add new example + sync ggml from llama.cpp (#664)

* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
											
										
										
											2023-03-27 18:00:32 +00:00
 								                audio.clear();
 								            }
 								        }
 								    }
 								    audio.pause();
 								    whisper_print_timings(ctx_wsp);
 								    whisper_free(ctx_wsp);
 								    llama_print_timings(ctx_llama);
 								    llama_free(ctx_llama);
 								    return 0;
 								}