
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
#define _USE_MATH_DEFINES // For M_PI on MSVC
#include "ggml-impl.h"
#include "ggml-quants.h"
#include "ggml.h"
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
#include <alloca.h>
#endif
#include <assert.h>
#include <errno.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>
#include <float.h>
#include <limits.h>
#include <stdarg.h>
#include <signal.h>
#if defined(__gnu_linux__)
#include <syscall.h>
#endif
#ifdef GGML_USE_METAL
#include <unistd.h>
#endif
#if defined(_MSC_VER)
// disable "possible loss of data" to avoid hundreds of casts
// we should just be careful :)
#pragma warning(disable: 4244 4267)
// disable POSIX deprecation warnings
// these functions are never going away, anyway
#pragma warning(disable: 4996)
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;
static void atomic_store(atomic_int * ptr, LONG val) {
InterlockedExchange(ptr, val);
}
static LONG atomic_load(atomic_int * ptr) {
return InterlockedCompareExchange(ptr, 0, 0);
}
static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
return InterlockedExchangeAdd(ptr, inc);
}
static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
return atomic_fetch_add(ptr, -(dec));
}
typedef HANDLE pthread_t;
typedef DWORD thread_ret_t;
static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) {
(void) unused;
HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
if (handle == NULL)
{
return EAGAIN;
}
*out = handle;
return 0;
}
static int pthread_join(pthread_t thread, void * unused) {
(void) unused;
int ret = (int) WaitForSingleObject(thread, INFINITE);
CloseHandle(thread);
return ret;
}
static int sched_yield (void) {
Sleep (0);
return 0;
}
#else
#include <pthread.h>
#include <stdatomic.h>
typedef void * thread_ret_t;
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>
#endif
#if defined(__APPLE__)
#include <TargetConditionals.h>
#endif
#if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
(!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
#include <sys/wait.h>
void ggml_print_backtrace(void) {
/*
#include <execinfo.h>
#include <dlfcn.h>
void * trace[100];
int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
*/
// backtrace_symbols does not show line numbers, use gdb instead
char attach[32];
snprintf(attach, sizeof(attach), "attach %d", getpid());
int pid = fork();
if (pid == 0) {
execlp("gdb", "gdb", "--batch",
"-ex", "set style enabled on",
"-ex", attach,
"-ex", "bt -frame-info source-and-location",
"-ex", "detach",
"-ex", "quit",
(char *) NULL);
} else {
waitpid(pid, NULL, 0);
}
}
#else
void ggml_print_backtrace(void) {
// platform not supported
}
#endif
/*#define GGML_PERF*/
#define GGML_DEBUG 0
#define GGML_GELU_FP16
#define GGML_GELU_QUICK_FP16
#define GGML_SILU_FP16
// #define GGML_CROSS_ENTROPY_EXP_FP16
// #define GGML_FLASH_ATTN_EXP_FP16
#define GGML_SOFT_MAX_UNROLL 4
#define GGML_VEC_DOT_UNROLL 2
#define GGML_VEC_MAD_UNROLL 32
//
// logging
//
#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif
#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif
#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif
#define GGML_PRINT(...) printf(__VA_ARGS__)
//
// end of logging block
//
#ifdef GGML_USE_ACCELERATE
// uncomment to use vDSP for soft max computation
// note: not sure if it is actually faster
//#define GGML_SOFT_MAX_ACCELERATE
#endif
#if defined(_MSC_VER) || defined(__MINGW32__)
#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
#else
inline static void * ggml_aligned_malloc(size_t size) {
if (size == 0) {
GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
return NULL;
}
void * aligned_memory = NULL;
#ifdef GGML_USE_CPU_HBM
int result = hbw_posix_memalign(&aligned_memory, 16, size);
#elif defined(GGML_USE_METAL)
int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
#else
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
#endif
if (result != 0) {
// Handle allocation failure
const char *error_desc = "unknown allocation error";
switch (result) {
case EINVAL:
error_desc = "invalid alignment value";
break;
case ENOMEM:
error_desc = "insufficient memory";
break;
}
GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
GGML_ASSERT(false);
return NULL;
}
return aligned_memory;
}
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
#ifdef GGML_USE_CPU_HBM
#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
#else
#define GGML_ALIGNED_FREE(ptr) free(ptr)
#endif
#endif
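// illustrative pairing of the aligned allocation macros above (sketch, not called here;
// n_bytes is a placeholder):
//
//     void * buf = GGML_ALIGNED_MALLOC(n_bytes); // GGML_MEM_ALIGN alignment (page size on Metal, 16 with HBM)
//     if (buf) {
//         // ... use buf ...
//         GGML_ALIGNED_FREE(buf);
//     }
//
// the two macros must always be used together, since the backing allocator differs per platform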
inline static void * ggml_malloc(size_t size) {
if (size == 0) {
GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
return NULL;
}
void * result = malloc(size);
if (result == NULL) {
GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
GGML_ASSERT(false);
}
return result;
}
// calloc
inline static void * ggml_calloc(size_t num, size_t size) {
if (num == 0 || size == 0) {
GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
return NULL;
}
void * result = calloc(num, size);
if (result == NULL) {
GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
GGML_ASSERT(false);
}
return result;
}
#define GGML_MALLOC(size) ggml_malloc(size)
#define GGML_CALLOC(num, size) ggml_calloc(num, size)
#define GGML_FREE(ptr) free(ptr)
#define UNUSED GGML_UNUSED
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
#include "ggml-opencl.h"
#elif defined(GGML_USE_VULKAN)
#include "ggml-vulkan.h"
#endif
#elif defined(GGML_USE_OPENBLAS)
#if defined(GGML_BLAS_USE_MKL)
#include <mkl.h>
#else
#include <cblas.h>
#endif
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#elif defined(GGML_USE_VULKAN)
#include "ggml-vulkan.h"
#endif
// floating point type used to accumulate sums
typedef double ggml_float;
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
//
// global data
//
// precomputed gelu table for f16 (128 KB)
static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
// precomputed quick gelu table for f16 (128 KB)
static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
// precomputed silu table for f16 (128 KB)
static ggml_fp16_t ggml_table_silu_f16[1 << 16];
// precomputed exp table for f16 (128 KB)
static ggml_fp16_t ggml_table_exp_f16[1 << 16];
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
float ggml_table_f32_f16[1 << 16];
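// the f16 tables above have one entry per possible fp16 bit pattern and are indexed by the
// raw 16-bit value of the input, turning the corresponding function evaluation into a single
// table lookup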
const char * ggml_status_to_string(enum ggml_status status) {
switch (status) {
case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
case GGML_STATUS_SUCCESS: return "GGML status: success";
case GGML_STATUS_ABORTED: return "GGML status: warning (operation aborted)";
}
return "GGML status: unknown";
}
// note: do not use these inside ggml.c
// these are meant to be used via the ggml.h API
float ggml_fp16_to_fp32(ggml_fp16_t x) {
return GGML_FP16_TO_FP32(x);
}
ggml_fp16_t ggml_fp32_to_fp16(float x) {
return GGML_FP32_TO_FP16(x);
}
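// illustrative round-trip through the public API (user code; per the note above, these are not
// meant to be called from inside ggml.c):
//
//     ggml_fp16_t h = ggml_fp32_to_fp16(0.5f);
//     float       f = ggml_fp16_to_fp32(h); // 0.5f is exactly representable; in general fp16 rounds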
void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) {
for (int i = 0; i < n; i++) {
y[i] = GGML_FP16_TO_FP32(x[i]);
}
}
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
int i = 0;
#if defined(__F16C__)
for (; i + 7 < n; i += 8) {
__m256 x_vec = _mm256_loadu_ps(x + i);
__m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm_storeu_si128((__m128i *)(y + i), y_vec);
}
for(; i + 3 < n; i += 4) {
__m128 x_vec = _mm_loadu_ps(x + i);
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm_storel_epi64((__m128i *)(y + i), y_vec);
}
#endif
for (; i < n; i++) {
y[i] = GGML_FP32_TO_FP16(x[i]);
}
}
bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) {
return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0;
}
//
// timing
//
#if defined(_MSC_VER) || defined(__MINGW32__)
static int64_t timer_freq, timer_start;
void ggml_time_init(void) {
LARGE_INTEGER t;
QueryPerformanceFrequency(&t);
timer_freq = t.QuadPart;
// The multiplication by 1000 or 1000000 below can cause an overflow if timer_freq
// and the uptime are high enough.
// We subtract the program start time to reduce the likelihood of that happening.
QueryPerformanceCounter(&t);
timer_start = t.QuadPart;
}
int64_t ggml_time_ms(void) {
LARGE_INTEGER t;
QueryPerformanceCounter(&t);
return ((t.QuadPart-timer_start) * 1000) / timer_freq;
}
int64_t ggml_time_us(void) {
LARGE_INTEGER t;
QueryPerformanceCounter(&t);
return ((t.QuadPart-timer_start) * 1000000) / timer_freq;
}
#else
void ggml_time_init(void) {}
int64_t ggml_time_ms(void) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
}
int64_t ggml_time_us(void) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
}
#endif
int64_t ggml_cycles(void) {
return clock();
}
int64_t ggml_cycles_per_ms(void) {
return CLOCKS_PER_SEC/1000;
}
#ifdef GGML_PERF
#define ggml_perf_time_ms() ggml_time_ms()
#define ggml_perf_time_us() ggml_time_us()
#define ggml_perf_cycles() ggml_cycles()
#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms()
#else
#define ggml_perf_time_ms() 0
#define ggml_perf_time_us() 0
#define ggml_perf_cycles() 0
#define ggml_perf_cycles_per_ms() 0
#endif
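// typical use of the timing helpers above (sketch): ggml_time_init() must run once at startup
// (on Windows it captures the QueryPerformanceCounter frequency and start time), after which
// intervals can be measured as
//
//     const int64_t t_start_us = ggml_time_us();
//     // ... work ...
//     const int64_t t_elapsed_us = ggml_time_us() - t_start_us;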
//
// cross-platform UTF-8 file paths
//
#ifdef _WIN32
static wchar_t * ggml_mbstowcs(const char * mbs) {
int wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, NULL, 0);
if (!wlen) {
errno = EINVAL;
return NULL;
}
wchar_t * wbuf = GGML_MALLOC(wlen * sizeof(wchar_t));
wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, wbuf, wlen);
if (!wlen) {
GGML_FREE(wbuf);
errno = EINVAL;
return NULL;
}
return wbuf;
}
#endif
FILE * ggml_fopen(const char * fname, const char * mode) {
#ifdef _WIN32
FILE * file = NULL;
// convert fname (UTF-8)
wchar_t * wfname = ggml_mbstowcs(fname);
if (wfname) {
// convert mode (ANSI)
wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
wchar_t * wmode_p = wmode;
do {
*wmode_p++ = (wchar_t)*mode;
} while (*mode++);
// open file
file = _wfopen(wfname, wmode);
GGML_FREE(wfname);
GGML_FREE(wmode);
}
return file;
#else
return fopen(fname, mode);
#endif
}
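// example (sketch, path is illustrative): ggml_fopen is a drop-in fopen replacement that accepts
// UTF-8 paths on every platform - on Windows the name is converted to UTF-16 and _wfopen is used:
//
//     FILE * fin = ggml_fopen("models/ggml-base.bin", "rb");
//     if (fin) {
//         // ... read ...
//         fclose(fin);
//     }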
//
// cache line
//
#if defined(__cpp_lib_hardware_interference_size)
#define CACHE_LINE_SIZE hardware_destructive_interference_size
#else
#if defined(__POWER9_VECTOR__)
#define CACHE_LINE_SIZE 128
#else
#define CACHE_LINE_SIZE 64
#endif
#endif
static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
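// CACHE_LINE_SIZE_F32 is the cache line size expressed in floats; padding per-thread float
// buffers to a multiple of it keeps threads from writing to the same cache line (false sharing)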
static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
[GGML_TYPE_I8] = {
.type_name = "i8",
.blck_size = 1,
.type_size = sizeof(int8_t),
.is_quantized = false,
},
[GGML_TYPE_I16] = {
.type_name = "i16",
.blck_size = 1,
.type_size = sizeof(int16_t),
.is_quantized = false,
},
[GGML_TYPE_I32] = {
.type_name = "i32",
.blck_size = 1,
.type_size = sizeof(int32_t),
.is_quantized = false,
},
[GGML_TYPE_I64] = {
.type_name = "i64",
.blck_size = 1,
.type_size = sizeof(int64_t),
.is_quantized = false,
},
[GGML_TYPE_F64] = {
.type_name = "f64",
.blck_size = 1,
.type_size = sizeof(double),
.is_quantized = false,
.nrows = 1,
},
[GGML_TYPE_F32] = {
.type_name = "f32",
.blck_size = 1,
.type_size = sizeof(float),
.is_quantized = false,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_F16] = {
.type_name = "f16",
.blck_size = 1,
.type_size = sizeof(ggml_fp16_t),
.is_quantized = false,
.to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
.from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
.vec_dot_type = GGML_TYPE_F16,
.nrows = 1,
},
[GGML_TYPE_Q4_0] = {
.type_name = "q4_0",
.blck_size = QK4_0,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_0,
.from_float = quantize_row_q4_0,
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
.vec_dot = ggml_vec_dot_q4_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
#if defined (__ARM_FEATURE_MATMUL_INT8)
.nrows = 2,
#else
.nrows = 1,
#endif
},
[GGML_TYPE_Q4_1] = {
.type_name = "q4_1",
.blck_size = QK4_1,
.type_size = sizeof(block_q4_1),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_1,
.from_float = quantize_row_q4_1,
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
.vec_dot = ggml_vec_dot_q4_1_q8_1,
.vec_dot_type = GGML_TYPE_Q8_1,
#if defined (__ARM_FEATURE_MATMUL_INT8)
.nrows = 2,
#else
.nrows = 1,
#endif
},
[4] = { // GGML_TYPE_Q4_2
.type_name = "DEPRECATED",
.blck_size = 0,
.type_size = 0,
.is_quantized = false,
.to_float = NULL,
.from_float = NULL,
.from_float_reference = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_COUNT,
.nrows = 1,
},
[5] = { // GGML_TYPE_Q4_3
.type_name = "DEPRECATED",
.blck_size = 0,
.type_size = 0,
.is_quantized = false,
.to_float = NULL,
.from_float = NULL,
.from_float_reference = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_COUNT,
.nrows = 1,
},
[GGML_TYPE_Q5_0] = {
.type_name = "q5_0",
.blck_size = QK5_0,
.type_size = sizeof(block_q5_0),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q5_0,
.from_float = quantize_row_q5_0,
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
.vec_dot = ggml_vec_dot_q5_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
},
[GGML_TYPE_Q5_1] = {
.type_name = "q5_1",
.blck_size = QK5_1,
.type_size = sizeof(block_q5_1),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q5_1,
.from_float = quantize_row_q5_1,
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
.vec_dot = ggml_vec_dot_q5_1_q8_1,
.vec_dot_type = GGML_TYPE_Q8_1,
.nrows = 1,
},
[GGML_TYPE_Q8_0] = {
.type_name = "q8_0",
.blck_size = QK8_0,
.type_size = sizeof(block_q8_0),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q8_0,
.from_float = quantize_row_q8_0,
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
.vec_dot = ggml_vec_dot_q8_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
#if defined (__ARM_FEATURE_MATMUL_INT8)
.nrows = 2,
#else
.nrows = 1,
#endif
},
[GGML_TYPE_Q8_1] = {
.type_name = "q8_1",
.blck_size = QK8_1,
.type_size = sizeof(block_q8_1),
.is_quantized = true,
.from_float = quantize_row_q8_1,
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
.vec_dot_type = GGML_TYPE_Q8_1,
.nrows = 1,
},
[GGML_TYPE_Q2_K] = {
.type_name = "q2_K",
.blck_size = QK_K,
.type_size = sizeof(block_q2_K),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q2_K,
.from_float = quantize_row_q2_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
.vec_dot = ggml_vec_dot_q2_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_Q3_K] = {
.type_name = "q3_K",
.blck_size = QK_K,
.type_size = sizeof(block_q3_K),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q3_K,
.from_float = quantize_row_q3_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
.vec_dot = ggml_vec_dot_q3_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_Q4_K] = {
.type_name = "q4_K",
.blck_size = QK_K,
.type_size = sizeof(block_q4_K),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_K,
.from_float = quantize_row_q4_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
.vec_dot = ggml_vec_dot_q4_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_Q5_K] = {
.type_name = "q5_K",
.blck_size = QK_K,
.type_size = sizeof(block_q5_K),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q5_K,
.from_float = quantize_row_q5_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
.vec_dot = ggml_vec_dot_q5_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_Q6_K] = {
.type_name = "q6_K",
.blck_size = QK_K,
.type_size = sizeof(block_q6_K),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q6_K,
.from_float = quantize_row_q6_K,
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
.vec_dot = ggml_vec_dot_q6_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_IQ2_XXS] = {
.type_name = "iq2_xxs",
.blck_size = QK_K,
.type_size = sizeof(block_iq2_xxs),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
.from_float = NULL,
.from_float_reference = NULL,
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_IQ2_XS] = {
.type_name = "iq2_xs",
.blck_size = QK_K,
.type_size = sizeof(block_iq2_xs),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
.from_float = NULL,
.from_float_reference = NULL,
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_IQ3_XXS] = {
.type_name = "iq3_xxs",
.blck_size = QK_K,
.type_size = sizeof(block_iq3_xxs),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
.from_float = quantize_row_iq3_xxs,
.from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_IQ3_S] = {
.type_name = "iq3_s",
.blck_size = QK_K,
.type_size = sizeof(block_iq3_s),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq3_s,
.from_float = quantize_row_iq3_s,
.from_float_reference = (ggml_from_float_t)quantize_row_iq3_s_reference,
.vec_dot = ggml_vec_dot_iq3_s_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_IQ2_S] = {
.type_name = "iq2_s",
.blck_size = QK_K,
.type_size = sizeof(block_iq2_s),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq2_s,
.from_float = quantize_row_iq2_s,
.from_float_reference = (ggml_from_float_t)quantize_row_iq2_s_reference,
.vec_dot = ggml_vec_dot_iq2_s_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_IQ1_S] = {
.type_name = "iq1_s",
.blck_size = QK_K,
.type_size = sizeof(block_iq1_s),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq1_s,
.from_float = NULL,
.from_float_reference = NULL,
.vec_dot = ggml_vec_dot_iq1_s_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_IQ1_M] = {
.type_name = "iq1_m",
.blck_size = QK_K,
.type_size = sizeof(block_iq1_m),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq1_m,
.from_float = NULL,
.from_float_reference = NULL,
.vec_dot = ggml_vec_dot_iq1_m_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_IQ4_NL] = {
.type_name = "iq4_nl",
.blck_size = QK4_NL,
.type_size = sizeof(block_iq4_nl),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq4_nl,
.from_float = quantize_row_iq4_nl,
.from_float_reference = (ggml_from_float_t)quantize_row_iq4_nl_reference,
.vec_dot = ggml_vec_dot_iq4_nl_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
},
[GGML_TYPE_IQ4_XS] = {
.type_name = "iq4_xs",
#if QK_K == 64
.blck_size = QK4_NL,
#else
.blck_size = QK_K,
#endif
.type_size = sizeof(block_iq4_xs),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
.from_float = quantize_row_iq4_xs,
.from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference,
.vec_dot = ggml_vec_dot_iq4_xs_q8_K,
#if QK_K == 64
.vec_dot_type = GGML_TYPE_Q8_0,
#else
.vec_dot_type = GGML_TYPE_Q8_K,
#endif
.nrows = 1,
},
[GGML_TYPE_Q8_K] = {
.type_name = "q8_K",
.blck_size = QK_K,
.type_size = sizeof(block_q8_K),
.is_quantized = true,
.from_float = quantize_row_q8_K,
}
};
// For internal test use
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
GGML_ASSERT(type < GGML_TYPE_COUNT);
return type_traits[type];
}
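// illustrative use of the traits table (sketch; n, src_f32 and dst are placeholders):
//
//     ggml_type_traits_t qt = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
//     size_t row_size = qt.type_size*(n/qt.blck_size); // bytes for n elements, n a multiple of blck_size
//     qt.from_float(src_f32, dst, n);                  // quantize one row (from_float may be NULL for some types)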
//
// simd mappings
//
#if defined(__ARM_NEON)
#if !defined(__aarch64__)
// vaddvq_f32 is only available on 64-bit ARM (aarch64) - provide a fallback for 32-bit ARM
inline static float vaddvq_f32(float32x4_t v) {
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
}
#endif
#endif
// we define a common set of C macros which map to specific intrinsics based on the current architecture
// we then implement the fundamental computation operations below using only these macros
// adding support for new architectures requires defining the corresponding SIMD macros
//
// GGML_F32_STEP / GGML_F16_STEP
// number of elements to process in a single step
//
// GGML_F32_EPR / GGML_F16_EPR
// number of elements to fit in a single register
//
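// the macros are consumed by loops of the following shape (see ggml_vec_dot_f32 further down),
// processing GGML_F32_ARR = GGML_F32_STEP/GGML_F32_EPR vector registers per iteration:
//
//     GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
//     for (int i = 0; i < np; i += GGML_F32_STEP) {
//         for (int j = 0; j < GGML_F32_ARR; j++) {
//             ax[j]  = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
//             ay[j]  = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
//             sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
//         }
//     }
//     GGML_F32_VEC_REDUCE(sumf, sum);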
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
#define GGML_SIMD
// F32 NEON
#define GGML_F32_STEP 16
#define GGML_F32_EPR 4
#define GGML_F32x4 float32x4_t
#define GGML_F32x4_ZERO vdupq_n_f32(0.0f)
#define GGML_F32x4_SET1(x) vdupq_n_f32(x)
#define GGML_F32x4_LOAD vld1q_f32
#define GGML_F32x4_STORE vst1q_f32
#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32x4_ADD vaddq_f32
#define GGML_F32x4_MUL vmulq_f32
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
#define GGML_F32x4_REDUCE(res, x) \
{ \
int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f32(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f32(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f32(x[i], x[offset+i]); \
} \
res = GGML_F32x4_REDUCE_ONE(x[0]); \
}
#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
// F16 NEON
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#define GGML_F16_STEP 32
#define GGML_F16_EPR 8
#define GGML_F16x8 float16x8_t
#define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
#define GGML_F16x8_SET1(x) vdupq_n_f16(x)
#define GGML_F16x8_LOAD(x) vld1q_f16((const ggml_fp16_internal_t *)(x))
#define GGML_F16x8_STORE vst1q_f16
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
#define GGML_F16x8_ADD vaddq_f16
#define GGML_F16x8_MUL vmulq_f16
#define GGML_F16x8_REDUCE(res, x) \
do { \
int offset = GGML_F16_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f16(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f16(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f16(x[i], x[offset+i]); \
} \
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
} while (0)
#define GGML_F16_VEC GGML_F16x8
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
#define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
#else
// if FP16 vector arithmetic is not supported, we use FP32 instead
// and take advantage of the vcvt_ functions to convert to/from FP16
#define GGML_F16_STEP 16
#define GGML_F16_EPR 4
#define GGML_F32Cx4 float32x4_t
#define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
#define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))
#define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32Cx4_ADD vaddq_f32
#define GGML_F32Cx4_MUL vmulq_f32
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
#define GGML_F16_VEC GGML_F32Cx4
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
#endif
#elif defined(__AVX512F__)
#define GGML_SIMD
// F32 AVX512
#define GGML_F32_STEP 64
#define GGML_F32_EPR 16
#define GGML_F32x16 __m512
#define GGML_F32x16_ZERO _mm512_setzero_ps()
#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
#define GGML_F32x16_LOAD _mm512_loadu_ps
#define GGML_F32x16_STORE _mm512_storeu_ps
// _mm512_fmadd_ps is defined in AVX512F so no guard is required
#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
#define GGML_F32x16_ADD _mm512_add_ps
#define GGML_F32x16_MUL _mm512_mul_ps
#define GGML_F32x16_REDUCE(res, x) \
do { \
int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
} \
res = _mm512_reduce_add_ps(x[0]); \
} while (0)
// TODO: is this optimal ?
#define GGML_F32_VEC GGML_F32x16
#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x16_SET1
#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
#define GGML_F32_VEC_STORE GGML_F32x16_STORE
#define GGML_F32_VEC_FMA GGML_F32x16_FMA
#define GGML_F32_VEC_ADD GGML_F32x16_ADD
#define GGML_F32_VEC_MUL GGML_F32x16_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
// F16 AVX512
#define GGML_F16_STEP 64
#define GGML_F16_EPR 16
// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
#define GGML_F32Cx16 __m512
#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
// so F16C guard isn't required
#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
#define GGML_F32Cx16_ADD _mm512_add_ps
#define GGML_F32Cx16_MUL _mm512_mul_ps
#define GGML_F32Cx16_REDUCE(res, x) \
do { \
int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
} \
res = _mm512_reduce_add_ps(x[0]); \
} while (0)
#define GGML_F16_VEC GGML_F32Cx16
#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
#elif defined(__AVX__)
#define GGML_SIMD
// F32 AVX
#define GGML_F32_STEP 32
#define GGML_F32_EPR 8
#define GGML_F32x8 __m256
#define GGML_F32x8_ZERO _mm256_setzero_ps()
#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
#define GGML_F32x8_LOAD _mm256_loadu_ps
#define GGML_F32x8_STORE _mm256_storeu_ps
#if defined(__FMA__)
#define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
#else
#define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
#endif
#define GGML_F32x8_ADD _mm256_add_ps
#define GGML_F32x8_MUL _mm256_mul_ps
#define GGML_F32x8_REDUCE(res, x) \
do { \
int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm256_add_ps(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm256_add_ps(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm256_add_ps(x[i], x[offset+i]); \
} \
const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
_mm256_extractf128_ps(x[0], 1)); \
const __m128 t1 = _mm_hadd_ps(t0, t0); \
res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
} while (0)
// TODO: is this optimal ?
#define GGML_F32_VEC GGML_F32x8
#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x8_SET1
#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
#define GGML_F32_VEC_STORE GGML_F32x8_STORE
#define GGML_F32_VEC_FMA GGML_F32x8_FMA
#define GGML_F32_VEC_ADD GGML_F32x8_ADD
#define GGML_F32_VEC_MUL GGML_F32x8_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
// F16 AVX
#define GGML_F16_STEP 32
#define GGML_F16_EPR 8
// F16 arithmetic is not supported by AVX, so we use F32 instead
#define GGML_F32Cx8 __m256
#define GGML_F32Cx8_ZERO _mm256_setzero_ps()
#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)
#if defined(__F16C__)
// the _mm256_cvt intrinsics require F16C
#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
#else
static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
float tmp[8];
for (int i = 0; i < 8; i++) {
tmp[i] = GGML_FP16_TO_FP32(x[i]);
}
return _mm256_loadu_ps(tmp);
}
static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
float arr[8];
_mm256_storeu_ps(arr, y);
for (int i = 0; i < 8; i++)
x[i] = GGML_FP32_TO_FP16(arr[i]);
}
#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
#endif
#define GGML_F32Cx8_FMA GGML_F32x8_FMA
#define GGML_F32Cx8_ADD _mm256_add_ps
#define GGML_F32Cx8_MUL _mm256_mul_ps
#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
#define GGML_F16_VEC GGML_F32Cx8
#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
#elif defined(__POWER9_VECTOR__)
#define GGML_SIMD
// F32 POWER9
#define GGML_F32_STEP 32
#define GGML_F32_EPR 4
#define GGML_F32x4 vector float
#define GGML_F32x4_ZERO 0.0f
#define GGML_F32x4_SET1 vec_splats
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
#define GGML_F32x4_ADD vec_add
#define GGML_F32x4_MUL vec_mul
#define GGML_F32x4_REDUCE(res, x) \
{ \
int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = vec_add(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = vec_add(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = vec_add(x[i], x[offset+i]); \
} \
res = vec_extract(x[0], 0) + \
vec_extract(x[0], 1) + \
vec_extract(x[0], 2) + \
vec_extract(x[0], 3); \
}
#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
// F16 POWER9
#define GGML_F16_STEP GGML_F32_STEP
#define GGML_F16_EPR GGML_F32_EPR
#define GGML_F16_VEC GGML_F32x4
#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
#define GGML_F16_VEC_FMA GGML_F32x4_FMA
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
// Use vec_xl, not vec_ld, in case the load address is not aligned.
#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
vec_extract_fp32_from_shortl(vec_xl(0, p))
#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i]
#define GGML_F16_VEC_STORE(p, r, i) \
if (i & 0x1) \
vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \
r[i - GGML_ENDIAN_BYTE(0)]), \
0, p - GGML_F16_EPR)
#elif defined(__wasm_simd128__)
#define GGML_SIMD
// F32 WASM
#define GGML_F32_STEP 16
#define GGML_F32_EPR 4
#define GGML_F32x4 v128_t
#define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f)
#define GGML_F32x4_SET1(x) wasm_f32x4_splat(x)
#define GGML_F32x4_LOAD wasm_v128_load
#define GGML_F32x4_STORE wasm_v128_store
#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
#define GGML_F32x4_ADD wasm_f32x4_add
#define GGML_F32x4_MUL wasm_f32x4_mul
#define GGML_F32x4_REDUCE(res, x) \
{ \
int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
} \
res = wasm_f32x4_extract_lane(x[0], 0) + \
wasm_f32x4_extract_lane(x[0], 1) + \
wasm_f32x4_extract_lane(x[0], 2) + \
wasm_f32x4_extract_lane(x[0], 3); \
}
#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
// F16 WASM
#define GGML_F16_STEP 16
#define GGML_F16_EPR 4
inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
float tmp[4];
tmp[0] = GGML_FP16_TO_FP32(p[0]);
tmp[1] = GGML_FP16_TO_FP32(p[1]);
tmp[2] = GGML_FP16_TO_FP32(p[2]);
tmp[3] = GGML_FP16_TO_FP32(p[3]);
return wasm_v128_load(tmp);
}
inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
float tmp[4];
wasm_v128_store(tmp, x);
p[0] = GGML_FP32_TO_FP16(tmp[0]);
p[1] = GGML_FP32_TO_FP16(tmp[1]);
p[2] = GGML_FP32_TO_FP16(tmp[2]);
p[3] = GGML_FP32_TO_FP16(tmp[3]);
}
#define GGML_F16x4 v128_t
#define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f)
#define GGML_F16x4_SET1(x) wasm_f32x4_splat(x)
#define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x)
#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
#define GGML_F16x4_FMA GGML_F32x4_FMA
#define GGML_F16x4_ADD wasm_f32x4_add
#define GGML_F16x4_MUL wasm_f32x4_mul
#define GGML_F16x4_REDUCE(res, x) \
{ \
int offset = GGML_F16_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
} \
res = wasm_f32x4_extract_lane(x[0], 0) + \
wasm_f32x4_extract_lane(x[0], 1) + \
wasm_f32x4_extract_lane(x[0], 2) + \
wasm_f32x4_extract_lane(x[0], 3); \
}
#define GGML_F16_VEC GGML_F16x4
#define GGML_F16_VEC_ZERO GGML_F16x4_ZERO
#define GGML_F16_VEC_SET1 GGML_F16x4_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F16x4_FMA
#define GGML_F16_VEC_ADD GGML_F16x4_ADD
#define GGML_F16_VEC_MUL GGML_F16x4_MUL
#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
#elif defined(__SSE3__)
#define GGML_SIMD
// F32 SSE
#define GGML_F32_STEP 32
#define GGML_F32_EPR 4
#define GGML_F32x4 __m128
#define GGML_F32x4_ZERO _mm_setzero_ps()
#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
#define GGML_F32x4_LOAD _mm_loadu_ps
#define GGML_F32x4_STORE _mm_storeu_ps
#if defined(__FMA__)
// TODO: Does this work?
#define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
#else
#define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
#endif
#define GGML_F32x4_ADD _mm_add_ps
#define GGML_F32x4_MUL _mm_mul_ps
#define GGML_F32x4_REDUCE(res, x) \
{ \
int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm_add_ps(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm_add_ps(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm_add_ps(x[i], x[offset+i]); \
} \
const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
}
// TODO: is this optimal ?
#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
// F16 SSE
#define GGML_F16_STEP 32
#define GGML_F16_EPR 4
static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
float tmp[4];
tmp[0] = GGML_FP16_TO_FP32(x[0]);
tmp[1] = GGML_FP16_TO_FP32(x[1]);
tmp[2] = GGML_FP16_TO_FP32(x[2]);
tmp[3] = GGML_FP16_TO_FP32(x[3]);
return _mm_loadu_ps(tmp);
}
static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
float arr[4];
_mm_storeu_ps(arr, y);
x[0] = GGML_FP32_TO_FP16(arr[0]);
x[1] = GGML_FP32_TO_FP16(arr[1]);
x[2] = GGML_FP32_TO_FP16(arr[2]);
x[3] = GGML_FP32_TO_FP16(arr[3]);
}
#define GGML_F32Cx4 __m128
#define GGML_F32Cx4_ZERO _mm_setzero_ps()
#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x)
#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
#define GGML_F32Cx4_ADD _mm_add_ps
#define GGML_F32Cx4_MUL _mm_mul_ps
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
#define GGML_F16_VEC GGML_F32Cx4
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
#endif
// GGML_F32_ARR / GGML_F16_ARR
// number of registers to use per step
#ifdef GGML_SIMD
#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
#endif
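// e.g. with the AVX definitions above: GGML_F32_STEP = 32 and GGML_F32_EPR = 8,
// so GGML_F32_ARR = 4 accumulator registers are processed per step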
//
// fundamental operations
//
inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
#ifdef GGML_SIMD
float sumf = 0.0f;
const int np = (n & ~(GGML_F32_STEP - 1));
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
GGML_F32_VEC ax[GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
}
}
// reduce sum0..sum3 to sum0
GGML_F32_VEC_REDUCE(sumf, sum);
// leftovers
for (int i = np; i < n; ++i) {
sumf += x[i]*y[i];
}
#else
// scalar
ggml_float sumf = 0.0;
for (int i = 0; i < n; ++i) {
sumf += (ggml_float)(x[i]*y[i]);
}
#endif
*s = sumf;
}
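// usage sketch (illustrative only, not part of the build):
//
//   float s;
//   const float a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
//   const float b[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };
//   ggml_vec_dot_f32(8, &s, 0, a, 0, b, 0, 1);   // s == 36.0f
//
// the stride arguments (bs, bx, by) are unused in this single-row (nrc == 1) path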
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
ggml_float sumf = 0.0;
#if defined(GGML_SIMD)
const int np = (n & ~(GGML_F16_STEP - 1));
GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
GGML_F16_VEC ax[GGML_F16_ARR];
GGML_F16_VEC ay[GGML_F16_ARR];
for (int i = 0; i < np; i += GGML_F16_STEP) {
for (int j = 0; j < GGML_F16_ARR; j++) {
ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
}
}
// reduce sum0..sum3 to sum0
GGML_F16_VEC_REDUCE(sumf, sum);
// leftovers
for (int i = np; i < n; ++i) {
sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
}
#else
for (int i = 0; i < n; ++i) {
sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
}
#endif
*s = sumf;
}
// compute GGML_VEC_DOT_UNROLL dot products at once
// xs - x row stride in bytes
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
}
#if defined(GGML_SIMD)
const int np = (n & ~(GGML_F16_STEP - 1));
GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
GGML_F16_VEC ax[GGML_F16_ARR];
GGML_F16_VEC ay[GGML_F16_ARR];
for (int i = 0; i < np; i += GGML_F16_STEP) {
for (int j = 0; j < GGML_F16_ARR; j++) {
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
}
}
}
// reduce sum0..sum3 to sum0
for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
}
// leftovers
for (int i = np; i < n; ++i) {
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
}
}
#else
for (int i = 0; i < n; ++i) {
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
}
}
#endif
for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
s[i] = sumf[i];
}
}
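// usage sketch (illustrative only): xv points at GGML_VEC_DOT_UNROLL rows of
// f16 values separated by xs bytes, and s receives one dot product per row,
// e.g. for rows stored back to back:
//
//   float s[GGML_VEC_DOT_UNROLL];
//   ggml_vec_dot_f16_unroll(n, n*sizeof(ggml_fp16_t), s, rows, y);
//
// here `rows` and `y` are hypothetical buffers holding n f16 values per row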
inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
#if defined(GGML_SIMD)
const int np = (n & ~(GGML_F32_STEP - 1));
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
GGML_F32_VEC ax[GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
}
// leftovers
for (int i = np; i < n; ++i) {
y[i] += x[i]*v;
}
#else
// scalar
for (int i = 0; i < n; ++i) {
y[i] += x[i]*v;
}
#endif
}
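// ggml_vec_mad_f32 is an in-place axpy, y[i] += x[i]*v; illustrative sketch:
//
//   float y[4] = { 1, 1, 1, 1 };
//   const float x[4] = { 1, 2, 3, 4 };
//   ggml_vec_mad_f32(4, y, x, 0.5f);   // y becomes { 1.5f, 2.0f, 2.5f, 3.0f }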
// xs and vs are byte strides of x and v
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
const float * restrict x[GGML_VEC_MAD_UNROLL];
const float * restrict v[GGML_VEC_MAD_UNROLL];
for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
x[i] = (const float *) ((const char *) xv + i*xs);
v[i] = (const float *) ((const char *) vv + i*vs);
}
#if defined(GGML_SIMD)
const int np = (n & ~(GGML_F32_STEP - 1));
GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
vx[k] = GGML_F32_VEC_SET1(v[k][0]);
}
GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
}
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
}
// leftovers
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
for (int i = np; i < n; ++i) {
y[i] += x[k][i]*v[k][0];
}
}
#else
// scalar
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
for (int i = 0; i < n; ++i) {
y[i] += x[k][i]*v[k][0];
}
}
#endif
}
//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
#if defined(GGML_USE_ACCELERATE)
vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD)
const int np = (n & ~(GGML_F32_STEP - 1));
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
}
// leftovers
for (int i = np; i < n; ++i) {
y[i] *= v;
}
#else
// scalar
for (int i = 0; i < n; ++i) {
y[i] *= v;
}
#endif
}
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
// TODO: optimize performance
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
static const float GELU_COEF_A = 0.044715f;
static const float GELU_QUICK_COEF = -1.702f;
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
inline static float ggml_gelu_f32(float x) {
return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}
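// this is the common tanh approximation of GELU(x) = x*Phi(x); the f16
// table lookups below trade a small amount of accuracy for speed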
inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
const uint16_t * i16 = (const uint16_t *) x;
for (int i = 0; i < n; ++i) {
y[i] = ggml_table_gelu_f16[i16[i]];
}
}
#ifdef GGML_GELU_FP16
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
uint16_t t;
for (int i = 0; i < n; ++i) {
if (x[i] <= -10.0f) {
y[i] = 0.0f;
} else if (x[i] >= 10.0f) {
y[i] = x[i];
} else {
ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
memcpy(&t, &fp16, sizeof(uint16_t));
y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
}
}
}
#else
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
for (int i = 0; i < n; ++i) {
y[i] = ggml_gelu_f32(x[i]);
}
}
#endif
inline static float ggml_gelu_quick_f32(float x) {
return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
}
//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
// const uint16_t * i16 = (const uint16_t *) x;
// for (int i = 0; i < n; ++i) {
// y[i] = ggml_table_gelu_quick_f16[i16[i]];
// }
//}
#ifdef GGML_GELU_QUICK_FP16
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
uint16_t t;
for (int i = 0; i < n; ++i) {
ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
memcpy(&t, &fp16, sizeof(uint16_t));
y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
}
}
#else
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
for (int i = 0; i < n; ++i) {
y[i] = ggml_gelu_quick_f32(x[i]);
}
}
#endif
// Sigmoid Linear Unit (SiLU) function
inline static float ggml_silu_f32(float x) {
return x/(1.0f + expf(-x));
}
//inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
// const uint16_t * i16 = (const uint16_t *) x;
// for (int i = 0; i < n; ++i) {
// y[i] = ggml_table_silu_f16[i16[i]];
// }
//}
#ifdef GGML_SILU_FP16
inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
uint16_t t;
for (int i = 0; i < n; ++i) {
ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
memcpy(&t, &fp16, sizeof(uint16_t));
y[i] = GGML_FP16_TO_FP32(ggml_table_silu_f16[t]);
}
}
#else
inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
for (int i = 0; i < n; ++i) {
y[i] = ggml_silu_f32(x[i]);
}
}
#endif
inline static float ggml_silu_backward_f32(float x, float dy) {
const float s = 1.0f/(1.0f + expf(-x));
return dy*s*(1.0f + x*(1.0f - s));
}
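// derivation: with s(x) = 1/(1 + exp(-x)), silu(x) = x*s(x), so
//   d/dx silu(x) = s(x) + x*s(x)*(1 - s(x)) = s(x)*(1 + x*(1 - s(x)))
// which is the factor applied to the incoming gradient dy above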
#ifdef GGML_SILU_FP16
inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
for (int i = 0; i < n; ++i) {
        // the forward pass computed silu at the f16-rounded value of x[i],
        // not at x[i] itself, so take the derivative at that same f16 value:
ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
float usedx = GGML_FP16_TO_FP32(fp16);
dx[i] = ggml_silu_backward_f32(usedx, dy[i]);
}
}
#else
inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
for (int i = 0; i < n; ++i) {
dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
}
}
#endif
inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
ggml_float sum = 0.0;
for (int i = 0; i < n; ++i) {
sum += (ggml_float)x[i];
}
*s = sum;
#else
vDSP_sve(x, 1, s, n);
#endif
}
inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
ggml_float sum = 0.0;
for (int i = 0; i < n; ++i) {
sum += (ggml_float)x[i];
}
*s = sum;
}
inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
float sum = 0.0f;
for (int i = 0; i < n; ++i) {
sum += GGML_FP16_TO_FP32(x[i]);
}
*s = sum;
}
inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
float max = -INFINITY;
for (int i = 0; i < n; ++i) {
max = MAX(max, x[i]);
}
*s = max;
#else
vDSP_maxv(x, 1, s, n);
#endif
}
inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
ggml_vec_norm_f32(n, s, x);
*s = 1.f/(*s);
}
inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
float max = -INFINITY;
int idx = 0;
for (int i = 0; i < n; ++i) {
max = MAX(max, x[i]);
if (max == x[i]) { idx = i; }
}
*s = idx;
}
//
// data types
//
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"NONE",
"DUP",
"ADD",
"ADD1",
"ACC",
"SUB",
"MUL",
"DIV",
"SQR",
"SQRT",
"LOG",
"SUM",
"SUM_ROWS",
"MEAN",
"ARGMAX",
"REPEAT",
"REPEAT_BACK",
"CONCAT",
"SILU_BACK",
"NORM",
"RMS_NORM",
"RMS_NORM_BACK",
"GROUP_NORM",
"MUL_MAT",
"MUL_MAT_ID",
"OUT_PROD",
"SCALE",
"SET",
"CPY",
"CONT",
"RESHAPE",
"VIEW",
"PERMUTE",
"TRANSPOSE",
"GET_ROWS",
"GET_ROWS_BACK",
"DIAG",
"DIAG_MASK_INF",
"DIAG_MASK_ZERO",
"SOFT_MAX",
"SOFT_MAX_BACK",
"ROPE",
"ROPE_BACK",
"ALIBI",
"CLAMP",
"CONV_TRANSPOSE_1D",
"IM2COL",
"CONV_TRANSPOSE_2D",
"POOL_1D",
"POOL_2D",
"UPSCALE",
"PAD",
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
"LEAKY_RELU",
"FLASH_ATTN",
"FLASH_FF",
"FLASH_ATTN_BACK",
"SSM_CONV",
"SSM_SCAN",
"WIN_PART",
"WIN_UNPART",
"GET_REL_POS",
"ADD_REL_POS",
"UNARY",
"MAP_UNARY",
"MAP_BINARY",
"MAP_CUSTOM1_F32",
"MAP_CUSTOM2_F32",
"MAP_CUSTOM3_F32",
"MAP_CUSTOM1",
"MAP_CUSTOM2",
"MAP_CUSTOM3",
"CROSS_ENTROPY_LOSS",
"CROSS_ENTROPY_LOSS_BACK",
};
static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
"x",
"x+y",
"x+y",
"view(x,nb,offset)+=y->x",
"x-y",
"x*y",
"x/y",
"x^2",
"√x",
"log(x)",
"Σx",
"Σx_k",
"Σx/n",
"argmax(x)",
"repeat(x)",
"repeat_back(x)",
"concat(x, y)",
"silu_back(x)",
"norm(x)",
"rms_norm(x)",
"rms_norm_back(x)",
"group_norm(x)",
"X*Y",
"X[i]*Y",
"X*Y",
"x*v",
"y-\\>view(x)",
"x-\\>y",
"cont(x)",
"reshape(x)",
"view(x)",
"permute(x)",
"transpose(x)",
"get_rows(x)",
"get_rows_back(x)",
"diag(x)",
"diag_mask_inf(x)",
"diag_mask_zero(x)",
"soft_max(x)",
"soft_max_back(x)",
"rope(x)",
"rope_back(x)",
"alibi(x)",
"clamp(x)",
"conv_transpose_1d(x)",
"im2col(x)",
"conv_transpose_2d(x)",
"pool_1d(x)",
"pool_2d(x)",
"upscale(x)",
"pad(x)",
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
"leaky_relu(x)",
"flash_attn(x)",
"flash_ff(x)",
"flash_attn_back(x)",
"ssm_conv(x)",
"ssm_scan(x)",
"win_part(x)",
"win_unpart(x)",
"get_rel_pos(x)",
"add_rel_pos(x)",
"unary(x)",
"f(x)",
"f(x,y)",
"custom_f32(x)",
"custom_f32(x,y)",
"custom_f32(x,y,z)",
"custom(x)",
"custom(x,y)",
"custom(x,y,z)",
"cross_entropy_loss(x,y)",
"cross_entropy_loss_back(x,y)",
};
static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
"ABS",
"SGN",
"NEG",
"STEP",
"TANH",
"ELU",
"RELU",
"GELU",
"GELU_QUICK",
"SILU",
"HARDSWISH",
"HARDSIGMOID",
};
static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
// WARN:
// Mis-configuration can lead to problems that are hard to reason about:
// * At best it crashes or produces obvious nonsense.
// * At worst the output is only subtly wrong and hard to perceive.
//
// An op has to enable INIT or FINALIZE when any of its branches needs that pass.
// Take care with compile options (e.g., GGML_USE_xxx).
static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 };
static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
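// Fill in the two task-pass tables above: an op gets an extra INIT and/or FINALIZE
// pass during graph computation only if it is flagged here.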
static void ggml_setup_op_has_task_pass(void) {
{ // INIT
bool * p = GGML_OP_HAS_INIT;
p[GGML_OP_ACC ] = true;
p[GGML_OP_MUL_MAT ] = true;
p[GGML_OP_MUL_MAT_ID ] = true;
p[GGML_OP_OUT_PROD ] = true;
p[GGML_OP_SET ] = true;
p[GGML_OP_GET_ROWS_BACK ] = true;
p[GGML_OP_DIAG_MASK_INF ] = true;
p[GGML_OP_DIAG_MASK_ZERO ] = true;
p[GGML_OP_CONV_TRANSPOSE_1D ] = true;
p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
p[GGML_OP_FLASH_ATTN_BACK ] = true;
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
p[GGML_OP_ADD_REL_POS ] = true;
}
{ // FINALIZE
bool * p = GGML_OP_HAS_FINALIZE;
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
}
}
//
// ggml context
//
struct ggml_context {
size_t mem_size;
void * mem_buffer;
bool mem_buffer_owned;
bool no_alloc;
bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
int n_objects;
struct ggml_object * objects_begin;
struct ggml_object * objects_end;
struct ggml_scratch scratch;
struct ggml_scratch scratch_save;
};
struct ggml_context_container {
bool used;
struct ggml_context context;
};
//
// NUMA support
//
#define GGML_NUMA_MAX_NODES 8
#define GGML_NUMA_MAX_CPUS 512
struct ggml_numa_node {
uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
uint32_t n_cpus;
};
struct ggml_numa_nodes {
enum ggml_numa_strategy numa_strategy;
struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
uint32_t n_nodes;
uint32_t total_cpus; // hardware threads on system
uint32_t current_node; // node on which the main process is executing
#if defined(__gnu_linux__)
cpu_set_t cpuset; // cpuset from numactl
#else
uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
#endif
};
//
// ggml state
//
struct ggml_state {
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
struct ggml_numa_nodes numa;
};
// global state
static struct ggml_state g_state;
static atomic_int g_state_barrier = 0;
// barrier via spin lock
inline static void ggml_critical_section_start(void) {
int processing = atomic_fetch_add(&g_state_barrier, 1);
while (processing > 0) {
// wait for other threads to finish
atomic_fetch_sub(&g_state_barrier, 1);
sched_yield(); // TODO: reconsider this
processing = atomic_fetch_add(&g_state_barrier, 1);
}
}
// TODO: make this somehow automatically executed
// some sort of "sentry" mechanism
inline static void ggml_critical_section_end(void) {
atomic_fetch_sub(&g_state_barrier, 1);
}
#if defined(__gnu_linux__)
static cpu_set_t ggml_get_numa_affinity(void) {
cpu_set_t cpuset;
pthread_t thread;
thread = pthread_self();
CPU_ZERO(&cpuset);
pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
return cpuset;
}
#else
static uint32_t ggml_get_numa_affinity(void) {
return 0; // no NUMA support
}
#endif
void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
if (g_state.numa.n_nodes > 0) {
fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
return;
}
#if defined(__gnu_linux__)
struct stat st;
char path[256];
int rv;
// set numa scheme
g_state.numa.numa_strategy = numa_flag;
GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy);
g_state.numa.cpuset = ggml_get_numa_affinity();
// enumerate nodes
while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
if (stat(path, &st) != 0) { break; }
++g_state.numa.n_nodes;
}
// enumerate CPUs
while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
if (stat(path, &st) != 0) { break; }
++g_state.numa.total_cpus;
}
GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
// figure out which node we're on
uint current_cpu;
int getcpu_ret = 0;
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28)
getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
#else
// old glibc doesn't have a wrapper for this call. Fall back on direct syscall
# if !defined(SYS_getcpu) && defined(SYS_get_cpu)
# define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
# endif
getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state.numa.current_node);
#endif
if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
g_state.numa.n_nodes = 0;
return;
}
GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
struct ggml_numa_node * node = &g_state.numa.nodes[n];
GGML_PRINT_DEBUG("CPUs on node %u:", n);
node->n_cpus = 0;
for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
if (stat(path, &st) == 0) {
node->cpus[node->n_cpus++] = c;
GGML_PRINT_DEBUG(" %u", c);
}
}
GGML_PRINT_DEBUG("\n");
}
if (ggml_is_numa()) {
FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
if (fptr != NULL) {
char buf[42];
if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
}
fclose(fptr);
}
}
#else
GGML_UNUSED(numa_flag);
// TODO
#endif
}
bool ggml_is_numa(void) {
return g_state.numa.n_nodes > 1;
}
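// Note: true only when more than one NUMA node was detected by ggml_numa_init();
// callers can use this to skip NUMA-specific work on single-node systems.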
////////////////////////////////////////////////////////////////////////////////
void ggml_print_object(const struct ggml_object * obj) {
GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
obj->type, obj->offs, obj->size, (const void *) obj->next);
}
void ggml_print_objects(const struct ggml_context * ctx) {
struct ggml_object * obj = ctx->objects_begin;
GGML_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx);
while (obj != NULL) {
ggml_print_object(obj);
obj = obj->next;
}
GGML_PRINT("%s: --- end ---\n", __func__);
}
GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
}
GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
}
GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) {
size_t nbytes;
size_t blck_size = ggml_blck_size(tensor->type);
if (blck_size == 1) {
nbytes = ggml_type_size(tensor->type);
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
}
}
else {
nbytes = tensor->ne[0]*tensor->nb[0]/blck_size;
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
}
}
return nbytes;
}
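// ggml_nbytes example (illustrative): a contiguous F32 tensor with ne = {4096, 32, 1, 1}
// has blck_size == 1 and nb = {4, 16384, 524288, 524288}, so
// nbytes = 4 + (4096 - 1)*4 + (32 - 1)*16384 = 524288 bytes (= 4*4096*32).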
size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
}
GGML_CALL int ggml_blck_size(enum ggml_type type) {
return type_traits[type].blck_size;
}
GGML_CALL size_t ggml_type_size(enum ggml_type type) {
return type_traits[type].type_size;
}
GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) {
assert(ne % ggml_blck_size(type) == 0);
return ggml_type_size(type)*ne/ggml_blck_size(type);
}
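// ggml_row_size example (illustrative): ggml_row_size(GGML_TYPE_F32, 4096) == 4*4096 bytes,
// while a block-quantized row occupies (ne/blck_size)*type_size bytes
// (e.g. Q4_0 packs 32 values into an 18-byte block, about 4.5 bits per value).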
double ggml_type_sizef(enum ggml_type type) {
return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
}
GGML_CALL const char * ggml_type_name(enum ggml_type type) {
return type_traits[type].type_name;
}
GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
return type_traits[type].is_quantized;
}
GGML_CALL const char * ggml_op_name(enum ggml_op op) {
return GGML_OP_NAME[op];
}
const char * ggml_op_symbol(enum ggml_op op) {
return GGML_OP_SYMBOL[op];
}
const char * ggml_unary_op_name(enum ggml_unary_op op) {
return GGML_UNARY_OP_NAME[op];
}
GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
if (t->op == GGML_OP_UNARY) {
enum ggml_unary_op uop = ggml_get_unary_op(t);
return ggml_unary_op_name(uop);
}
else {
return ggml_op_name(t->op);
}
}
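// ggml_op_desc example (illustrative): for a tensor produced by ggml_relu(), t->op is
// GGML_OP_UNARY and this returns "RELU" (from GGML_UNARY_OP_NAME) rather than the
// generic op name.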
GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) {
return ggml_type_size(tensor->type);
}
bool ggml_is_scalar(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
}
bool ggml_is_vector(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
}
bool ggml_is_matrix(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return tensor->ne[2] == 1 && tensor->ne[3] == 1;
}
bool ggml_is_3d(const struct ggml_tensor * tensor) {
return tensor->ne[3] == 1;
}
int ggml_n_dims(const struct ggml_tensor * tensor) {
for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
if (tensor->ne[i] > 1) {
return i + 1;
}
}
return 1;
}
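// ggml_n_dims examples (illustrative): ne = {7,1,1,1} -> 1, ne = {7,3,1,1} -> 2,
// ne = {1,1,1,5} -> 4. A scalar ({1,1,1,1}) reports 1 dimension, never 0.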
static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return (t0->ne[0] == t1->ne[0]) &&
(t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
(t1->ne[3]%t0->ne[3] == 0);
}
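// ggml_can_mul_mat example (illustrative): t0 with ne = {K, M, 1, 1} can be multiplied
// with t1 = {K, N, 8, 2}: the shared dimension matches and t0 broadcasts over t1's
// last two dimensions since 8 % 1 == 0 and 2 % 1 == 0.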
static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return (t0->ne[1] == t1->ne[1]) &&
(t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
(t1->ne[3]%t0->ne[3] == 0);
}
enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
enum ggml_type wtype = GGML_TYPE_COUNT;
switch (ftype) {
case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break;
case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break;
case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
case GGML_FTYPE_MOSTLY_IQ1_M: wtype = GGML_TYPE_IQ1_M; break;
case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
}
GGML_ASSERT(wtype != GGML_TYPE_COUNT);
return wtype;
}
size_t ggml_tensor_overhead(void) {
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
}
GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
return tensor->nb[0] > tensor->nb[1];
}
GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
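// ggml_is_contiguous example (illustrative): a newly created F32 tensor with
// ne = {10, 4, 1, 1} has nb = {4, 40, 160, 160} and is contiguous; after ggml_permute()
// swaps dims 0 and 1, nb[0] > nb[1], so the view is transposed and no longer contiguous.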
static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
}
static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
GGML_CALL bool ggml_is_empty(const struct ggml_tensor * tensor) {
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
if (tensor->ne[i] == 0) {
// empty if any dimension has no elements
return true;
}
}
return false;
}
bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
(t0->ne[0] == t1->ne[0] ) &&
(t0->ne[1] == t1->ne[1] ) &&
(t0->ne[2] == t1->ne[2] ) &&
(t0->ne[3] == t1->ne[3] );
}
// check if t1 can be represented as a repetition of t0
static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return ggml_is_empty(t0) ? ggml_is_empty(t1) :
(t1->ne[0]%t0->ne[0] == 0) &&
(t1->ne[1]%t0->ne[1] == 0) &&
(t1->ne[2]%t0->ne[2] == 0) &&
(t1->ne[3]%t0->ne[3] == 0);
}
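// ggml_can_repeat example (illustrative): t0 with ne = {2, 3, 1, 1} can repeat into
// {4, 6, 5, 1} (every dimension of t1 is a multiple), but not into {5, 6, 1, 1}
// since 5 % 2 != 0.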
static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1);
}
static inline int ggml_up32(int n) {
return (n + 31) & ~31;
}
//static inline int ggml_up64(int n) {
// return (n + 63) & ~63;
//}
static inline int ggml_up(int n, int m) {
// assert m is a power of 2
GGML_ASSERT((m & (m - 1)) == 0);
return (n + m - 1) & ~(m - 1);
}
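// Rounding examples (illustrative): ggml_up32(10) == 32, ggml_up(10, 8) == 16,
// ggml_up(16, 8) == 16.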
// assert that pointer is aligned to GGML_MEM_ALIGN
#define ggml_assert_aligned(ptr) \
GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
////////////////////////////////////////////////////////////////////////////////
struct ggml_context * ggml_init(struct ggml_init_params params) {
// make this function thread safe
ggml_critical_section_start();
static bool is_first_call = true;
if (is_first_call) {
// initialize time system (required on Windows)
ggml_time_init();
// initialize GELU, Quick GELU, SILU and EXP F32 tables
{
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
ggml_fp16_t ii;
for (int i = 0; i < (1 << 16); ++i) {
uint16_t ui = i;
memcpy(&ii, &ui, sizeof(ii));
const float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
ggml_table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
}
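// the loop above visits every possible 16-bit pattern, so each table is a full
// 64K-entry lookup keyed by the raw bits of a ggml_fp16_t value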
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}
// initialize g_state
{
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
g_state = (struct ggml_state) {
/*.contexts =*/ { { 0 } },
/*.numa =*/ {
.n_nodes = 0,
.total_cpus = 0,
},
};
for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
g_state.contexts[i].used = false;
}
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}
#if defined(GGML_USE_CLBLAST)
ggml_cl_init();
#elif defined(GGML_USE_VULKAN)
ggml_vk_init_cpu_assist();
#endif
ggml_setup_op_has_task_pass();
is_first_call = false;
}
// find non-used context in g_state
struct ggml_context * ctx = NULL;
for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
if (!g_state.contexts[i].used) {
g_state.contexts[i].used = true;
ctx = &g_state.contexts[i].context;
GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i);
break;
}
}
if (ctx == NULL) {
GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);
ggml_critical_section_end();
return NULL;
}
// allow calling ggml_init with a mem_size of 0
if (params.mem_size == 0) {
params.mem_size = GGML_MEM_ALIGN;
}
const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
*ctx = (struct ggml_context) {
/*.mem_size =*/ mem_size,
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
/*.no_alloc =*/ params.no_alloc,
/*.no_alloc_save =*/ params.no_alloc,
/*.n_objects =*/ 0,
/*.objects_begin =*/ NULL,
/*.objects_end =*/ NULL,
/*.scratch =*/ { 0, 0, NULL, },
/*.scratch_save =*/ { 0, 0, NULL, },
};
GGML_ASSERT(ctx->mem_buffer != NULL);
ggml_assert_aligned(ctx->mem_buffer);
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
ggml_critical_section_end();
return ctx;
}
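// Illustrative usage sketch (not part of the library): a minimal context
// lifecycle. The 16 MB arena size is an arbitrary example value.
//
//     struct ggml_init_params params = {
//         /*.mem_size   =*/ 16*1024*1024,
//         /*.mem_buffer =*/ NULL,    // let ggml allocate and own the buffer
//         /*.no_alloc   =*/ false,
//     };
//     struct ggml_context * ctx = ggml_init(params);
//     if (ctx == NULL) { /* no free context slot or allocation failed */ }
//     // ... create tensors and build graphs here ...
//     ggml_free(ctx);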
void ggml_free(struct ggml_context * ctx) {
if (ctx == NULL) {
return;
}
// make this function thread safe
ggml_critical_section_start();
bool found = false;
for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
if (&g_state.contexts[i].context == ctx) {
g_state.contexts[i].used = false;
GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
__func__, i, ggml_used_mem(ctx));
if (ctx->mem_buffer_owned) {
GGML_ALIGNED_FREE(ctx->mem_buffer);
}
found = true;
break;
}
}
if (!found) {
GGML_PRINT_DEBUG("%s: context not found\n", __func__);
}
ggml_critical_section_end();
}
size_t ggml_used_mem(const struct ggml_context * ctx) {
return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
}
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
const size_t result = ctx->scratch.data ? ctx->scratch.offs : 0;
ctx->scratch = scratch;
return result;
}
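// Illustrative sketch (buffer size is an arbitrary example): route the data
// of intermediate tensors into a caller-owned scratch buffer, then switch
// back to the context's own pool.
//
//     static char scratch_buf[4*1024*1024];
//     ggml_set_scratch(ctx, (struct ggml_scratch) {
//         /*.offs =*/ 0,
//         /*.size =*/ sizeof(scratch_buf),
//         /*.data =*/ scratch_buf,
//     });
//     // ... tensors created here place their data in scratch_buf ...
//     ggml_set_scratch(ctx, (struct ggml_scratch) { 0, 0, NULL }); // back to the context pool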
bool ggml_get_no_alloc(struct ggml_context * ctx) {
return ctx->no_alloc;
}
void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
ctx->no_alloc = no_alloc;
}
void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
return ctx->mem_buffer;
}
size_t ggml_get_mem_size(const struct ggml_context * ctx) {
return ctx->mem_size;
}
size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
size_t max_size = 0;
for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
size_t bytes = ggml_nbytes(tensor);
max_size = MAX(max_size, bytes);
}
return max_size;
}
// IMPORTANT:
// when creating "opt" tensors, always save and load the scratch buffer
// this is an error-prone process, but it is necessary to support inplace
// operators when using scratch buffers
// TODO: implement a better way
static void ggml_scratch_save(struct ggml_context * ctx) {
// this is needed to allow opt tensors to store their data
// TODO: again, need to find a better way
ctx->no_alloc_save = ctx->no_alloc;
ctx->no_alloc = false;
ctx->scratch_save = ctx->scratch;
ctx->scratch.data = NULL;
}
static void ggml_scratch_load(struct ggml_context * ctx) {
ctx->no_alloc = ctx->no_alloc_save;
ctx->scratch = ctx->scratch_save;
}
////////////////////////////////////////////////////////////////////////////////
static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
// always insert objects at the end of the context's memory pool
struct ggml_object * obj_cur = ctx->objects_end;
const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs;
const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
const size_t cur_end = cur_offs + cur_size;
// align to GGML_MEM_ALIGN
size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
char * const mem_buffer = ctx->mem_buffer;
struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
__func__, cur_end + size_needed, ctx->mem_size);
assert(false);
return NULL;
}
*obj_new = (struct ggml_object) {
.offs = cur_end + GGML_OBJECT_SIZE,
.size = size_needed,
.next = NULL,
.type = type,
};
ggml_assert_aligned(mem_buffer + obj_new->offs);
if (obj_cur != NULL) {
obj_cur->next = obj_new;
} else {
// this is the first object in this context
ctx->objects_begin = obj_new;
}
ctx->objects_end = obj_new;
//printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
return obj_new;
}
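// Layout note (description only): the context memory pool is one linear
// arena. Each allocation is laid out as [ggml_object header][payload of
// obj->size bytes] and obj->offs points at the payload (just past the
// header), so objects can be walked via mem_buffer + offs. For tensor
// objects the payload is the ggml_tensor struct followed, when allocated
// here, by its data. Rough picture:
//
//     mem_buffer
//     |hdr|tensor 0 + data|hdr|tensor 1 + data| ... unused ...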
static struct ggml_tensor * ggml_new_tensor_impl(
struct ggml_context * ctx,
enum ggml_type type,
int n_dims,
const int64_t * ne,
struct ggml_tensor * view_src,
size_t view_offs) {
assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
// find the base tensor and absolute offset
if (view_src != NULL && view_src->view_src != NULL) {
view_offs += view_src->view_offs;
view_src = view_src->view_src;
}
size_t data_size = ggml_row_size(type, ne[0]);
for (int i = 1; i < n_dims; i++) {
data_size *= ne[i];
}
GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
void * data = view_src != NULL ? view_src->data : NULL;
if (data != NULL) {
data = (char *) data + view_offs;
}
size_t obj_alloc_size = 0;
if (view_src == NULL && !ctx->no_alloc) {
if (ctx->scratch.data != NULL) {
// allocate tensor data in the scratch buffer
if (ctx->scratch.offs + data_size > ctx->scratch.size) {
GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
__func__, ctx->scratch.offs + data_size, ctx->scratch.size);
assert(false);
return NULL;
}
data = (char * const) ctx->scratch.data + ctx->scratch.offs;
ctx->scratch.offs += data_size;
} else {
// allocate tensor data in the context's memory pool
obj_alloc_size = data_size;
}
}
struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
// TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
*result = (struct ggml_tensor) {
/*.type =*/ type,
/*.backend =*/ GGML_BACKEND_TYPE_CPU,
/*.buffer =*/ NULL,
/*.ne =*/ { 1, 1, 1, 1 },
/*.nb =*/ { 0, 0, 0, 0 },
/*.op =*/ GGML_OP_NONE,
/*.op_params =*/ { 0 },
/*.flags =*/ 0,
/*.grad =*/ NULL,
/*.src =*/ { NULL },
/*.perf_runs =*/ 0,
/*.perf_cycles =*/ 0,
/*.perf_time_us =*/ 0,
/*.view_src =*/ view_src,
/*.view_offs =*/ view_offs,
/*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
/*.name =*/ { 0 },
/*.extra =*/ NULL,
/*.padding =*/ { 0 },
};
// TODO: this should not be needed as long as we don't rely on aligned SIMD loads
//ggml_assert_aligned(result->data);
for (int i = 0; i < n_dims; i++) {
result->ne[i] = ne[i];
}
result->nb[0] = ggml_type_size(type);
result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
for (int i = 2; i < GGML_MAX_DIMS; i++) {
result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
}
ctx->n_objects++;
return result;
}
struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
enum ggml_type type,
int n_dims,
const int64_t * ne) {
return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
}
struct ggml_tensor * ggml_new_tensor_1d(
struct ggml_context * ctx,
enum ggml_type type,
int64_t ne0) {
return ggml_new_tensor(ctx, type, 1, &ne0);
}
struct ggml_tensor * ggml_new_tensor_2d(
struct ggml_context * ctx,
enum ggml_type type,
int64_t ne0,
int64_t ne1) {
const int64_t ne[2] = { ne0, ne1 };
return ggml_new_tensor(ctx, type, 2, ne);
}
struct ggml_tensor * ggml_new_tensor_3d(
struct ggml_context * ctx,
enum ggml_type type,
int64_t ne0,
int64_t ne1,
int64_t ne2) {
const int64_t ne[3] = { ne0, ne1, ne2 };
return ggml_new_tensor(ctx, type, 3, ne);
}
struct ggml_tensor * ggml_new_tensor_4d(
struct ggml_context * ctx,
enum ggml_type type,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int64_t ne3) {
const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
return ggml_new_tensor(ctx, type, 4, ne);
}
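// Illustrative sketch: ne[] is ordered innermost-first (ne[0] is the row
// length), so a matrix with 4 rows of 3 elements is created as
// ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3, 4).
//
//     struct ggml_tensor * v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);          // 8 floats
//     struct ggml_tensor * m = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3, 4);       // 4 rows of 3
//     struct ggml_tensor * t = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 64, 64, 12); // e.g. 12 heads of 64x64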
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
ggml_scratch_save(ctx);
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
ggml_scratch_load(ctx);
ggml_set_i32(result, value);
return result;
}
struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
ggml_scratch_save(ctx);
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
ggml_scratch_load(ctx);
ggml_set_f32(result, value);
return result;
}
struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
}
static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
assert(params_size <= GGML_MAX_OP_PARAMS);
memcpy(tensor->op_params, params, params_size);
}
static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
return ((const int32_t *)(tensor->op_params))[i];
}
static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
return ((const float *)(tensor->op_params))[i];
}
static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
((int32_t *)(tensor->op_params))[i] = value;
}
static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
((float *)(tensor->op_params))[i] = value;
}
struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
memset(tensor->data, 0, ggml_nbytes(tensor));
return tensor;
}
struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
const int n = ggml_nrows(tensor);
const int nc = tensor->ne[0];
const size_t n1 = tensor->nb[1];
char * const data = tensor->data;
switch (tensor->type) {
case GGML_TYPE_I8:
{
assert(tensor->nb[0] == sizeof(int8_t));
for (int i = 0; i < n; i++) {
ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);
}
} break;
case GGML_TYPE_I16:
{
assert(tensor->nb[0] == sizeof(int16_t));
for (int i = 0; i < n; i++) {
ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);
}
} break;
case GGML_TYPE_I32:
{
assert(tensor->nb[0] == sizeof(int32_t));
for (int i = 0; i < n; i++) {
ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);
}
} break;
case GGML_TYPE_F16:
{
assert(tensor->nb[0] == sizeof(ggml_fp16_t));
for (int i = 0; i < n; i++) {
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
}
} break;
case GGML_TYPE_F32:
{
assert(tensor->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
}
} break;
default:
{
GGML_ASSERT(false);
} break;
}
return tensor;
}
struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
const int n = ggml_nrows(tensor);
const int nc = tensor->ne[0];
const size_t n1 = tensor->nb[1];
char * const data = tensor->data;
switch (tensor->type) {
case GGML_TYPE_I8:
{
assert(tensor->nb[0] == sizeof(int8_t));
for (int i = 0; i < n; i++) {
ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);
}
} break;
case GGML_TYPE_I16:
{
assert(tensor->nb[0] == sizeof(int16_t));
for (int i = 0; i < n; i++) {
ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);
}
} break;
case GGML_TYPE_I32:
{
assert(tensor->nb[0] == sizeof(int32_t));
for (int i = 0; i < n; i++) {
ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);
}
} break;
case GGML_TYPE_F16:
{
assert(tensor->nb[0] == sizeof(ggml_fp16_t));
for (int i = 0; i < n; i++) {
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
}
} break;
case GGML_TYPE_F32:
{
assert(tensor->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
}
} break;
default:
{
GGML_ASSERT(false);
} break;
}
return tensor;
}
void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3) {
const int64_t ne2 = tensor->ne[2];
const int64_t ne1 = tensor->ne[1];
const int64_t ne0 = tensor->ne[0];
const int64_t i3_ = (i/(ne2*ne1*ne0));
const int64_t i2_ = (i - i3_*ne2*ne1*ne0)/(ne1*ne0);
const int64_t i1_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0)/ne0;
const int64_t i0_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0 - i1_*ne0);
if (i0) {
* i0 = i0_;
}
if (i1) {
* i1 = i1_;
}
if (i2) {
* i2 = i2_;
}
if (i3) {
* i3 = i3_;
}
}
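// Worked example of the arithmetic above: for a tensor with ne = {4, 3, 2, 1}
// and flat index i = 17:
//   i3 = 17 / (2*3*4)       = 0
//   i2 = (17 - 0)  / (3*4)  = 1
//   i1 = (17 - 12) / 4      = 1
//   i0 =  17 - 12 - 4       = 1
// i.e. element (i0, i1, i2, i3) = (1, 1, 1, 0).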
int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
if (!ggml_is_contiguous(tensor)) {
int64_t id[4] = { 0, 0, 0, 0 };
ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
return ggml_get_i32_nd(tensor, id[0], id[1], id[2], id[3]);
}
switch (tensor->type) {
case GGML_TYPE_I8:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
return ((int8_t *)(tensor->data))[i];
}
case GGML_TYPE_I16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
return ((int16_t *)(tensor->data))[i];
}
case GGML_TYPE_I32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
return ((int32_t *)(tensor->data))[i];
}
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
}
case GGML_TYPE_F32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(float));
return ((float *)(tensor->data))[i];
}
default:
{
GGML_ASSERT(false);
}
}
return 0;
}
void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
if (!ggml_is_contiguous(tensor)) {
int64_t id[4] = { 0, 0, 0, 0 };
ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
ggml_set_i32_nd(tensor, id[0], id[1], id[2], id[3], value);
return;
}
switch (tensor->type) {
case GGML_TYPE_I8:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
((int8_t *)(tensor->data))[i] = value;
} break;
case GGML_TYPE_I16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
((int16_t *)(tensor->data))[i] = value;
} break;
case GGML_TYPE_I32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
((int32_t *)(tensor->data))[i] = value;
} break;
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
} break;
case GGML_TYPE_F32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(float));
((float *)(tensor->data))[i] = value;
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
switch (tensor->type) {
case GGML_TYPE_I8:
return ((int8_t *) data)[0];
case GGML_TYPE_I16:
return ((int16_t *) data)[0];
case GGML_TYPE_I32:
return ((int32_t *) data)[0];
case GGML_TYPE_F16:
return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
case GGML_TYPE_F32:
return ((float *) data)[0];
default:
GGML_ASSERT(false);
}
return 0;
}
void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) {
void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
switch (tensor->type) {
case GGML_TYPE_I8:
{
((int8_t *)(data))[0] = value;
} break;
case GGML_TYPE_I16:
{
((int16_t *)(data))[0] = value;
} break;
case GGML_TYPE_I32:
{
((int32_t *)(data))[0] = value;
} break;
case GGML_TYPE_F16:
{
((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
} break;
case GGML_TYPE_F32:
{
((float *)(data))[0] = value;
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
if (!ggml_is_contiguous(tensor)) {
int64_t id[4] = { 0, 0, 0, 0 };
ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
return ggml_get_f32_nd(tensor, id[0], id[1], id[2], id[3]);
}
switch (tensor->type) {
case GGML_TYPE_I8:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
return ((int8_t *)(tensor->data))[i];
}
case GGML_TYPE_I16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
return ((int16_t *)(tensor->data))[i];
}
case GGML_TYPE_I32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
return ((int32_t *)(tensor->data))[i];
}
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
}
case GGML_TYPE_F32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(float));
return ((float *)(tensor->data))[i];
}
default:
{
GGML_ASSERT(false);
}
}
return 0.0f;
}
void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
if (!ggml_is_contiguous(tensor)) {
int64_t id[4] = { 0, 0, 0, 0 };
ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value);
return;
}
switch (tensor->type) {
case GGML_TYPE_I8:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
((int8_t *)(tensor->data))[i] = value;
} break;
case GGML_TYPE_I16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
((int16_t *)(tensor->data))[i] = value;
} break;
case GGML_TYPE_I32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
((int32_t *)(tensor->data))[i] = value;
} break;
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
} break;
case GGML_TYPE_F32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(float));
((float *)(tensor->data))[i] = value;
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
switch (tensor->type) {
case GGML_TYPE_I8:
return ((int8_t *) data)[0];
case GGML_TYPE_I16:
return ((int16_t *) data)[0];
case GGML_TYPE_I32:
return ((int32_t *) data)[0];
case GGML_TYPE_F16:
return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
case GGML_TYPE_F32:
return ((float *) data)[0];
default:
GGML_ASSERT(false);
}
return 0.0f;
}
void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) {
void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
switch (tensor->type) {
case GGML_TYPE_I8:
{
((int8_t *)(data))[0] = value;
} break;
case GGML_TYPE_I16:
{
((int16_t *)(data))[0] = value;
} break;
case GGML_TYPE_I32:
{
((int32_t *)(data))[0] = value;
} break;
case GGML_TYPE_F16:
{
((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
} break;
case GGML_TYPE_F32:
{
((float *)(data))[0] = value;
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
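// Illustrative sketch: scalar element access through the typed helpers. The
// *_nd variants use the nb[] strides directly; the *_1d variants take a flat
// index and unravel it when the tensor is not contiguous.
//
//     struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3, 2); // 2 rows of 3
//     ggml_set_f32(t, 0.0f);                  // fill with zeros
//     ggml_set_f32_nd(t, 2, 1, 0, 0, 5.0f);   // row 1, column 2
//     float x = ggml_get_f32_1d(t, 5);        // same element via the flat index -> 5.0f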
void * ggml_get_data(const struct ggml_tensor * tensor) {
return tensor->data;
}
float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
assert(tensor->type == GGML_TYPE_F32);
return (float *)(tensor->data);
}
GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
GGML_ASSERT(tensor->op == GGML_OP_UNARY);
return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
}
const char * ggml_get_name(const struct ggml_tensor * tensor) {
return tensor->name;
}
struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
strncpy(tensor->name, name, sizeof(tensor->name) - 1);
tensor->name[sizeof(tensor->name) - 1] = '\0';
return tensor;
}
struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
va_list args;
va_start(args, fmt);
vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
va_end(args);
return tensor;
}
struct ggml_tensor * ggml_view_tensor(
struct ggml_context * ctx,
struct ggml_tensor * src) {
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
ggml_format_name(result, "%s (view)", src->name);
for (int i = 0; i < GGML_MAX_DIMS; i++) {
result->nb[i] = src->nb[i];
}
return result;
}
struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
struct ggml_object * obj = ctx->objects_begin;
char * const mem_buffer = ctx->mem_buffer;
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
return (struct ggml_tensor *)(mem_buffer + obj->offs);
}
obj = obj->next;
}
return NULL;
}
struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
obj = obj->next;
char * const mem_buffer = ctx->mem_buffer;
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
return (struct ggml_tensor *)(mem_buffer + obj->offs);
}
obj = obj->next;
}
return NULL;
}
struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
struct ggml_object * obj = ctx->objects_begin;
char * const mem_buffer = ctx->mem_buffer;
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
if (strcmp(cur->name, name) == 0) {
return cur;
}
}
obj = obj->next;
}
return NULL;
}
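// Illustrative sketch (the weight name below is a made-up example): tensors
// can be tagged with a name and later looked up by it, which is how model
// loaders typically bind weights to graph nodes.
//
//     ggml_set_name(w, "model.layers.0.attn_q.weight");
//     // ...
//     struct ggml_tensor * q = ggml_get_tensor(ctx, "model.layers.0.attn_q.weight");
//     if (q == NULL) { /* no tensor with that name in this context */ }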
////////////////////////////////////////////////////////////////////////////////
// ggml_dup
static struct ggml_tensor * ggml_dup_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
bool inplace) {
bool is_node = false;
if (!inplace && (a->grad)) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
result->op = GGML_OP_DUP;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_dup(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_dup_impl(ctx, a, false);
}
struct ggml_tensor * ggml_dup_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_dup_impl(ctx, a, true);
}
// ggml_add
static struct ggml_tensor * ggml_add_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
bool inplace) {
GGML_ASSERT(ggml_can_repeat(b, a));
bool is_node = false;
if (!inplace && (a->grad || b->grad)) {
// TODO: support backward pass for broadcasting
GGML_ASSERT(ggml_are_same_shape(a, b));
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
result->op = GGML_OP_ADD;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
struct ggml_tensor * ggml_add(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
return ggml_add_impl(ctx, a, b, false);
}
struct ggml_tensor * ggml_add_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
return ggml_add_impl(ctx, a, b, true);
}
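// Illustrative sketch (assumes the graph helpers declared in ggml.h:
// ggml_new_graph, ggml_build_forward_expand, ggml_graph_compute_with_ctx).
// Ops like ggml_add only record the computation; running it requires
// building and computing a graph.
//
//     struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
//     struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
//     ggml_set_f32(a, 2.0f);
//     ggml_set_f32(b, 3.0f);
//     struct ggml_tensor * c = ggml_add(ctx, a, b);        // records GGML_OP_ADD
//     struct ggml_cgraph * gf = ggml_new_graph(ctx);
//     ggml_build_forward_expand(gf, c);
//     ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 1);
//     // ggml_get_f32_1d(c, 0) == 5.0f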
// ggml_add_cast
static struct ggml_tensor * ggml_add_cast_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
enum ggml_type type) {
// TODO: support less-strict constraint
// GGML_ASSERT(ggml_can_repeat(b, a));
GGML_ASSERT(ggml_can_repeat_rows(b, a));
GGML_ASSERT(ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16); // currently only supported for quantized input and f16
bool is_node = false;
if (a->grad || b->grad) {
// TODO: support backward pass for broadcasting
GGML_ASSERT(ggml_are_same_shape(a, b));
is_node = true;
}
struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
result->op = GGML_OP_ADD;
result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
struct ggml_tensor * ggml_add_cast(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
enum ggml_type type) {
return ggml_add_cast_impl(ctx, a, b, type);
}
// ggml_add1
static struct ggml_tensor * ggml_add1_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
bool inplace) {
GGML_ASSERT(ggml_is_scalar(b));
GGML_ASSERT(ggml_is_padded_1d(a));
bool is_node = false;
if (a->grad || b->grad) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
result->op = GGML_OP_ADD1;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
struct ggml_tensor * ggml_add1(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
return ggml_add1_impl(ctx, a, b, false);
2022-09-25 18:23:15 +00:00
}
struct ggml_tensor * ggml_add1_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
return ggml_add1_impl(ctx, a, b, true);
2022-09-25 18:23:15 +00:00
}
// ggml_acc
static struct ggml_tensor * ggml_acc_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset,
bool inplace) {
GGML_ASSERT(ggml_nelements(b) <= ggml_nelements(a));
GGML_ASSERT(ggml_is_contiguous(a));
GGML_ASSERT(a->type == GGML_TYPE_F32);
GGML_ASSERT(b->type == GGML_TYPE_F32);
bool is_node = false;
if (!inplace && (a->grad || b->grad)) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ACC;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
struct ggml_tensor * ggml_acc(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset) {
return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
}
struct ggml_tensor * ggml_acc_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset) {
return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true);
}
// ggml_sub
static struct ggml_tensor * ggml_sub_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
bool inplace) {
GGML_ASSERT(ggml_are_same_shape(a, b));
bool is_node = false;
if (!inplace && (a->grad || b->grad)) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
result->op = GGML_OP_SUB;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
struct ggml_tensor * ggml_sub(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
return ggml_sub_impl(ctx, a, b, false);
}
struct ggml_tensor * ggml_sub_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
return ggml_sub_impl(ctx, a, b, true);
}
// ggml_mul
static struct ggml_tensor * ggml_mul_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
bool inplace) {
GGML_ASSERT(ggml_can_repeat(b, a));
bool is_node = false;
if (!inplace && (a->grad || b->grad)) {
// TODO: support backward pass for broadcasting
GGML_ASSERT(ggml_are_same_shape(a, b));
is_node = true;
}
if (inplace) {
GGML_ASSERT(!is_node);
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
result->op = GGML_OP_MUL;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
struct ggml_tensor * ggml_mul(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
return ggml_mul_impl(ctx, a, b, false);
}
struct ggml_tensor * ggml_mul_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
return ggml_mul_impl(ctx, a, b, true);
}
// ggml_div
static struct ggml_tensor * ggml_div_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
bool inplace) {
GGML_ASSERT(ggml_can_repeat(b, a));
bool is_node = false;
if (!inplace && (a->grad || b->grad)) {
is_node = true;
}
if (inplace) {
GGML_ASSERT(!is_node);
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
result->op = GGML_OP_DIV;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
struct ggml_tensor * ggml_div(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
return ggml_div_impl(ctx, a, b, false);
}
struct ggml_tensor * ggml_div_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
return ggml_div_impl(ctx, a, b, true);
}
// ggml_sqr
static struct ggml_tensor * ggml_sqr_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
bool inplace) {
bool is_node = false;
if (!inplace && (a->grad)) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
result->op = GGML_OP_SQR;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_sqr(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_sqr_impl(ctx, a, false);
}
struct ggml_tensor * ggml_sqr_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_sqr_impl(ctx, a, true);
}
// ggml_sqrt
static struct ggml_tensor * ggml_sqrt_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
bool inplace) {
bool is_node = false;
if (!inplace && (a->grad)) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
result->op = GGML_OP_SQRT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_sqrt(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_sqrt_impl(ctx, a, false);
}
struct ggml_tensor * ggml_sqrt_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_sqrt_impl(ctx, a, true);
}
// ggml_log
static struct ggml_tensor * ggml_log_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
bool inplace) {
bool is_node = false;
if (!inplace && (a->grad)) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
result->op = GGML_OP_LOG;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_log(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_log_impl(ctx, a, false);
}
struct ggml_tensor * ggml_log_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_log_impl(ctx, a, true);
}
// ggml_sum
struct ggml_tensor * ggml_sum(
struct ggml_context * ctx,
struct ggml_tensor * a) {
bool is_node = false;
if (a->grad) {
is_node = true;
}
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1);
result->op = GGML_OP_SUM;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_sum_rows
struct ggml_tensor * ggml_sum_rows(
struct ggml_context * ctx,
struct ggml_tensor * a) {
bool is_node = false;
if (a->grad) {
is_node = true;
}
int64_t ne[GGML_MAX_DIMS] = { 1 };
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
ne[i] = a->ne[i];
}
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
result->op = GGML_OP_SUM_ROWS;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_mean
struct ggml_tensor * ggml_mean(
struct ggml_context * ctx,
struct ggml_tensor * a) {
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement
is_node = true;
}
int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
result->op = GGML_OP_MEAN;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_argmax
struct ggml_tensor * ggml_argmax(
struct ggml_context * ctx,
struct ggml_tensor * a) {
GGML_ASSERT(ggml_is_matrix(a));
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false);
is_node = true;
}
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
result->op = GGML_OP_ARGMAX;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_repeat
struct ggml_tensor * ggml_repeat(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
GGML_ASSERT(ggml_can_repeat(a, b));
bool is_node = false;
if (a->grad) {
is_node = true;
}
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
result->op = GGML_OP_REPEAT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
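// Illustrative sketch: ggml_repeat tiles a into the shape of b; the same
// broadcasting rule (ggml_can_repeat) is applied implicitly to the second
// operand of ggml_add/ggml_mul, so an explicit repeat is often unnecessary.
//
//     struct ggml_tensor * bias  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);     // ne = {64}
//     struct ggml_tensor * x     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 32); // ne = {64, 32}
//     struct ggml_tensor * tiled = ggml_repeat(ctx, bias, x); // bias repeated over all 32 rows
//     struct ggml_tensor * y     = ggml_add(ctx, x, bias);    // same effect, no explicit repeat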
// ggml_repeat_back
struct ggml_tensor * ggml_repeat_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
GGML_ASSERT(ggml_can_repeat(b, a));
bool is_node = false;
if (a->grad) {
is_node = true;
}
if (ggml_are_same_shape(a, b) && !is_node) {
return a;
}
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
result->op = GGML_OP_REPEAT_BACK;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_concat
struct ggml_tensor * ggml_concat(
struct ggml_context* ctx,
struct ggml_tensor* a,
struct ggml_tensor* b) {
GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]);
bool is_node = false;
if (a->grad || b->grad) {
is_node = true;
}
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
result->op = GGML_OP_CONCAT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
// ggml_abs
struct ggml_tensor * ggml_abs(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_ABS);
}
struct ggml_tensor * ggml_abs_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ABS);
}
// ggml_sgn
struct ggml_tensor * ggml_sgn(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_SGN);
}
struct ggml_tensor * ggml_sgn_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SGN);
}
// ggml_neg
struct ggml_tensor * ggml_neg(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_NEG);
}
struct ggml_tensor * ggml_neg_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_NEG);
}
// ggml_step
struct ggml_tensor * ggml_step(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_STEP);
}
struct ggml_tensor * ggml_step_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_STEP);
}
// ggml_tanh
struct ggml_tensor * ggml_tanh(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_TANH);
}
struct ggml_tensor * ggml_tanh_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TANH);
}
// ggml_elu
struct ggml_tensor * ggml_elu(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_ELU);
}
struct ggml_tensor * ggml_elu_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ELU);
}
// ggml_relu
struct ggml_tensor * ggml_relu(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_RELU);
}
struct ggml_tensor * ggml_relu_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
}
// ggml_leaky_relu
struct ggml_tensor * ggml_leaky_relu(
struct ggml_context * ctx,
struct ggml_tensor * a, float negative_slope, bool inplace) {
bool is_node = false;
if (!inplace && (a->grad)) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
result->op = GGML_OP_LEAKY_RELU;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_gelu
struct ggml_tensor * ggml_gelu(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_GELU);
}
struct ggml_tensor * ggml_gelu_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
}
// ggml_gelu_quick
struct ggml_tensor * ggml_gelu_quick(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_QUICK);
}
struct ggml_tensor * ggml_gelu_quick_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_QUICK);
}
// ggml_silu
struct ggml_tensor * ggml_silu(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_SILU);
}
struct ggml_tensor * ggml_silu_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SILU);
}
// ggml_silu_back
struct ggml_tensor * ggml_silu_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
bool is_node = false;
if (a->grad || b->grad) {
// TODO: implement backward
is_node = true;
}
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
result->op = GGML_OP_SILU_BACK;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
// ggml hardswish
struct ggml_tensor * ggml_hardswish(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
}
// ggml hardsigmoid
struct ggml_tensor * ggml_hardsigmoid(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
}
// ggml_norm
static struct ggml_tensor * ggml_norm_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
float eps,
bool inplace) {
bool is_node = false;
if (!inplace && (a->grad)) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
ggml_set_op_params(result, &eps, sizeof(eps));
result->op = GGML_OP_NORM;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_norm(
struct ggml_context * ctx,
struct ggml_tensor * a,
float eps) {
return ggml_norm_impl(ctx, a, eps, false);
}
struct ggml_tensor * ggml_norm_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
float eps) {
return ggml_norm_impl(ctx, a, eps, true);
}
// ggml_rms_norm
static struct ggml_tensor * ggml_rms_norm_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
float eps,
bool inplace) {
bool is_node = false;
if (!inplace && (a->grad)) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
ggml_set_op_params(result, &eps, sizeof(eps));
result->op = GGML_OP_RMS_NORM;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_rms_norm(
struct ggml_context * ctx,
struct ggml_tensor * a,
float eps) {
return ggml_rms_norm_impl(ctx, a, eps, false);
}
struct ggml_tensor * ggml_rms_norm_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
float eps) {
return ggml_rms_norm_impl(ctx, a, eps, true);
}
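// Note (description only): GGML_OP_RMS_NORM normalizes each row by the root
// mean square of its elements, y = x / sqrt(mean(x^2) + eps), without the
// mean subtraction of GGML_OP_NORM; any learned scale is applied separately
// with a following ggml_mul.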
// ggml_rms_norm_back
struct ggml_tensor * ggml_rms_norm_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
float eps) {
bool is_node = false;
if (a->grad) {
// TODO: implement backward
is_node = true;
}
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
ggml_set_op_params(result, &eps, sizeof(eps));
result->op = GGML_OP_RMS_NORM_BACK;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
// ggml_group_norm
static struct ggml_tensor * ggml_group_norm_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_groups,
bool inplace) {
bool is_node = false;
if (!inplace && (a->grad)) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
result->op_params[0] = n_groups;
result->op = GGML_OP_GROUP_NORM;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_group_norm(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_groups) {
return ggml_group_norm_impl(ctx, a, n_groups, false);
}
struct ggml_tensor * ggml_group_norm_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_groups) {
return ggml_group_norm_impl(ctx, a, n_groups, true);
}
// ggml_mul_mat
struct ggml_tensor * ggml_mul_mat(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
GGML_ASSERT(ggml_can_mul_mat(a, b));
GGML_ASSERT(!ggml_is_transposed(a));
bool is_node = false;
if (a->grad || b->grad) {
is_node = true;
}
const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
result->op = GGML_OP_MUL_MAT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
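// Illustrative sketch of the shape convention above: both operands share
// ne[0] (the dot-product length K), and the result takes a->ne[1] and
// b->ne[1].
//
//     // a: 128 rows of length 64 -> ne = {64, 128}
//     // b:  32 rows of length 64 -> ne = {64,  32}
//     struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 128);
//     struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 32);
//     struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);   // c->ne = {128, 32}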
void ggml_mul_mat_set_prec(
struct ggml_tensor * a,
enum ggml_prec prec) {
const int32_t prec_i32 = (int32_t) prec;
ggml_set_op_params_i32(a, 0, prec_i32);
}
// ggml_mul_mat_id
struct ggml_tensor * ggml_mul_mat_id(
struct ggml_context * ctx,
struct ggml_tensor * const as[],
int n_as,
struct ggml_tensor * ids,
int id,
struct ggml_tensor * b) {
GGML_ASSERT(ids->type == GGML_TYPE_I32);
GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
GGML_ASSERT(ids->ne[1] == b->ne[1]);
GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
GGML_ASSERT(id >= 0 && id < ids->ne[0]);
bool is_node = false;
if (as[0]->grad || b->grad) {
is_node = true;
}
const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
ggml_set_op_params_i32(result, 0, id);
ggml_set_op_params_i32(result, 1, n_as);
result->op = GGML_OP_MUL_MAT_ID;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = ids;
result->src[1] = b;
for (int i = 0; i < n_as; i++) {
struct ggml_tensor * a = as[i];
GGML_ASSERT(ggml_are_same_shape(as[0], a));
GGML_ASSERT(ggml_can_mul_mat(a, b));
GGML_ASSERT(!ggml_is_transposed(a));
result->src[i + 2] = a;
}
return result;
}
// ggml_out_prod
struct ggml_tensor * ggml_out_prod(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
GGML_ASSERT(ggml_can_out_prod(a, b));
GGML_ASSERT(!ggml_is_transposed(a));
bool is_node = false;
if (a->grad || b->grad) {
is_node = true;
}
// a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
result->op = GGML_OP_OUT_PROD;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
// ggml_scale
static struct ggml_tensor * ggml_scale_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
float s,
bool inplace) {
GGML_ASSERT(ggml_is_padded_1d(a));
bool is_node = false;
if (a->grad) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
ggml_set_op_params(result, &s, sizeof(s));
result->op = GGML_OP_SCALE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_scale(
struct ggml_context * ctx,
struct ggml_tensor * a,
float s) {
return ggml_scale_impl(ctx, a, s, false);
}
struct ggml_tensor * ggml_scale_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
float s) {
return ggml_scale_impl(ctx, a, s, true);
}
// ggml_set
static struct ggml_tensor * ggml_set_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset,
bool inplace) {
GGML_ASSERT(ggml_nelements(a) >= ggml_nelements(b));
bool is_node = false;
if (a->grad || b->grad) {
is_node = true;
}
// make a view of the destination
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_SET;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
struct ggml_tensor * ggml_set(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset) {
return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
}
struct ggml_tensor * ggml_set_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset) {
return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, true);
}
struct ggml_tensor * ggml_set_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t offset) {
return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false);
}
struct ggml_tensor * ggml_set_1d_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t offset) {
return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, true);
}
struct ggml_tensor * ggml_set_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t offset) {
return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
}
struct ggml_tensor * ggml_set_2d_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t offset) {
return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
}
// ggml_cpy
static struct ggml_tensor * ggml_cpy_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
bool is_node = false;
if (a->grad || b->grad) {
// inplace is false and either one has a grad
is_node = true;
}
// make a view of the destination
struct ggml_tensor * result = ggml_view_tensor(ctx, b);
if (strlen(b->name) > 0) {
ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
} else {
ggml_format_name(result, "%s (copy)", a->name);
}
result->op = GGML_OP_CPY;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
struct ggml_tensor * ggml_cpy(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
return ggml_cpy_impl(ctx, a, b);
}
struct ggml_tensor * ggml_cast(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_type type) {
bool is_node = false;
struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
ggml_format_name(result, "%s (copy)", a->name);
result->op = GGML_OP_CPY;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = result;
return result;
}
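// Usage sketch (illustrative only, not part of the library). ggml_cpy() copies `a`
// into the type and layout of an existing destination `b`, while ggml_cast() creates
// the destination itself:
//
//   struct ggml_tensor * src = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 4);
//   struct ggml_tensor * dst = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 64, 4);
//   struct ggml_tensor * c0  = ggml_cpy (ctx, src, dst);             // F32 -> F16 copy into dst
//   struct ggml_tensor * c1  = ggml_cast(ctx, src, GGML_TYPE_F16);   // same conversion, fresh tensor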
// ggml_cont
static struct ggml_tensor * ggml_cont_impl(
struct ggml_context * ctx,
struct ggml_tensor * a) {
bool is_node = false;
if (a->grad) {
is_node = true;
}
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
ggml_format_name(result, "%s (cont)", a->name);
result->op = GGML_OP_CONT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_cont(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_cont_impl(ctx, a);
}
// make contiguous, with new shape
GGML_API struct ggml_tensor * ggml_cont_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0) {
return ggml_cont_4d(ctx, a, ne0, 1, 1, 1);
}
GGML_API struct ggml_tensor * ggml_cont_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1) {
return ggml_cont_4d(ctx, a, ne0, ne1, 1, 1);
}
GGML_API struct ggml_tensor * ggml_cont_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2) {
return ggml_cont_4d(ctx, a, ne0, ne1, ne2, 1);
}
struct ggml_tensor * ggml_cont_4d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int64_t ne3) {
GGML_ASSERT(ggml_nelements(a) == (ne0*ne1*ne2*ne3));
bool is_node = false;
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
ggml_format_name(result, "%s (cont)", a->name);
result->op = GGML_OP_CONT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_reshape
struct ggml_tensor * ggml_reshape(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
GGML_ASSERT(ggml_is_contiguous(a));
// only the shape of b is relevant, not its memory layout, so b is allowed to be non-contiguous.
GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
bool is_node = false;
if (a->grad) {
is_node = true;
}
if (b->grad) {
// gradient propagation is not supported
//GGML_ASSERT(false);
}
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
ggml_format_name(result, "%s (reshaped)", a->name);
result->op = GGML_OP_RESHAPE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_reshape_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0) {
GGML_ASSERT(ggml_is_contiguous(a));
GGML_ASSERT(ggml_nelements(a) == ne0);
bool is_node = false;
if (a->grad) {
is_node = true;
}
const int64_t ne[1] = { ne0 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
ggml_format_name(result, "%s (reshaped)", a->name);
result->op = GGML_OP_RESHAPE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_reshape_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1) {
GGML_ASSERT(ggml_is_contiguous(a));
GGML_ASSERT(ggml_nelements(a) == ne0*ne1);
bool is_node = false;
if (a->grad) {
is_node = true;
}
const int64_t ne[2] = { ne0, ne1 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
ggml_format_name(result, "%s (reshaped)", a->name);
result->op = GGML_OP_RESHAPE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_reshape_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2) {
GGML_ASSERT(ggml_is_contiguous(a));
GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);
bool is_node = false;
if (a->grad) {
is_node = true;
}
const int64_t ne[3] = { ne0, ne1, ne2 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
ggml_format_name(result, "%s (reshaped)", a->name);
result->op = GGML_OP_RESHAPE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_reshape_4d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int64_t ne3) {
GGML_ASSERT(ggml_is_contiguous(a));
GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2*ne3);
bool is_node = false;
if (a->grad) {
is_node = true;
}
const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
ggml_format_name(result, "%s (reshaped)", a->name);
result->op = GGML_OP_RESHAPE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
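// Usage sketch (illustrative only, not part of the library). Reshaping reinterprets
// the extents of a contiguous tensor without moving data, so the element counts must
// match:
//
//   struct ggml_tensor * m = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 6);  // 48 elements
//   struct ggml_tensor * v = ggml_reshape_1d(ctx, m, 48);                   // flat view of the same data
//   struct ggml_tensor * t = ggml_reshape_3d(ctx, m, 4, 2, 6);              // 4*2*6 == 48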
static struct ggml_tensor * ggml_view_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_dims,
const int64_t * ne,
size_t offset) {
bool is_node = false;
if (a->grad) {
is_node = true;
}
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
ggml_format_name(result, "%s (view)", a->name);
ggml_set_op_params(result, &offset, sizeof(offset));
result->op = GGML_OP_VIEW;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_view_1d
struct ggml_tensor * ggml_view_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
size_t offset) {
struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);
return result;
}
// ggml_view_2d
struct ggml_tensor * ggml_view_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
size_t nb1,
size_t offset) {
const int64_t ne[2] = { ne0, ne1 };
struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
result->nb[1] = nb1;
result->nb[2] = result->nb[1]*ne1;
result->nb[3] = result->nb[2];
return result;
}
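// Usage sketch (illustrative only, not part of the library). A 2-D view selects a
// sub-block of an existing tensor: ne0/ne1 are the extents, nb1 is the byte stride
// between rows of the parent, and offset is the byte offset of the first element.
// Taking rows 2..5 of an 8x8 F32 matrix:
//
//   struct ggml_tensor * m = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 8);
//   struct ggml_tensor * v = ggml_view_2d(ctx, m, 8, 4, m->nb[1], 2*m->nb[1]);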
// ggml_view_3d
struct ggml_tensor * ggml_view_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2,
size_t nb1,
size_t nb2,
size_t offset) {
const int64_t ne[3] = { ne0, ne1, ne2 };
struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);
result->nb[1] = nb1;
result->nb[2] = nb2;
result->nb[3] = result->nb[2]*ne2;
return result;
}
// ggml_view_4d
struct ggml_tensor * ggml_view_4d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int64_t ne3,
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset) {
const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
result->nb[1] = nb1;
result->nb[2] = nb2;
result->nb[3] = nb3;
return result;
}
// ggml_permute
struct ggml_tensor * ggml_permute(
struct ggml_context * ctx,
struct ggml_tensor * a,
int axis0,
int axis1,
int axis2,
int axis3) {
GGML_ASSERT(axis0 >= 0 && axis0 < GGML_MAX_DIMS);
GGML_ASSERT(axis1 >= 0 && axis1 < GGML_MAX_DIMS);
GGML_ASSERT(axis2 >= 0 && axis2 < GGML_MAX_DIMS);
GGML_ASSERT(axis3 >= 0 && axis3 < GGML_MAX_DIMS);
GGML_ASSERT(axis0 != axis1);
GGML_ASSERT(axis0 != axis2);
GGML_ASSERT(axis0 != axis3);
GGML_ASSERT(axis1 != axis2);
GGML_ASSERT(axis1 != axis3);
GGML_ASSERT(axis2 != axis3);
bool is_node = false;
if (a->grad) {
is_node = true;
}
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
ggml_format_name(result, "%s (permuted)", a->name);
int ne[GGML_MAX_DIMS];
int nb[GGML_MAX_DIMS];
ne[axis0] = a->ne[0];
ne[axis1] = a->ne[1];
ne[axis2] = a->ne[2];
ne[axis3] = a->ne[3];
nb[axis0] = a->nb[0];
nb[axis1] = a->nb[1];
nb[axis2] = a->nb[2];
nb[axis3] = a->nb[3];
result->ne[0] = ne[0];
result->ne[1] = ne[1];
result->ne[2] = ne[2];
result->ne[3] = ne[3];
result->nb[0] = nb[0];
result->nb[1] = nb[1];
result->nb[2] = nb[2];
result->nb[3] = nb[3];
result->op = GGML_OP_PERMUTE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
int32_t params[] = { axis0, axis1, axis2, axis3 };
ggml_set_op_params(result, params, sizeof(params));
return result;
}
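// Usage sketch (illustrative only, not part of the library). ggml_permute() only
// rearranges extents and strides (no data is moved), so the result is generally
// non-contiguous; follow it with ggml_cont() when a dense layout is required.
// Each argument gives the destination axis of the corresponding source dimension:
//
//   struct ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 32, 8); // ne = [64, 32, 8]
//   struct ggml_tensor * p = ggml_permute(ctx, x, 1, 0, 2, 3);                  // ne = [32, 64, 8], a view
//   struct ggml_tensor * c = ggml_cont(ctx, p);                                 // materialized copy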
// ggml_transpose
struct ggml_tensor * ggml_transpose(
struct ggml_context * ctx,
struct ggml_tensor * a) {
bool is_node = false;
if (a->grad) {
is_node = true;
}
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
ggml_format_name(result, "%s (transposed)", a->name);
result->ne[0] = a->ne[1];
result->ne[1] = a->ne[0];
result->nb[0] = a->nb[1];
result->nb[1] = a->nb[0];
result->op = GGML_OP_TRANSPOSE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_get_rows
struct ggml_tensor * ggml_get_rows(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
GGML_ASSERT(a->ne[2] == b->ne[1]);
GGML_ASSERT(b->ne[3] == 1);
GGML_ASSERT(b->type == GGML_TYPE_I32);
bool is_node = false;
if (a->grad || b->grad) {
is_node = true;
}
// TODO: implement non F32 return
enum ggml_type type = GGML_TYPE_F32;
if (a->type == GGML_TYPE_I32) {
type = a->type;
}
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
result->op = GGML_OP_GET_ROWS;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
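// Usage sketch (illustrative only, not part of the library). ggml_get_rows() gathers
// rows of `a` selected by the I32 indices in `b` -- the typical embedding lookup; the
// result is F32 unless `a` is I32:
//
//   struct ggml_tensor * emb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 512, 32000); // [n_embd, n_vocab]
//   struct ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 8);          // 8 token ids
//   struct ggml_tensor * out = ggml_get_rows(ctx, emb, ids);                       // ne = [512, 8]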
// ggml_get_rows_back
struct ggml_tensor * ggml_get_rows_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c) {
GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0]));
bool is_node = false;
if (a->grad || b->grad) {
is_node = true;
}
// TODO: implement non F32 return
//struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
result->op = GGML_OP_GET_ROWS_BACK;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
// ggml_diag
struct ggml_tensor * ggml_diag(
struct ggml_context * ctx,
struct ggml_tensor * a) {
GGML_ASSERT(a->ne[1] == 1);
bool is_node = false;
if (a->grad) {
is_node = true;
}
const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);
result->op = GGML_OP_DIAG;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_diag_mask_inf
static struct ggml_tensor * ggml_diag_mask_inf_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
bool inplace) {
bool is_node = false;
if (a->grad) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
int32_t params[] = { n_past };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_DIAG_MASK_INF;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_diag_mask_inf(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past) {
return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
}
struct ggml_tensor * ggml_diag_mask_inf_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past) {
return ggml_diag_mask_inf_impl(ctx, a, n_past, true);
}
// ggml_diag_mask_zero
static struct ggml_tensor * ggml_diag_mask_zero_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
bool inplace) {
bool is_node = false;
if (a->grad) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
int32_t params[] = { n_past };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_DIAG_MASK_ZERO;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_diag_mask_zero(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past) {
return ggml_diag_mask_zero_impl(ctx, a, n_past, false);
}
struct ggml_tensor * ggml_diag_mask_zero_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past) {
return ggml_diag_mask_zero_impl(ctx, a, n_past, true);
}
// ggml_soft_max
static struct ggml_tensor * ggml_soft_max_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * mask,
struct ggml_tensor * pos,
float scale,
float max_bias,
bool inplace) {
GGML_ASSERT(ggml_is_contiguous(a));
if (mask) {
GGML_ASSERT(ggml_is_contiguous(mask));
GGML_ASSERT(ggml_is_matrix(mask));
GGML_ASSERT(ggml_can_repeat_rows(mask, a));
}
if (pos) {
GGML_ASSERT(ggml_is_vector(pos));
GGML_ASSERT(pos->type == GGML_TYPE_F32);
GGML_ASSERT(pos->ne[0] == a->ne[0]);
}
if (max_bias > 0.0f) {
GGML_ASSERT(pos);
}
bool is_node = false;
if (a->grad) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
float params[] = { scale, max_bias };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_SOFT_MAX;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = mask;
result->src[2] = pos;
return result;
}
struct ggml_tensor * ggml_soft_max(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false);
}
struct ggml_tensor * ggml_soft_max_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true);
}
struct ggml_tensor * ggml_soft_max_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * mask,
struct ggml_tensor * pos,
float scale,
float max_bias) {
return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false);
}
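// Usage sketch (illustrative only; the tensor names below are hypothetical). The
// extended softmax fuses the common attention pattern: the input is scaled, an
// optional mask is added (broadcast across rows), and with max_bias > 0 an
// ALiBi-style bias derived from `pos` is mixed in:
//
//   // scores: [n_kv, n_tokens, n_head], mask: [n_kv, n_tokens]
//   struct ggml_tensor * probs =
//       ggml_soft_max_ext(ctx, scores, mask, NULL, 1.0f/sqrtf(128.0f), 0.0f);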
// ggml_soft_max_back
static struct ggml_tensor * ggml_soft_max_back_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
bool inplace) {
bool is_node = false;
if (a->grad || b->grad) {
is_node = true; // TODO : implement backward pass
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
result->op = GGML_OP_SOFT_MAX_BACK;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
struct ggml_tensor * ggml_soft_max_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
return ggml_soft_max_back_impl(ctx, a, b, false);
}
struct ggml_tensor * ggml_soft_max_back_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
return ggml_soft_max_back_impl(ctx, a, b, true);
}
// ggml_rope
static struct ggml_tensor * ggml_rope_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int n_dims,
int mode,
int n_ctx,
int n_orig_ctx,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow,
float xpos_base,
bool xpos_down,
bool inplace) {
GGML_ASSERT(ggml_is_vector(b));
GGML_ASSERT(b->type == GGML_TYPE_I32);
GGML_ASSERT(a->ne[2] == b->ne[0]);
bool is_node = false;
if (a->grad) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
memcpy(params + 5, &freq_base, sizeof(float));
memcpy(params + 6, &freq_scale, sizeof(float));
memcpy(params + 7, &ext_factor, sizeof(float));
memcpy(params + 8, &attn_factor, sizeof(float));
memcpy(params + 9, &beta_fast, sizeof(float));
memcpy(params + 10, &beta_slow, sizeof(float));
memcpy(params + 11, &xpos_base, sizeof(float));
memcpy(params + 12, &xpos_down, sizeof(bool));
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ROPE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
struct ggml_tensor * ggml_rope(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int n_dims,
int mode,
int n_ctx) {
return ggml_rope_impl(
ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false
);
}
struct ggml_tensor * ggml_rope_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int n_dims,
int mode,
int n_ctx) {
return ggml_rope_impl(
ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true
);
}
struct ggml_tensor * ggml_rope_custom(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int n_dims,
int mode,
int n_ctx,
int n_orig_ctx,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow) {
return ggml_rope_impl(
ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
);
}
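// Usage sketch (illustrative only; `q` and `pos` are hypothetical tensors). RoPE
// rotates pairs of channels of `a` by angles derived from the I32 positions in `b`
// (one position per entry along a->ne[2]). A call with the default frequency settings:
//
//   // q: [128, n_head, n_tokens] queries, pos: [n_tokens] I32 positions
//   struct ggml_tensor * q_rope =
//       ggml_rope_custom(ctx, q, pos, 128, 0 /*mode*/, 0 /*n_ctx*/, 0 /*n_orig_ctx*/,
//                        10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);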
struct ggml_tensor * ggml_rope_custom_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int n_dims,
int mode,
int n_ctx,
int n_orig_ctx,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow) {
return ggml_rope_impl(
ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
);
}
struct ggml_tensor * ggml_rope_xpos_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int n_dims,
float base,
bool down) {
return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
}
// ggml_rope_back
struct ggml_tensor * ggml_rope_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int n_dims,
int mode,
int n_ctx,
int n_orig_ctx,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow,
float xpos_base,
bool xpos_down) {
GGML_ASSERT(ggml_is_vector(b));
GGML_ASSERT(b->type == GGML_TYPE_I32);
GGML_ASSERT(a->ne[2] == b->ne[0]);
GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
bool is_node = false;
if (a->grad) {
is_node = false; // TODO: implement backward
}
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
memcpy(params + 5, &freq_base, sizeof(float));
memcpy(params + 6, &freq_scale, sizeof(float));
memcpy(params + 7, &ext_factor, sizeof(float));
memcpy(params + 8, &attn_factor, sizeof(float));
memcpy(params + 9, &beta_fast, sizeof(float));
memcpy(params + 10, &beta_slow, sizeof(float));
memcpy(params + 11, &xpos_base, sizeof(float));
memcpy(params + 12, &xpos_down, sizeof(bool));
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ROPE_BACK;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
// ggml_alibi
struct ggml_tensor * ggml_alibi(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_head,
float bias_max) {
GGML_ASSERT(n_past >= 0);
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
// TODO: when implementing backward, fix this:
//struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
int32_t op_params[3] = { n_past, n_head };
memcpy(op_params + 2, &bias_max, sizeof(float));
ggml_set_op_params(result, op_params, sizeof(op_params));
result->op = GGML_OP_ALIBI;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_clamp
struct ggml_tensor * ggml_clamp(
struct ggml_context * ctx,
struct ggml_tensor * a,
float min,
float max) {
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
// TODO: when implementing backward, fix this:
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
float params[] = { min, max };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_CLAMP;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_conv_1d
static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
}
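// Worked example (illustrative): an input of length 100 with kernel size 3, stride 2,
// padding 1 and dilation 1 yields (100 + 2*1 - 1*(3 - 1) - 1)/2 + 1 = 99/2 + 1 = 50
// output positions.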
GGML_API struct ggml_tensor * ggml_conv_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int p0,
int d0) {
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
struct ggml_tensor * result =
ggml_mul_mat(ctx,
ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])); // [OC, IC, K] => [OC, IC * K]
result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
return result;
}
// ggml_conv_1d_ph
struct ggml_tensor* ggml_conv_1d_ph(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s,
int d) {
return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
}
// ggml_conv_transpose_1d
static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
}
GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int p0,
int d0) {
GGML_ASSERT(ggml_is_matrix(b));
GGML_ASSERT(a->ne[2] == b->ne[1]);
GGML_ASSERT(a->ne[3] == 1);
GGML_ASSERT(p0 == 0);
GGML_ASSERT(d0 == 1);
bool is_node = false;
if (a->grad || b->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
const int64_t ne[4] = {
ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
a->ne[1], b->ne[2], 1,
};
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
int32_t params[] = { s0, p0, d0 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_CONV_TRANSPOSE_1D;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
// ggml_conv_depthwise
struct ggml_tensor * ggml_conv_depthwise_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1) {
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC, 1, KH, KW] => [1, OC, 1, KH * KW]
struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
return result;
}
// ggml_conv_2d
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
// a: [OC, IC, KH, KW]
// b: [N, IC, IH, IW]
// result: [N, OH, OW, IC*KH*KW]
struct ggml_tensor * ggml_im2col(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1,
bool is_2D,
enum ggml_type dst_type) {
if(is_2D) {
GGML_ASSERT(a->ne[2] == b->ne[2]);
} else {
GGML_ASSERT(a->ne[1] == b->ne[1]);
}
bool is_node = false;
if (a->grad || b->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
const int64_t ne[4] = {
is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
OW,
is_2D ? OH : b->ne[2],
is_2D ? b->ne[3] : 1,
};
struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_IM2COL;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
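// Shape walk-through (illustrative, with hypothetical sizes). For a 2D convolution
// with a: [OC, IC, KH, KW] and b: [N, IC, IH, IW], im2col unrolls every receptive
// field into a row so that ggml_conv_2d() below becomes a single matrix multiplication:
//
//   // IC = 3, OC = 16, KH = KW = 3, IH = IW = 32, N = 1, stride 1, padding 1 (so OH = OW = 32):
//   //   im2col : [N, OH, OW, IC*KH*KW] = [1, 32, 32, 27]
//   //   mul_mat: each of the N*OH*OW = 1024 patches (27 values) is dotted with each of
//   //            the OC = 16 flattened kernels
//   //   result : reshaped and permuted back to [N, OC, OH, OW] = [1, 16, 32, 32]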
// a: [OC, IC, KH, KW]
// b: [N, IC, IH, IW]
// result: [N, OC, OH, OW]
struct ggml_tensor * ggml_conv_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1) {
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N, OH, OW, IC * KH * KW]
struct ggml_tensor * result =
ggml_mul_mat(ctx,
ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC, IC, KH, KW] => [OC, IC * KH * KW]
result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], im2col->ne[3], a->ne[3]); // [OC, N, OH, OW]
result = ggml_cont(ctx, ggml_permute(ctx, result, 0, 1, 3, 2)); // [N, OC, OH, OW]
return result;
}
// ggml_conv_2d_sk_p0
struct ggml_tensor * ggml_conv_2d_sk_p0(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);
}
// ggml_conv_2d_s1_ph
struct ggml_tensor * ggml_conv_2d_s1_ph(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
}
// ggml_conv_transpose_2d_p0
static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
return (ins - 1) * s - 2 * p + ks;
}
struct ggml_tensor * ggml_conv_transpose_2d_p0(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int stride) {
GGML_ASSERT(a->ne[3] == b->ne[2]);
bool is_node = false;
if (a->grad || b->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
const int64_t ne[4] = {
ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/),
ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/),
a->ne[2], b->ne[3],
};
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
ggml_set_op_params_i32(result, 0, stride);
result->op = GGML_OP_CONV_TRANSPOSE_2D;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
// ggml_pool_*
static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
return (ins + 2 * p - ks) / s + 1;
}
// ggml_pool_1d
struct ggml_tensor * ggml_pool_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_op_pool op,
int k0,
int s0,
int p0) {
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
const int64_t ne[4] = {
ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
a->ne[1],
a->ne[2],
a->ne[3],
};
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
int32_t params[] = { op, k0, s0, p0 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_POOL_1D;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_pool_2d
struct ggml_tensor * ggml_pool_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_op_pool op,
int k0,
int k1,
int s0,
int s1,
float p0,
float p1) {
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
struct ggml_tensor * result;
const int64_t ne[3] = {
ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
a->ne[2],
};
result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_POOL_2D;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_upscale
static struct ggml_tensor * ggml_upscale_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
int scale_factor) {
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
a->ne[0] * scale_factor,
a->ne[1] * scale_factor,
a->ne[2], a->ne[3]);
result->op = GGML_OP_UPSCALE;
result->op_params[0] = scale_factor;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_pad(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0, int p1, int p2, int p3) {
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
a->ne[0] + p0,
a->ne[1] + p1,
a->ne[2] + p2,
a->ne[3] + p3);
result->op = GGML_OP_PAD;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_upscale(
struct ggml_context * ctx,
struct ggml_tensor * a,
int scale_factor) {
return ggml_upscale_impl(ctx, a, scale_factor);
}
struct ggml_tensor * ggml_arange(
struct ggml_context * ctx,
float start,
float stop,
float step) {
GGML_ASSERT(stop > start);
const int64_t steps = (int64_t) ceilf((stop - start) / step);
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
result->op = GGML_OP_ARANGE;
ggml_set_op_params_f32(result, 0, start);
ggml_set_op_params_f32(result, 1, stop);
ggml_set_op_params_f32(result, 2, step);
return result;
}
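// Worked example (illustrative): ggml_arange(ctx, 0.0f, 5.0f, 2.0f) allocates
// ceil((5 - 0)/2) = 3 elements and fills them with { 0.0f, 2.0f, 4.0f } when the
// graph is evaluated.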
struct ggml_tensor * ggml_timestep_embedding(
struct ggml_context * ctx,
struct ggml_tensor * timesteps,
int dim,
int max_period) {
bool is_node = false;
if (timesteps->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
int actual_dim = dim;
if (dim % 2 != 0) {
actual_dim = dim + 1;
}
struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
result->op = GGML_OP_TIMESTEP_EMBEDDING;
ggml_set_op_params_i32(result, 0, dim);
ggml_set_op_params_i32(result, 1, max_period);
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = timesteps;
return result;
}
// ggml_argsort
struct ggml_tensor * ggml_argsort(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_sort_order order) {
bool is_node = false;
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
ggml_set_op_params_i32(result, 0, (int32_t) order);
result->op = GGML_OP_ARGSORT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_top_k
struct ggml_tensor * ggml_top_k(
struct ggml_context * ctx,
struct ggml_tensor * a,
int k) {
GGML_ASSERT(a->ne[0] >= k);
struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC);
result = ggml_view_4d(ctx, result,
k, result->ne[1], result->ne[2], result->ne[3],
result->nb[1], result->nb[2], result->nb[3],
0);
return result;
}
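// Usage sketch (illustrative only, not part of the library). ggml_top_k() is a
// descending argsort followed by a view of the first k columns, so it yields the I32
// *indices* of the k largest values per row, not the values themselves:
//
//   struct ggml_tensor * logits = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 32000);
//   struct ggml_tensor * top40  = ggml_top_k(ctx, logits, 40);   // I32 indices, ne[0] == 40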
// ggml_flash_attn
struct ggml_tensor * ggml_flash_attn(
struct ggml_context * ctx,
struct ggml_tensor * q,
struct ggml_tensor * k,
struct ggml_tensor * v,
bool masked) {
GGML_ASSERT(ggml_can_mul_mat(k, q));
// TODO: check if vT can be multiplied by (k*qT)
bool is_node = false;
if (q->grad || k->grad || v->grad) {
is_node = true;
}
//struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);
int32_t t = masked ? 1 : 0;
ggml_set_op_params(result, &t, sizeof(t));
result->op = GGML_OP_FLASH_ATTN;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = q;
result->src[1] = k;
result->src[2] = v;
return result;
}
// ggml_flash_ff
struct ggml_tensor * ggml_flash_ff(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b0,
struct ggml_tensor * b1,
struct ggml_tensor * c0,
struct ggml_tensor * c1) {
GGML_ASSERT(ggml_can_mul_mat(b0, a));
// TODO: more checks
bool is_node = false;
if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) {
is_node = true;
}
//struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
result->op = GGML_OP_FLASH_FF;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b0;
result->src[2] = b1;
result->src[3] = c0;
result->src[4] = c1;
return result;
}
// ggml_flash_attn_back
struct ggml_tensor * ggml_flash_attn_back(
struct ggml_context * ctx,
struct ggml_tensor * q,
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * d,
bool masked) {
GGML_ASSERT(ggml_can_mul_mat(k, q));
// TODO: check if vT can be multiplied by (k*qT)
// d shape [D,N,ne2,ne3]
// q shape [D,N,ne2,ne3]
// k shape [D,M,kvne2,ne3]
// v shape [M,D,kvne2,ne3]
const int64_t D = q->ne[0];
const int64_t N = q->ne[1];
const int64_t M = k->ne[1];
const int64_t ne2 = q->ne[2];
const int64_t ne3 = q->ne[3];
const int64_t kvne2 = k->ne[2];
GGML_ASSERT(k->ne[0] == D);
GGML_ASSERT(v->ne[0] == M);
GGML_ASSERT(v->ne[1] == D);
GGML_ASSERT(d->ne[0] == D);
GGML_ASSERT(d->ne[1] == N);
GGML_ASSERT(k->ne[2] == kvne2);
GGML_ASSERT(k->ne[3] == ne3);
GGML_ASSERT(v->ne[2] == kvne2);
GGML_ASSERT(v->ne[3] == ne3);
GGML_ASSERT(d->ne[2] == ne2);
GGML_ASSERT(d->ne[3] == ne3);
GGML_ASSERT(ne2 % kvne2 == 0);
bool is_node = false;
if (q->grad || k->grad || v->grad) {
// when using this operation (in backwards pass) these grads are set.
// we don't want to create (big) grad of our result, so is_node is false.
is_node = false;
}
// store gradients of q, k and v as contiguous tensors concatenated in result.
// note: v and gradv are actually transposed, i.e. v->ne[0] != D.
const int64_t elem_q = ggml_nelements(q);
const int64_t elem_k = ggml_nelements(k);
const int64_t elem_v = ggml_nelements(v);
enum ggml_type result_type = GGML_TYPE_F32;
GGML_ASSERT(ggml_blck_size(result_type) == 1);
const size_t tsize = ggml_type_size(result_type);
const size_t offs_q = 0;
const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN);
const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN);
const size_t end = offs_v + GGML_PAD(elem_v * tsize, GGML_MEM_ALIGN);
const size_t nelements = (end + tsize - 1)/tsize;
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nelements);
int32_t masked_i = masked ? 1 : 0;
ggml_set_op_params(result, &masked_i, sizeof(masked_i));
result->op = GGML_OP_FLASH_ATTN_BACK;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = q;
result->src[1] = k;
result->src[2] = v;
result->src[3] = d;
return result;
}
// ggml_ssm_conv
struct ggml_tensor * ggml_ssm_conv(
struct ggml_context * ctx,
struct ggml_tensor * s,
struct ggml_tensor * x,
struct ggml_tensor * c,
struct ggml_tensor * sq) {
GGML_ASSERT(ggml_is_3d(s));
GGML_ASSERT(ggml_is_matrix(x));
GGML_ASSERT(ggml_is_matrix(c));
GGML_ASSERT(ggml_is_matrix(sq));
GGML_ASSERT(sq->type == GGML_TYPE_I32);
const int64_t d_conv = c->ne[0];
const int64_t d_inner = c->ne[1];
const int64_t n_tokens = x->ne[1];
const int64_t n_kv = s->ne[2];
GGML_ASSERT( s->ne[0] == d_conv - 1);
GGML_ASSERT( s->ne[1] == d_inner);
GGML_ASSERT( x->ne[0] == d_inner);
GGML_ASSERT(sq->ne[0] == n_kv);
GGML_ASSERT(sq->ne[1] == n_tokens);
bool is_node = false;
if (s->grad || x->grad || c->grad || sq->grad) {
GGML_ASSERT(false); // TODO: implement
is_node = true;
}
// 2-in-1 concatenated x and conv_states, {d_inner, n_tokens} with {d_conv, d_inner, n_kv}
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, (d_inner*n_tokens) + (d_conv*d_inner*n_kv));
result->op = GGML_OP_SSM_CONV;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = s;
result->src[1] = x;
result->src[2] = c;
result->src[3] = sq;
return result;
}
// ggml_ssm_scan
struct ggml_tensor * ggml_ssm_scan(
struct ggml_context * ctx,
struct ggml_tensor * s,
struct ggml_tensor * x,
struct ggml_tensor * dt,
struct ggml_tensor * A,
struct ggml_tensor * B,
struct ggml_tensor * C,
struct ggml_tensor * sq) {
GGML_ASSERT(ggml_is_contiguous(s));
GGML_ASSERT(ggml_is_contiguous(x));
GGML_ASSERT(ggml_is_contiguous(dt));
GGML_ASSERT(ggml_is_contiguous(A));
GGML_ASSERT(sq->type == GGML_TYPE_I32);
GGML_ASSERT(B->nb[0] == ggml_type_size(B->type));
GGML_ASSERT(C->nb[0] == ggml_type_size(C->type));
GGML_ASSERT(ggml_are_same_shape(x, dt));
{
const int64_t d_state = s->ne[0];
const int64_t d_inner = s->ne[1];
const int64_t n_tokens = x->ne[1];
GGML_ASSERT(x->ne[0] == d_inner);
GGML_ASSERT(A->ne[0] == d_state);
GGML_ASSERT(A->ne[1] == d_inner);
GGML_ASSERT(B->ne[0] == d_state);
GGML_ASSERT(B->ne[1] == n_tokens);
GGML_ASSERT(C->ne[0] == d_state);
GGML_ASSERT(C->ne[1] == n_tokens);
}
bool is_node = false;
if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad || sq->grad) {
GGML_ASSERT(false); // TODO: implement
is_node = true;
}
// 2-in-1 concatenated y and ssm_states, {d_inner, n_tokens} with {d_state, d_inner, n_kv}
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + ggml_nelements(s));
result->op = GGML_OP_SSM_SCAN;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = s;
result->src[1] = x;
result->src[2] = dt;
result->src[3] = A;
result->src[4] = B;
result->src[5] = C;
result->src[6] = sq;
return result;
}
// ggml_win_part
struct ggml_tensor * ggml_win_part(
struct ggml_context * ctx,
struct ggml_tensor * a,
int w) {
GGML_ASSERT(a->ne[3] == 1);
GGML_ASSERT(a->type == GGML_TYPE_F32);
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
// padding
const int px = (w - a->ne[1]%w)%w;
const int py = (w - a->ne[2]%w)%w;
const int npx = (px + a->ne[1])/w;
const int npy = (py + a->ne[2])/w;
const int np = npx*npy;
const int64_t ne[4] = { a->ne[0], w, w, np, };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
int32_t params[] = { npx, npy, w };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_WIN_PART;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_win_unpart
struct ggml_tensor * ggml_win_unpart(
struct ggml_context * ctx,
struct ggml_tensor * a,
int w0,
int h0,
int w) {
GGML_ASSERT(a->type == GGML_TYPE_F32);
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
int32_t params[] = { w };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_WIN_UNPART;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_get_rel_pos
struct ggml_tensor * ggml_get_rel_pos(
struct ggml_context * ctx,
struct ggml_tensor * a,
int qh,
int kh) {
GGML_ASSERT(qh == kh);
GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]);
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
const int64_t ne[4] = { a->ne[0], kh, qh, 1, };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne);
result->op = GGML_OP_GET_REL_POS;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
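// Shape example implied by the asserts above: for qh == kh == 14 the input must have
// a->ne[1] == 2*14 - 1 == 27 relative positions, and the result is an F16 tensor of
// shape {a->ne[0], 14, 14}.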
// ggml_add_rel_pos
static struct ggml_tensor * ggml_add_rel_pos_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * pw,
struct ggml_tensor * ph,
bool inplace) {
GGML_ASSERT(ggml_are_same_shape(pw, ph));
GGML_ASSERT(ggml_is_contiguous(a));
GGML_ASSERT(ggml_is_contiguous(pw));
GGML_ASSERT(ggml_is_contiguous(ph));
GGML_ASSERT(ph->type == GGML_TYPE_F32);
GGML_ASSERT(pw->type == GGML_TYPE_F32);
GGML_ASSERT(pw->ne[3] == a->ne[2]);
GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]);
GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]);
bool is_node = false;
if (!inplace && (a->grad || pw->grad || ph->grad)) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
result->op = GGML_OP_ADD_REL_POS;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = pw;
result->src[2] = ph;
return result;
}
struct ggml_tensor * ggml_add_rel_pos(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * pw,
struct ggml_tensor * ph) {
return ggml_add_rel_pos_impl(ctx, a, pw, ph, false);
}
struct ggml_tensor * ggml_add_rel_pos_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * pw,
struct ggml_tensor * ph) {
return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
}
// ggml_unary
static struct ggml_tensor * ggml_unary_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_unary_op op,
bool inplace) {
bool is_node = false;
if (!inplace && (a->grad)) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
ggml_set_op_params_i32(result, 0, (int32_t) op);
result->op = GGML_OP_UNARY;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_unary(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_unary_op op) {
return ggml_unary_impl(ctx, a, op, false);
}
struct ggml_tensor * ggml_unary_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_unary_op op) {
return ggml_unary_impl(ctx, a, op, true);
}
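// Example (illustrative; ctx and a are assumed to be prepared by the caller): applying
// GELU through the generic unary wrapper:
//
//   struct ggml_tensor * b = ggml_unary(ctx, a, GGML_UNARY_OP_GELU);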
// ggml_map_unary
static struct ggml_tensor * ggml_map_unary_impl_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
const ggml_unary_op_f32_t fun,
bool inplace) {
bool is_node = false;
if (!inplace && a->grad) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
result->op = GGML_OP_MAP_UNARY;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_map_unary_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
const ggml_unary_op_f32_t fun) {
return ggml_map_unary_impl_f32(ctx, a, fun, false);
}
struct ggml_tensor * ggml_map_unary_inplace_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
const ggml_unary_op_f32_t fun) {
return ggml_map_unary_impl_f32(ctx, a, fun, true);
}
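// Example of a caller-supplied f32 unary op (illustrative sketch; scale_by_two is a
// hypothetical user function, not part of ggml):
//
//   static void scale_by_two(const int n, float * dst, const float * src) {
//       for (int i = 0; i < n; ++i) {
//           dst[i] = 2.0f*src[i];
//       }
//   }
//
//   struct ggml_tensor * b = ggml_map_unary_f32(ctx, a, scale_by_two);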
// ggml_map_binary
static struct ggml_tensor * ggml_map_binary_impl_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
const ggml_binary_op_f32_t fun,
bool inplace) {
GGML_ASSERT(ggml_are_same_shape(a, b));
bool is_node = false;
if (!inplace && (a->grad || b->grad)) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
result->op = GGML_OP_MAP_BINARY;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
struct ggml_tensor * ggml_map_binary_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
const ggml_binary_op_f32_t fun) {
return ggml_map_binary_impl_f32(ctx, a, b, fun, false);
}
struct ggml_tensor * ggml_map_binary_inplace_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
const ggml_binary_op_f32_t fun) {
return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
}
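// Example of a caller-supplied f32 binary op (illustrative sketch; elementwise_max is
// a hypothetical user function, not part of ggml):
//
//   static void elementwise_max(const int n, float * dst, const float * x, const float * y) {
//       for (int i = 0; i < n; ++i) {
//           dst[i] = x[i] > y[i] ? x[i] : y[i];
//       }
//   }
//
//   struct ggml_tensor * c = ggml_map_binary_f32(ctx, a, b, elementwise_max);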
// ggml_map_custom1_f32
static struct ggml_tensor * ggml_map_custom1_impl_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
const ggml_custom1_op_f32_t fun,
bool inplace) {
bool is_node = false;
if (!inplace && a->grad) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
result->op = GGML_OP_MAP_CUSTOM1_F32;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_map_custom1_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
const ggml_custom1_op_f32_t fun) {
return ggml_map_custom1_impl_f32(ctx, a, fun, false);
}
struct ggml_tensor * ggml_map_custom1_inplace_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
const ggml_custom1_op_f32_t fun) {
return ggml_map_custom1_impl_f32(ctx, a, fun, true);
}
// ggml_map_custom2_f32
static struct ggml_tensor * ggml_map_custom2_impl_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
const ggml_custom2_op_f32_t fun,
bool inplace) {
bool is_node = false;
if (!inplace && (a->grad || b->grad)) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
result->op = GGML_OP_MAP_CUSTOM2_F32;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
struct ggml_tensor * ggml_map_custom2_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
const ggml_custom2_op_f32_t fun) {
return ggml_map_custom2_impl_f32(ctx, a, b, fun, false);
}
struct ggml_tensor * ggml_map_custom2_inplace_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
const ggml_custom2_op_f32_t fun) {
return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
}
// ggml_map_custom3_f32
static struct ggml_tensor * ggml_map_custom3_impl_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
const ggml_custom3_op_f32_t fun,
bool inplace) {
bool is_node = false;
if (!inplace && (a->grad || b->grad || c->grad)) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
result->op = GGML_OP_MAP_CUSTOM3_F32;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
result->src[2] = c;
return result;
}
struct ggml_tensor * ggml_map_custom3_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
const ggml_custom3_op_f32_t fun) {
return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false);
}
struct ggml_tensor * ggml_map_custom3_inplace_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
const ggml_custom3_op_f32_t fun) {
return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
}
// ggml_map_custom1
struct ggml_map_custom1_op_params {
ggml_custom1_op_t fun;
int n_tasks;
void * userdata;
};
static struct ggml_tensor * ggml_map_custom1_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
const ggml_custom1_op_t fun,
int n_tasks,
void * userdata,
bool inplace) {
GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
bool is_node = false;
if (!inplace && a->grad) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
struct ggml_map_custom1_op_params params = {
/*.fun =*/ fun,
/*.n_tasks =*/ n_tasks,
/*.userdata =*/ userdata
};
ggml_set_op_params(result, (const void *) &params, sizeof(params));
result->op = GGML_OP_MAP_CUSTOM1;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
struct ggml_tensor * ggml_map_custom1(
struct ggml_context * ctx,
struct ggml_tensor * a,
const ggml_custom1_op_t fun,
int n_tasks,
void * userdata) {
return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false);
}
struct ggml_tensor * ggml_map_custom1_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
const ggml_custom1_op_t fun,
int n_tasks,
void * userdata) {
return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true);
}
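// Illustrative sketch of the threaded custom op API (clamp_op and limit are hypothetical
// caller-side names). The callback receives the thread index ith and thread count nth and
// should process only its own slice; userdata must stay valid until the graph is computed.
// The sketch assumes a contiguous F32 tensor.
//
//   static void clamp_op(struct ggml_tensor * dst, const struct ggml_tensor * a,
//                        int ith, int nth, void * userdata) {
//       const float limit = *(const float *) userdata;
//       const int64_t n   = ggml_nelements(dst);
//       const int64_t dr  = (n + nth - 1)/nth;
//       const int64_t i0  = dr*ith;
//       const int64_t i1  = i0 + dr < n ? i0 + dr : n;
//       const float * src = (const float *) a->data;
//       float       * out = (float *) dst->data;
//       for (int64_t i = i0; i < i1; ++i) {
//           out[i] = src[i] >  limit ?  limit :
//                    src[i] < -limit ? -limit : src[i];
//       }
//   }
//
//   float limit = 5.0f;
//   struct ggml_tensor * b = ggml_map_custom1(ctx, a, clamp_op, GGML_N_TASKS_MAX, &limit);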
// ggml_map_custom2
struct ggml_map_custom2_op_params {
ggml_custom2_op_t fun;
int n_tasks;
void * userdata;
};
static struct ggml_tensor * ggml_map_custom2_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
const ggml_custom2_op_t fun,
int n_tasks,
void * userdata,
bool inplace) {
GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
bool is_node = false;
if (!inplace && (a->grad || b->grad)) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
struct ggml_map_custom2_op_params params = {
/*.fun =*/ fun,
/*.n_tasks =*/ n_tasks,
/*.userdata =*/ userdata
};
ggml_set_op_params(result, (const void *) &params, sizeof(params));
result->op = GGML_OP_MAP_CUSTOM2;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
struct ggml_tensor * ggml_map_custom2(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
const ggml_custom2_op_t fun,
int n_tasks,
void * userdata) {
return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false);
}
struct ggml_tensor * ggml_map_custom2_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
const ggml_custom2_op_t fun,
int n_tasks,
void * userdata) {
return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true);
}
// ggml_map_custom3
struct ggml_map_custom3_op_params {
ggml_custom3_op_t fun;
int n_tasks;
void * userdata;
};
static struct ggml_tensor * ggml_map_custom3_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
const ggml_custom3_op_t fun,
int n_tasks,
void * userdata,
bool inplace) {
GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
bool is_node = false;
if (!inplace && (a->grad || b->grad || c->grad)) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
struct ggml_map_custom3_op_params params = {
/*.fun =*/ fun,
/*.n_tasks =*/ n_tasks,
/*.userdata =*/ userdata
};
ggml_set_op_params(result, (const void *) &params, sizeof(params));
result->op = GGML_OP_MAP_CUSTOM3;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
result->src[2] = c;
return result;
}
struct ggml_tensor * ggml_map_custom3(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
const ggml_custom3_op_t fun,
int n_tasks,
void * userdata) {
return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false);
}
struct ggml_tensor * ggml_map_custom3_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
const ggml_custom3_op_t fun,
int n_tasks,
void * userdata) {
return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
}
// ggml_cross_entropy_loss
struct ggml_tensor * ggml_cross_entropy_loss(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
GGML_ASSERT(ggml_are_same_shape(a, b));
bool is_node = false;
if (a->grad || b->grad) {
is_node = true;
}
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1);
result->op = GGML_OP_CROSS_ENTROPY_LOSS;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
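// Usage sketch (illustrative; logits and labels are assumed to be same-shape F32 tensors
// prepared by the caller): the result is a single-element tensor holding the scalar loss.
//
//   struct ggml_tensor * loss = ggml_cross_entropy_loss(ctx, logits, labels);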
// ggml_cross_entropy_loss_back
struct ggml_tensor * ggml_cross_entropy_loss_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c) {
GGML_ASSERT(ggml_are_same_shape(a, b));
GGML_ASSERT(ggml_is_scalar(c));
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK;
result->grad = NULL;
result->src[0] = a;
result->src[1] = b;
result->src[2] = c;
return result;
}
////////////////////////////////////////////////////////////////////////////////
void ggml_set_param(
struct ggml_context * ctx,
struct ggml_tensor * tensor) {
tensor->flags |= GGML_TENSOR_FLAG_PARAM;
GGML_ASSERT(tensor->grad == NULL);
tensor->grad = ggml_dup_tensor(ctx, tensor);
ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
}
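// Minimal training-graph sketch (illustrative only; w, x and labels are hypothetical
// caller-side tensors): parameters are marked before the graph is built so that
// ggml_build_backward_expand can propagate gradients into tensor->grad.
//
//   ggml_set_param(ctx, w); // creates w->grad
//   struct ggml_tensor * loss = ggml_cross_entropy_loss(ctx, ggml_mul_mat(ctx, w, x), labels);
//   struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads=*/true);
//   ggml_build_forward_expand(gf, loss);
//   struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
//   ggml_build_backward_expand(ctx, gf, gb, /*keep=*/false);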
// ggml_compute_forward_dup
static void ggml_compute_forward_dup_same_cont(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == dst->type);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const size_t nb00 = src0->nb[0];
const size_t nb0 = dst->nb[0];
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
// parallelize by elements
const int ne = ggml_nelements(dst);
const int dr = (ne + nth - 1) / nth;
const int ie0 = dr * ith;
const int ie1 = MIN(ie0 + dr, ne);
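// e.g. ne = 10 elements with nth = 4 threads gives dr = 3 and per-thread ranges
// [0,3), [3,6), [6,9), [9,10)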
if (ie0 < ie1) {
memcpy(
((char *) dst->data + ie0*nb0),
((char *) src0->data + ie0*nb00),
(ie1 - ie0) * ggml_type_size(src0->type));
}
}
static void ggml_compute_forward_dup_f16(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_TENSOR_UNARY_OP_LOCALS
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
}
// parallelize by rows
const int nr = ne01;
// number of rows per thread
const int dr = (nr + nth - 1) / nth;
// row range for this thread
const int ir0 = dr * ith;
const int ir1 = MIN(ir0 + dr, nr);
if (src0->type == dst->type &&
ne00 == ne0 &&
nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
// copy by rows
const size_t rs = ne00*nb00;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = ir0; i01 < ir1; i01++) {
memcpy(
((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
rs);
}
}
}
return;
}
// TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
if (ggml_is_contiguous(dst)) {
if (nb00 == sizeof(ggml_fp16_t)) {
if (dst->type == GGML_TYPE_F16) {
size_t id = 0;
const size_t rs = ne00 * nb00;
char * dst_ptr = (char *) dst->data;
for (int i03 = 0; i03 < ne03; i03++) {
for (int i02 = 0; i02 < ne02; i02++) {
id += rs * ir0;
for (int i01 = ir0; i01 < ir1; i01++) {
const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
memcpy(dst_ptr + id, src0_ptr, rs);
id += rs;
}
id += rs * (ne01 - ir1);
}
}
} else if (dst->type == GGML_TYPE_F32) {
size_t id = 0;
float * dst_ptr = (float *) dst->data;
for (int i03 = 0; i03 < ne03; i03++) {
for (int i02 = 0; i02 < ne02; i02++) {
id += ne00 * ir0;
for (int i01 = ir0; i01 < ir1; i01++) {
const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
for (int i00 = 0; i00 < ne00; i00++) {
dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]);
id++;
}
}
id += ne00 * (ne01 - ir1);
}
}
} else if (type_traits[dst->type].from_float) {
ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
size_t id = 0;
size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
char * dst_ptr = (char *) dst->data;
for (int i03 = 0; i03 < ne03; i03++) {
for (int i02 = 0; i02 < ne02; i02++) {
id += rs * ir0;
for (int i01 = ir0; i01 < ir1; i01++) {
const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
for (int i00 = 0; i00 < ne00; i00++) {
src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]);
}
quantize_row_q(src0_f32, dst_ptr + id, ne00);
id += rs;
}
id += rs * (ne01 - ir1);
}
}
} else {
GGML_ASSERT(false); // TODO: implement
}
} else {
//printf("%s: this is not optimal - fix me\n", __func__);
if (dst->type == GGML_TYPE_F32) {
size_t id = 0;
float * dst_ptr = (float *) dst->data;
for (int i03 = 0; i03 < ne03; i03++) {
for (int i02 = 0; i02 < ne02; i02++) {
id += ne00 * ir0;
for (int i01 = ir0; i01 < ir1; i01++) {
for (int i00 = 0; i00 < ne00; i00++) {
const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
id++;
}
}
id += ne00 * (ne01 - ir1);
}
}
} else if (dst->type == GGML_TYPE_F16) {
size_t id = 0;
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
for (int i03 = 0; i03 < ne03; i03++) {
for (int i02 = 0; i02 < ne02; i02++) {
id += ne00 * ir0;
for (int i01 = ir0; i01 < ir1; i01++) {
for (int i00 = 0; i00 < ne00; i00++) {
const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
dst_ptr[id] = *src0_ptr;
id++;
}
}
id += ne00 * (ne01 - ir1);
}
}
} else {
GGML_ASSERT(false); // TODO: implement
}
}
return;
}
// dst counters
int64_t i10 = 0;
int64_t i11 = 0;
int64_t i12 = 0;
int64_t i13 = 0;
if (dst->type == GGML_TYPE_F16) {
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
i10 += ne00 * ir0;
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
for (int64_t i01 = ir0; i01 < ir1; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
if (++i10 == ne00) {
i10 = 0;
if (++i11 == ne01) {
i11 = 0;
if (++i12 == ne02) {
i12 = 0;
if (++i13 == ne03) {
i13 = 0;
}
}
}
}
}
}
i10 += ne00 * (ne01 - ir1);
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
} else if (dst->type == GGML_TYPE_F32) {
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
i10 += ne00 * ir0;
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
for (int64_t i01 = ir0; i01 < ir1; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
*(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
if (++i10 == ne0) {
i10 = 0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
i10 += ne00 * (ne01 - ir1);
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
} else {
GGML_ASSERT(false); // TODO: implement
}
}
static void ggml_compute_forward_dup_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_TENSOR_UNARY_OP_LOCALS
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
}
// parallelize by rows
const int nr = ne01;
// number of rows per thread
const int dr = (nr + nth - 1) / nth;
// row range for this thread
const int ir0 = dr * ith;
const int ir1 = MIN(ir0 + dr, nr);
if (src0->type == dst->type &&
ne00 == ne0 &&
nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
// copy by rows
const size_t rs = ne00*nb00;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = ir0; i01 < ir1; i01++) {
memcpy(
((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
rs);
}
}
}
return;
}
if (ggml_is_contiguous(dst)) {
// TODO: simplify
if (nb00 == sizeof(float)) {
if (dst->type == GGML_TYPE_F32) {
size_t id = 0;
const size_t rs = ne00 * nb00;
char * dst_ptr = (char *) dst->data;
for (int i03 = 0; i03 < ne03; i03++) {
for (int i02 = 0; i02 < ne02; i02++) {
id += rs * ir0;
for (int i01 = ir0; i01 < ir1; i01++) {
const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
memcpy(dst_ptr + id, src0_ptr, rs);
id += rs;
}
id += rs * (ne01 - ir1);
}
}
} else if (type_traits[dst->type].from_float) {
ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
size_t id = 0;
size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
char * dst_ptr = (char *) dst->data;
for (int i03 = 0; i03 < ne03; i03++) {
for (int i02 = 0; i02 < ne02; i02++) {
id += rs * ir0;
for (int i01 = ir0; i01 < ir1; i01++) {
const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
quantize_row_q(src0_ptr, dst_ptr + id, ne00);
id += rs;
}
id += rs * (ne01 - ir1);
}
}
} else {
GGML_ASSERT(false); // TODO: implement
}
} else {
//printf("%s: this is not optimal - fix me\n", __func__);
if (dst->type == GGML_TYPE_F32) {
size_t id = 0;
float * dst_ptr = (float *) dst->data;
for (int i03 = 0; i03 < ne03; i03++) {
for (int i02 = 0; i02 < ne02; i02++) {
id += ne00 * ir0;
for (int i01 = ir0; i01 < ir1; i01++) {
for (int i00 = 0; i00 < ne00; i00++) {
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
dst_ptr[id] = *src0_ptr;
id++;
}
}
id += ne00 * (ne01 - ir1);
}
}
} else if (dst->type == GGML_TYPE_F16) {
size_t id = 0;
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
for (int i03 = 0; i03 < ne03; i03++) {
for (int i02 = 0; i02 < ne02; i02++) {
id += ne00 * ir0;
for (int i01 = ir0; i01 < ir1; i01++) {
for (int i00 = 0; i00 < ne00; i00++) {
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
id++;
}
}
id += ne00 * (ne01 - ir1);
}
}
} else {
GGML_ASSERT(false); // TODO: implement
}
}
return;
}
// dst counters
int64_t i10 = 0;
int64_t i11 = 0;
int64_t i12 = 0;
int64_t i13 = 0;
if (dst->type == GGML_TYPE_F32) {
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
i10 += ne00 * ir0;
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
for (int64_t i01 = ir0; i01 < ir1; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
memcpy(dst_ptr, src0_ptr, sizeof(float));
if (++i10 == ne0) {
i10 = 0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
i10 += ne00 * (ne01 - ir1);
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
} else if (dst->type == GGML_TYPE_F16) {
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
i10 += ne00 * ir0;
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
for (int64_t i01 = ir0; i01 < ir1; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
*(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
if (++i10 == ne0) {
i10 = 0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
i10 += ne00 * (ne01 - ir1);
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
} else {
GGML_ASSERT(false); // TODO: implement
}
}
// A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy.
static void ggml_compute_forward_dup_bytes(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
GGML_ASSERT(src0->type == dst->type);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
}
GGML_TENSOR_UNARY_OP_LOCALS;
const size_t type_size = ggml_type_size(src0->type);
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
// parallelize by rows
const int nr = ne01;
// number of rows per thread
const int dr = (nr + nth - 1) / nth;
// row range for this thread
const int ir0 = dr * ith;
const int ir1 = MIN(ir0 + dr, nr);
if (src0->type == dst->type &&
ne00 == ne0 &&
nb00 == type_size && nb0 == type_size) {
// copy by rows
const size_t rs = ne00 * type_size;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = ir0; i01 < ir1; i01++) {
memcpy(
((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
rs);
}
}
}
return;
}
if (ggml_is_contiguous(dst)) {
size_t id = 0;
char * dst_ptr = (char *) dst->data;
const size_t rs = ne00 * type_size;
if (nb00 == type_size) {
// src0 is contiguous on first dimension, copy by rows
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
id += rs * ir0;
for (int64_t i01 = ir0; i01 < ir1; i01++) {
const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
memcpy(dst_ptr + id, src0_ptr, rs);
id += rs;
}
id += rs * (ne01 - ir1);
}
}
} else {
//printf("%s: this is not optimal - fix me\n", __func__);
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
id += rs * ir0;
for (int64_t i01 = ir0; i01 < ir1; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03;
memcpy(dst_ptr + id, src0_ptr, type_size);
id += type_size;
}
}
id += rs * (ne01 - ir1);
}
}
}
return;
}
// dst counters
int64_t i10 = 0;
int64_t i11 = 0;
int64_t i12 = 0;
int64_t i13 = 0;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
i10 += ne00 * ir0;
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
for (int64_t i01 = ir0; i01 < ir1; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
memcpy(dst_ptr, src0_ptr, type_size);
if (++i10 == ne0) {
i10 = 0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
i10 += ne00 * (ne01 - ir1);
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
}
static void ggml_compute_forward_dup(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
if (src0->type == dst->type) {
ggml_compute_forward_dup_bytes(params, dst);
return;
}
switch (src0->type) {
case GGML_TYPE_F16:
{
ggml_compute_forward_dup_f16(params, dst);
} break;
case GGML_TYPE_F32:
{
ggml_compute_forward_dup_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_add
static void ggml_compute_forward_add_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int ith = params->ith;
const int nth = params->nth;
#ifdef GGML_USE_CLBLAST
if (src1->backend == GGML_BACKEND_TYPE_GPU) {
// TODO: make the OpenCL kernel support full broadcast
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
if (ith == 0) {
ggml_cl_add(src0, src1, dst);
}
return;
}
#endif
const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float));
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
if (nb10 == sizeof(float)) {
for (int ir = ir0; ir < ir1; ++ir) {
// src1 is broadcastable across src0 and dst in i1, i2, i3
const int64_t i03 = ir/(ne02*ne01);
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
const int64_t nr0 = ne00 / ne10;
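// e.g. src0 = {4096, 32, 8, 1} with src1 = {4096, 32, 1, 1}: i13 = i12 = 0 and nr0 = 1,
// so the 32 rows of src1 are re-used (broadcast) across all 8 planes of src0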
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
for (int64_t r = 0; r < nr0; ++r) {
#ifdef GGML_USE_ACCELERATE
vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
#else
ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
#endif
}
}
} else {
// src1 is not contiguous
for (int ir = ir0; ir < ir1; ++ir) {
// src1 is broadcastable across src0 and dst in i1, i2, i3
const int64_t i03 = ir/(ne02*ne01);
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
for (int64_t i0 = 0; i0 < ne0; ++i0) {
const int64_t i10 = i0 % ne10;
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
}
}
}
}
static void ggml_compute_forward_add_f16_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
if (dst->type == GGML_TYPE_F32) {
GGML_ASSERT( nb0 == sizeof(float));
}
else {
GGML_ASSERT(dst->type == GGML_TYPE_F16);
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
}
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
if (nb10 == sizeof(float)) {
if (dst->type == GGML_TYPE_F16) {
for (int ir = ir0; ir < ir1; ++ir) {
// src0, src1 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
for (int i = 0; i < ne0; i++) {
dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
}
}
} else {
for (int ir = ir0; ir < ir1; ++ir) {
// src0, src1 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
for (int i = 0; i < ne0; i++) {
dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
}
}
}
}
else {
// src1 is not contiguous
GGML_ASSERT(false);
}
}
static void ggml_compute_forward_add_f16_f16(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F16);
GGML_ASSERT(dst->type == GGML_TYPE_F16);
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
if (nb10 == sizeof(ggml_fp16_t)) {
for (int ir = ir0; ir < ir1; ++ir) {
// src0, src1 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
for (int i = 0; i < ne0; i++) {
dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(src1_ptr[i]));
}
}
}
else {
// src1 is not contiguous
GGML_ASSERT(false);
}
}
static void ggml_compute_forward_add_q_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith;
const int nth = params->nth;
const enum ggml_type type = src0->type;
const enum ggml_type dtype = dst->type;
ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
ggml_from_float_t const quantize_row_q = type_traits[dtype].from_float;
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type));
GGML_ASSERT(nb10 == sizeof(float));
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
GGML_ASSERT(ggml_is_quantized(src0->type));
GGML_ASSERT(src1->type == GGML_TYPE_F32);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
for (int ir = ir0; ir < ir1; ++ir) {
// src0 indices
const int i03 = ir/(ne02*ne01);
const int i02 = (ir - i03*ne02*ne01)/ne01;
const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
// src1 and dst are same shape as src0 => same indices
const int i13 = i03;
const int i12 = i02;
const int i11 = i01;
const int i3 = i03;
const int i2 = i02;
const int i1 = i01;
void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
assert(ne00 % 32 == 0);
// unquantize row from src0 to temp buffer
dequantize_row_q(src0_row, wdata, ne00);
// add src1
ggml_vec_acc_f32(ne00, wdata, src1_row);
// quantize row to dst
if (quantize_row_q != NULL) {
quantize_row_q(wdata, dst_row, ne00);
} else {
memcpy(dst_row, wdata, ne0*nb0);
}
}
}
static void ggml_compute_forward_add(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
switch (src0->type) {
case GGML_TYPE_F32:
{
if (src1->type == GGML_TYPE_F32) {
ggml_compute_forward_add_f32(params, dst);
}
else {
GGML_ASSERT(false);
}
} break;
case GGML_TYPE_F16:
{
if (src1->type == GGML_TYPE_F16) {
ggml_compute_forward_add_f16_f16(params, dst);
}
else if (src1->type == GGML_TYPE_F32) {
ggml_compute_forward_add_f16_f32(params, dst);
}
else {
GGML_ASSERT(false);
}
} break;
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
{
ggml_compute_forward_add_q_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_add1
static void ggml_compute_forward_add1_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_scalar(src1));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(src0);
GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float));
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int ir = ir0; ir < ir1; ++ir) {
// src0 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
#ifdef GGML_USE_ACCELERATE
UNUSED(ggml_vec_add1_f32);
vDSP_vadd(
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
(float *) ((char *) src1->data), 0,
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
ne0);
#else
ggml_vec_add1_f32(ne0,
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
*(float *) src1->data);
#endif
}
}
static void ggml_compute_forward_add1_f16_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_scalar(src1));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
// scalar to add
const float v = *(float *) src1->data;
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(src0);
GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F16);
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int ir = ir0; ir < ir1; ++ir) {
// src0 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
for (int i = 0; i < ne0; i++) {
dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v);
}
}
}
static void ggml_compute_forward_add1_f16_f16(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_scalar(src1));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
// scalar to add
const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data);
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(src0);
GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F16);
GGML_ASSERT(dst->type == GGML_TYPE_F16);
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int ir = ir0; ir < ir1; ++ir) {
// src0 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
for (int i = 0; i < ne0; i++) {
dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v);
}
}
}
static void ggml_compute_forward_add1_q_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_scalar(src1));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
// scalar to add
const float v = *(float *) src1->data;
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(src0);
GGML_TENSOR_UNARY_OP_LOCALS
const enum ggml_type type = src0->type;
ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
// we don't support permuted src0
GGML_ASSERT(nb00 == ggml_type_size(type));
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
GGML_ASSERT(ggml_is_quantized(src0->type));
GGML_ASSERT(dst->type == src0->type);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith;
for (int ir = ir0; ir < ir1; ++ir) {
// src0 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
void * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03));
void * dst_row = (void *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); // note: dim-3 stride is nb3, matching src0_row above
assert(ne0 % 32 == 0);
// unquantize row from src0 to temp buffer
dequantize_row_q(src0_row, wdata, ne0);
// add src1
ggml_vec_acc1_f32(ne0, wdata, v);
// quantize row to dst
quantize_row_q(wdata, dst_row, ne0);
}
}
static void ggml_compute_forward_add1(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_add1_f32(params, dst);
} break;
case GGML_TYPE_F16:
{
if (src1->type == GGML_TYPE_F16) {
ggml_compute_forward_add1_f16_f16(params, dst);
}
else if (src1->type == GGML_TYPE_F32) {
ggml_compute_forward_add1_f16_f32(params, dst);
}
else {
GGML_ASSERT(false);
}
} break;
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
{
ggml_compute_forward_add1_q_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_acc
static void ggml_compute_forward_acc_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
// view src0 and dst with these strides and data offset inbytes during acc
// nb0 is implicitly element_size because src0 and dst are contiguous
size_t nb1 = ((int32_t *) dst->op_params)[0];
size_t nb2 = ((int32_t *) dst->op_params)[1];
size_t nb3 = ((int32_t *) dst->op_params)[2];
size_t offset = ((int32_t *) dst->op_params)[3];
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
if (params->ith != 0) {
return;
}
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
memcpy(
((char *) dst->data),
((char *) src0->data),
ggml_nbytes(dst));
}
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(src1);
const int nc = src1->ne[0];
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
// src0 and dst as viewed during acc
const size_t nb0 = ggml_element_size(src0);
const size_t nb00 = nb0;
const size_t nb01 = nb1;
const size_t nb02 = nb2;
const size_t nb03 = nb3;
GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb0 + (ne11 == 0 ? 0 : ne11-1)*nb1 + (ne12 == 0 ? 0 : ne12-1)*nb2 + (ne13 == 0 ? 0 : ne13-1)*nb3 < ggml_nbytes(dst));
GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb00 + (ne11 == 0 ? 0 : ne11-1)*nb01 + (ne12 == 0 ? 0 : ne12-1)*nb02 + (ne13 == 0 ? 0 : ne13-1)*nb03 < ggml_nbytes(src0));
GGML_ASSERT(nb10 == sizeof(float));
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int ir = ir0; ir < ir1; ++ir) {
// src0 and dst are viewed with shape of src1 and offset
// => same indices
const int i3 = ir/(ne12*ne11);
const int i2 = (ir - i3*ne12*ne11)/ne11;
const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
#ifdef GGML_USE_ACCELERATE
vDSP_vadd(
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), 1,
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), 1, nc);
#else
ggml_vec_add_f32(nc,
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset),
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset),
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
#endif
}
}
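// Illustrative note (added commentary, not from upstream): a worked example of the
// view math above, with made-up sizes. For a contiguous F32 dst of shape [8, 4]
// (row stride nb1 = 8*sizeof(float) = 32 bytes) and a src1 of shape [8, 2],
// accumulating at row 1 means op_params carry nb1 = 32 and offset = 1*nb1 = 32, so
// src1 row i1 is added at byte offset 32 + i1*32 of dst, i.e. into dst rows 1 and 2.
// The inplace flag only decides whether src0 must first be copied into dst during
// the INIT phase, as done at the top of this function.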
static void ggml_compute_forward_acc(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_acc_f32(params, dst);
} break;
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_sub
static void ggml_compute_forward_sub_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float));
if (nb10 == sizeof(float)) {
for (int ir = 0; ir < nr; ++ir) {
// src0, src1 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
#ifdef GGML_USE_ACCELERATE
vDSP_vsub(
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
ne0);
#else
ggml_vec_sub_f32(ne0,
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
#endif
// }
// }
}
} else {
// src1 is not contiguous
for (int ir = 0; ir < nr; ++ir) {
// src0, src1 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
for (int i0 = 0; i0 < ne0; i0++) {
float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
dst_ptr[i0] = src0_ptr[i0] - *src1_ptr;
}
}
}
}
static void ggml_compute_forward_sub(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_sub_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_mul
static void ggml_compute_forward_mul_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int ith = params->ith;
const int nth = params->nth;
#if defined(GGML_USE_CLBLAST)
if (src1->backend == GGML_BACKEND_TYPE_GPU) {
// TODO: OpenCL kernel support full broadcast
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
if (ith == 0) {
ggml_cl_mul(src0, src1, dst);
}
return;
}
#endif
const int64_t nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float));
if (nb10 == sizeof(float)) {
for (int64_t ir = ith; ir < nr; ir += nth) {
// src0 and dst are same shape => same indices
const int64_t i03 = ir/(ne02*ne01);
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
const int64_t nr0 = ne00 / ne10;
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
for (int64_t r = 0 ; r < nr0; ++r) {
#ifdef GGML_USE_ACCELERATE
UNUSED(ggml_vec_mul_f32);
vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
#else
ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
#endif
}
}
} else {
// src1 is not contiguous
for (int64_t ir = ith; ir < nr; ir += nth) {
// src0 and dst are same shape => same indices
// src1 is broadcastable across src0 and dst in i1, i2, i3
const int64_t i03 = ir/(ne02*ne01);
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
for (int64_t i0 = 0; i0 < ne00; ++i0) {
const int64_t i10 = i0 % ne10;
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
}
}
}
}
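// Illustrative sketch (not part of ggml): the broadcast rule used by MUL/DIV above.
// Rows of src1 are selected with a modulo in dims 1..3, and within a row the same
// src1 segment is reused nr0 = ne00/ne10 times, so a src1 of shape [ne10, 1, 1, 1]
// is applied to every row of src0. The sizes and the function name below are
// assumptions for the example only; the block is guarded out of the build.
#if 0
static void example_broadcast_rows(void) {
    const int64_t ne01 = 32, ne11 = 1;      // src0 has 32 rows, src1 has a single row
    for (int64_t i01 = 0; i01 < ne01; ++i01) {
        const int64_t i11 = i01 % ne11;     // always 0: the one src1 row is repeated
        printf("src0 row %2lld <- src1 row %lld\n", (long long) i01, (long long) i11);
    }
}
#endif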
static void ggml_compute_forward_mul(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_mul_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_div
static void ggml_compute_forward_div_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int ith = params->ith;
const int nth = params->nth;
const int64_t nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float));
if (nb10 == sizeof(float)) {
for (int64_t ir = ith; ir < nr; ir += nth) {
// src0 and dst are same shape => same indices
const int64_t i03 = ir/(ne02*ne01);
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
const int64_t nr0 = ne00 / ne10;
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
for (int64_t r = 0; r < nr0; ++r) {
#ifdef GGML_USE_ACCELERATE
UNUSED(ggml_vec_div_f32);
vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
#else
ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
#endif
}
}
} else {
// src1 is not contiguous
for (int64_t ir = ith; ir < nr; ir += nth) {
// src0 and dst are same shape => same indices
// src1 is broadcastable across src0 and dst in i1, i2, i3
const int64_t i03 = ir/(ne02*ne01);
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
for (int64_t i0 = 0; i0 < ne00; ++i0) {
const int64_t i10 = i0 % ne10;
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
}
}
}
}
static void ggml_compute_forward_div(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_div_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_sqr
static void ggml_compute_forward_sqr_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert( dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_sqr_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_sqr(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_sqr_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_sqrt
static void ggml_compute_forward_sqrt_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert( dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_sqrt_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_sqrt(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_sqrt_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_log
static void ggml_compute_forward_log_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(params->ith == 0);
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
GGML_ASSERT( dst->nb[0] == sizeof(float));
GGML_ASSERT(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_log_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_log(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_log_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_sum
static void ggml_compute_forward_sum_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_scalar(dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
assert(ggml_is_scalar(dst));
assert(src0->nb[0] == sizeof(float));
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
ggml_float sum = 0;
ggml_float row_sum = 0;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) {
ggml_vec_sum_f32_ggf(ne00,
&row_sum,
(float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
sum += row_sum;
}
}
}
((float *) dst->data)[0] = sum;
}
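// Illustrative sketch (not part of ggml): why the reduction above accumulates in
// ggml_float (a wider type than float) before the final narrowing store. A float
// accumulator starts dropping small addends much earlier than a double one. The
// function name and counts are assumptions for the example; it is guarded out of
// the build.
#if 0
static void example_accumulator_precision(void) {
    float  sf = 0.0f;
    double sd = 0.0;
    for (int i = 0; i < 100000000; ++i) { // 1e8 additions of 0.1
        sf += 0.1f;
        sd += 0.1f;
    }
    printf("float  accumulator: %f\n", sf); // stalls far below the exact 1e7
    printf("double accumulator: %f\n", sd); // very close to 1e7
}
#endif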
static void ggml_compute_forward_sum_f16(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_scalar(dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
assert(src0->nb[0] == sizeof(ggml_fp16_t));
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
float sum = 0;
float row_sum = 0;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) {
ggml_vec_sum_f16_ggf(ne00,
&row_sum,
(ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
sum += row_sum;
}
}
}
((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
}
static void ggml_compute_forward_sum(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_sum_f32(params, dst);
} break;
case GGML_TYPE_F16:
{
ggml_compute_forward_sum_f16(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_sum_rows
static void ggml_compute_forward_sum_rows_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_ASSERT(src0->nb[0] == sizeof(float));
GGML_ASSERT(dst->nb[0] == sizeof(float));
GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT(ne0 == 1);
GGML_ASSERT(ne1 == ne01);
GGML_ASSERT(ne2 == ne02);
GGML_ASSERT(ne3 == ne03);
for (int64_t i3 = 0; i3 < ne03; i3++) {
for (int64_t i2 = 0; i2 < ne02; i2++) {
for (int64_t i1 = 0; i1 < ne01; i1++) {
float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
float row_sum = 0;
ggml_vec_sum_f32(ne00, &row_sum, src_row);
dst_row[0] = row_sum;
}
}
}
}
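// Illustrative note (added commentary, not from upstream): SUM_ROWS keeps every
// dimension except the first, which collapses to 1. With made-up sizes, a
// [64, 8, 2, 1] F32 src0 yields a [1, 8, 2, 1] dst where
// dst[0, i1, i2, i3] = sum over i0 of src0[i0, i1, i2, i3].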
static void ggml_compute_forward_sum_rows(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_sum_rows_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_mean
static void ggml_compute_forward_mean_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
assert(src0->nb[0] == sizeof(float));
GGML_TENSOR_UNARY_OP_LOCALS
assert(ne0 == 1);
assert(ne1 == ne01);
assert(ne2 == ne02);
assert(ne3 == ne03);
UNUSED(ne0);
UNUSED(ne1);
UNUSED(ne2);
UNUSED(ne3);
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) {
ggml_vec_sum_f32(ne00,
(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
(float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
*(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00;
}
}
}
}
static void ggml_compute_forward_mean(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_mean_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_argmax
static void ggml_compute_forward_argmax_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
assert(src0->nb[0] == sizeof(float));
assert(dst->nb[0] == sizeof(float));
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const size_t nb01 = src0->nb[1];
const size_t nb0 = dst->nb[0];
for (int64_t i1 = 0; i1 < ne01; i1++) {
float * src = (float *) ((char *) src0->data + i1*nb01);
int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0);
int v = 0;
ggml_vec_argmax_f32(ne00, &v, src);
dst_[0] = v;
}
}
static void ggml_compute_forward_argmax(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_argmax_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_repeat
static void ggml_compute_forward_repeat_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(params->ith == 0);
GGML_ASSERT(ggml_can_repeat(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_TENSOR_UNARY_OP_LOCALS
// guaranteed to be an integer due to the check in ggml_can_repeat
const int nr0 = (int)(ne0/ne00);
const int nr1 = (int)(ne1/ne01);
const int nr2 = (int)(ne2/ne02);
const int nr3 = (int)(ne3/ne03);
// TODO: support for transposed / permuted tensors
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float));
// TODO: maybe this is not optimal?
for (int i3 = 0; i3 < nr3; i3++) {
for (int k3 = 0; k3 < ne03; k3++) {
for (int i2 = 0; i2 < nr2; i2++) {
for (int k2 = 0; k2 < ne02; k2++) {
for (int i1 = 0; i1 < nr1; i1++) {
for (int k1 = 0; k1 < ne01; k1++) {
for (int i0 = 0; i0 < nr0; i0++) {
ggml_vec_cpy_f32(ne00,
(float *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0),
(float *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01));
}
}
}
}
}
}
}
}
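// Illustrative sketch (not part of ggml): the tile counts used by REPEAT above are
// just the ratios of dst and src sizes (ggml_can_repeat guarantees they divide
// exactly), so a [2, 3] src0 repeated into a [4, 6] dst is copied 2*2 = 4 times.
// The sizes and the function name are assumptions for the example; guarded out of
// the build.
#if 0
static void example_repeat_counts(void) {
    const int64_t ne00 = 2, ne01 = 3;   // src0: rows of 2 elements, 3 rows
    const int64_t ne0  = 4, ne1  = 6;   // dst
    const int nr0 = (int)(ne0/ne00);    // = 2 copies along dim 0
    const int nr1 = (int)(ne1/ne01);    // = 2 copies along dim 1
    printf("tile counts: %d x %d -> %d copies of src0\n", nr0, nr1, nr0*nr1);
}
#endif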
static void ggml_compute_forward_repeat_f16(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(params->ith == 0);
GGML_ASSERT(ggml_can_repeat(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_TENSOR_UNARY_OP_LOCALS
// guaranteed to be an integer due to the check in ggml_can_repeat
const int nr0 = (int)(ne0/ne00);
const int nr1 = (int)(ne1/ne01);
const int nr2 = (int)(ne2/ne02);
const int nr3 = (int)(ne3/ne03);
// TODO: support for transposed / permuted tensors
GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
// TODO: maybe this is not optimal?
for (int i3 = 0; i3 < nr3; i3++) {
for (int k3 = 0; k3 < ne03; k3++) {
for (int i2 = 0; i2 < nr2; i2++) {
for (int k2 = 0; k2 < ne02; k2++) {
for (int i1 = 0; i1 < nr1; i1++) {
for (int k1 = 0; k1 < ne01; k1++) {
for (int i0 = 0; i0 < nr0; i0++) {
ggml_fp16_t * y = (ggml_fp16_t *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0);
ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01);
// ggml_vec_cpy_f16(ne00, y, x)
for (int i = 0; i < ne00; ++i) {
y[i] = x[i];
}
}
}
}
}
}
}
}
}
static void ggml_compute_forward_repeat(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F16:
case GGML_TYPE_I16:
{
ggml_compute_forward_repeat_f16(params, dst);
} break;
case GGML_TYPE_F32:
case GGML_TYPE_I32:
{
ggml_compute_forward_repeat_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_repeat_back
static void ggml_compute_forward_repeat_back_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(params->ith == 0);
GGML_ASSERT(ggml_can_repeat(dst, src0));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_TENSOR_UNARY_OP_LOCALS
// guaranteed to be an integer due to the check in ggml_can_repeat
const int nr0 = (int)(ne00/ne0);
const int nr1 = (int)(ne01/ne1);
const int nr2 = (int)(ne02/ne2);
const int nr3 = (int)(ne03/ne3);
// TODO: support for transposed / permuted tensors
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float));
if (ggml_is_contiguous(dst)) {
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
} else {
for (int k3 = 0; k3 < ne3; k3++) {
for (int k2 = 0; k2 < ne2; k2++) {
for (int k1 = 0; k1 < ne1; k1++) {
ggml_vec_set_f32(ne0,
(float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3),
0);
}
}
}
}
// TODO: maybe this is not optimal?
for (int i3 = 0; i3 < nr3; i3++) {
for (int k3 = 0; k3 < ne3; k3++) {
for (int i2 = 0; i2 < nr2; i2++) {
for (int k2 = 0; k2 < ne2; k2++) {
for (int i1 = 0; i1 < nr1; i1++) {
for (int k1 = 0; k1 < ne1; k1++) {
for (int i0 = 0; i0 < nr0; i0++) {
ggml_vec_acc_f32(ne0,
(float *) ((char *) dst->data + ( k3)*nb3 + ( k2)*nb2 + ( k1)*nb1),
(float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00));
}
}
}
}
}
}
}
}
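// Illustrative note (added commentary, not from upstream): REPEAT_BACK is the adjoint
// of REPEAT. Where REPEAT copies a [2, 3] tensor into each tile of a [4, 6] tensor,
// REPEAT_BACK first zeroes dst (above) and then adds the four [2, 3] tiles of a
// [4, 6] gradient back into the single [2, 3] accumulator, which is what the chain
// rule requires when an element has been reused in several places.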
static void ggml_compute_forward_repeat_back(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_repeat_back_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_concat
static void ggml_compute_forward_concat_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_BINARY_OP_LOCALS
// TODO: support for transposed / permuted tensors
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float));
GGML_ASSERT(nb10 == sizeof(float));
for (int i3 = 0; i3 < ne3; i3++) {
for (int i2 = ith; i2 < ne2; i2 += nth) {
if (i2 < ne02) { // src0
for (int i1 = 0; i1 < ne1; i1++) {
for (int i0 = 0; i0 < ne0; i0++) {
const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
*y = *x;
}
}
} // src1
else {
for (int i1 = 0; i1 < ne1; i1++) {
for (int i0 = 0; i0 < ne0; i0++) {
const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
*y = *x;
}
}
}
}
}
}
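// Illustrative note (added commentary, not from upstream): this kernel concatenates
// along dimension 2, so dst->ne[2] = src0->ne[2] + src1->ne[2]. With made-up sizes,
// concatenating a [64, 8, 3, 1] src0 with a [64, 8, 5, 1] src1 gives a [64, 8, 8, 1]
// dst where planes i2 < 3 are read from src0 and planes i2 >= 3 from src1 (indexed
// at i2 - ne02), exactly as in the branch above.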
static void ggml_compute_forward_concat(
const struct ggml_compute_params* params,
struct ggml_tensor* dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
case GGML_TYPE_I32:
{
ggml_compute_forward_concat_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_abs
static void ggml_compute_forward_abs_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_abs_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_abs(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_abs_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_sgn
static void ggml_compute_forward_sgn_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_sgn_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_sgn(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_sgn_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_neg
static void ggml_compute_forward_neg_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_neg_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_neg(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_neg_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_step
static void ggml_compute_forward_step_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_step_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_step(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_step_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_tanh
static void ggml_compute_forward_tanh_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_tanh_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_tanh(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_tanh_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_elu
static void ggml_compute_forward_elu_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_elu_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_elu(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_elu_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_relu
static void ggml_compute_forward_relu_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_relu_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_relu(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_relu_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_gelu
static void ggml_compute_forward_gelu_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int ith = params->ith;
const int nth = params->nth;
const int nc = src0->ne[0];
const int nr = ggml_nrows(src0);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int i1 = ir0; i1 < ir1; i1++) {
ggml_vec_gelu_f32(nc,
(float *) ((char *) dst->data + i1*( dst->nb[1])),
(float *) ((char *) src0->data + i1*(src0->nb[1])));
#ifndef NDEBUG
for (int k = 0; k < nc; k++) {
const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
UNUSED(x);
assert(!isnan(x));
assert(!isinf(x));
}
#endif
}
}
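// Illustrative sketch (not part of ggml): the row-partitioning pattern shared by the
// threaded activation kernels above and below. Each of nth threads takes a contiguous
// block of ceil(nr/nth) rows, and the last block is clamped to nr. The function name
// and sizes are assumptions for the example; guarded out of the build.
#if 0
static void example_row_partition(void) {
    const int nr = 10, nth = 4;                 // 10 rows split across 4 threads
    for (int ith = 0; ith < nth; ++ith) {
        const int dr  = (nr + nth - 1)/nth;     // 3 rows per thread, rounded up
        const int ir0 = dr*ith;                 // first row for this thread
        const int ir1 = MIN(ir0 + dr, nr);      // one past the last row (clamped)
        printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1); // [0,3) [3,6) [6,9) [9,10)
    }
}
#endif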
static void ggml_compute_forward_gelu(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_gelu_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_gelu_quick
static void ggml_compute_forward_gelu_quick_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int ith = params->ith;
const int nth = params->nth;
const int nc = src0->ne[0];
const int nr = ggml_nrows(src0);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int i1 = ir0; i1 < ir1; i1++) {
ggml_vec_gelu_quick_f32(nc,
(float *) ((char *) dst->data + i1*( dst->nb[1])),
(float *) ((char *) src0->data + i1*(src0->nb[1])));
#ifndef NDEBUG
for (int k = 0; k < nc; k++) {
const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
UNUSED(x);
assert(!isnan(x));
assert(!isinf(x));
}
#endif
}
}
static void ggml_compute_forward_gelu_quick(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_gelu_quick_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_silu
static void ggml_compute_forward_silu_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int ith = params->ith;
const int nth = params->nth;
const int nc = src0->ne[0];
const int nr = ggml_nrows(src0);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int i1 = ir0; i1 < ir1; i1++) {
ggml_vec_silu_f32(nc,
(float *) ((char *) dst->data + i1*( dst->nb[1])),
(float *) ((char *) src0->data + i1*(src0->nb[1])));
#ifndef NDEBUG
for (int k = 0; k < nc; k++) {
const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
UNUSED(x);
assert(!isnan(x));
assert(!isinf(x));
}
#endif
}
}
static void ggml_compute_forward_silu(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_silu_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_leaky_relu
static void ggml_compute_forward_leaky_relu_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
float negative_slope;
memcpy(&negative_slope, dst->op_params, sizeof(float));
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_leaky_relu_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
}
}
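// Illustrative note (added commentary, not from upstream): the slope travels through
// dst->op_params as raw bytes, which is why it is read back with memcpy above. A
// hypothetical producer side would look like:
//
//   float negative_slope = 0.01f;
//   memcpy(dst->op_params, &negative_slope, sizeof(float));
//
// Using memcpy instead of casting op_params to float * keeps the code free of
// strict-aliasing and alignment assumptions.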
static void ggml_compute_forward_leaky_relu(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_leaky_relu_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_silu_back
static void ggml_compute_forward_silu_back_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * grad = dst->src[1];
GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_are_same_shape(src0, grad));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int ith = params->ith;
const int nth = params->nth;
const int nc = src0->ne[0];
const int nr = ggml_nrows(src0);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int i1 = ir0; i1 < ir1; i1++) {
ggml_vec_silu_backward_f32(nc,
(float *) ((char *) dst->data + i1*( dst->nb[1])),
(float *) ((char *) src0->data + i1*(src0->nb[1])),
(float *) ((char *) grad->data + i1*(grad->nb[1])));
#ifndef NDEBUG
for (int k = 0; k < nc; k++) {
const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
UNUSED(x);
assert(!isnan(x));
assert(!isinf(x));
}
#endif
}
}
static void ggml_compute_forward_silu_back(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_silu_back_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_hardswish
static void ggml_compute_forward_hardswish_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_hardswish_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_hardswish(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_hardswish_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_hardsigmoid
static void ggml_compute_forward_hardsigmoid_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_hardsigmoid_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_hardsigmoid(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_hardsigmoid_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_norm
static void ggml_compute_forward_norm_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
GGML_ASSERT(eps > 0.0f);
// TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
ggml_float sum = 0.0;
for (int64_t i00 = 0; i00 < ne00; i00++) {
sum += (ggml_float)x[i00];
}
float mean = sum/ne00;
float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
ggml_float sum2 = 0.0;
for (int64_t i00 = 0; i00 < ne00; i00++) {
float v = x[i00] - mean;
y[i00] = v;
sum2 += (ggml_float)(v*v);
}
float variance = sum2/ne00;
const float scale = 1.0f/sqrtf(variance + eps);
ggml_vec_scale_f32(ne00, y, scale);
}
}
}
}
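// Illustrative note (added commentary, not from upstream): the loop above is standard
// layer normalization over the first dimension,
//
//   y[i] = (x[i] - mean) / sqrt(variance + eps)
//
// with mean = (1/ne00) * sum_i x[i] and variance = (1/ne00) * sum_i (x[i] - mean)^2,
// accumulated per row in ggml_float before the in-place scale.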
static void ggml_compute_forward_norm(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_norm_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_rms_norm
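// NOTE: RMS normalization scales each row by the reciprocal of its root mean square:
//   y = x / sqrt(mean(x^2) + eps)   (no mean subtraction, unlike ggml_compute_forward_norm)
// e.g. x = [1, 2, 2] -> mean(x^2) = 3, y ~= [0.577, 1.155, 1.155] for small eps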
static void ggml_compute_forward_rms_norm_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
GGML_ASSERT(eps > 0.0f);
// TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
ggml_float sum = 0.0;
for (int64_t i00 = 0; i00 < ne00; i00++) {
sum += (ggml_float)(x[i00] * x[i00]);
}
const float mean = sum/ne00;
float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
memcpy(y, x, ne00 * sizeof(float));
// for (int i00 = 0; i00 < ne00; i00++) {
// y[i00] = x[i00];
// }
const float scale = 1.0f/sqrtf(mean + eps);
ggml_vec_scale_f32(ne00, y, scale);
}
}
}
}
static void ggml_compute_forward_rms_norm(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_rms_norm_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
static void ggml_compute_forward_rms_norm_back_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_BINARY_OP_LOCALS
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
// TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
// src1 is same shape as src0 => same indices
const int64_t i11 = i01;
const int64_t i12 = i02;
const int64_t i13 = i03;
const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);
ggml_float sum_xx = 0.0;
ggml_float sum_xdz = 0.0;
for (int64_t i00 = 0; i00 < ne00; i00++) {
sum_xx += (ggml_float)(x[i00] * x[i00]);
sum_xdz += (ggml_float)(x[i00] * dz[i00]);
}
//const float mean = (float)(sum_xx)/ne00;
const float mean_eps = (float)(sum_xx)/ne00 + eps;
const float sum_eps = (float)(sum_xx) + eps*ne00;
//const float mean_xdz = (float)(sum_xdz)/ne00;
// we could cache rms from forward pass to improve performance.
// to do this implement ggml_rms and compose ggml_rms_norm using ggml_rms.
//const float rms = sqrtf(mean_eps);
const float rrms = 1.0f / sqrtf(mean_eps);
//const float scale = -rrms/(ne00 * mean_eps); // -1/(n*rms**3)
{
// z = rms_norm(x)
//
// rms_norm(src0) =
// scale(
// src0,
// div(
// 1,
// sqrt(
// add(
// scale(
// sum(
// sqr(
// src0)),
// (1.0/N)),
// eps))));
// postorder:
// ## op args grad
// 00 param src0 grad[#00]
// 01 const 1
// 02 sqr (#00) grad[#02]
// 03 sum (#02) grad[#03]
// 04 const 1/N
// 05 scale (#03, #04) grad[#05]
// 06 const eps
// 07 add (#05, #06) grad[#07]
// 08 sqrt (#07) grad[#08]
// 09 div (#01,#08) grad[#09]
// 10 scale (#00,#09) grad[#10]
//
// backward pass, given grad[#10]
// #10: scale
// grad[#00] += scale(grad[#10],#09)
// grad[#09] += sum(mul(grad[#10],#00))
// #09: div
// grad[#08] += neg(mul(grad[#09], div(#09,#08)))
// #08: sqrt
// grad[#07] += mul(grad[#08], div(0.5, #08))
// #07: add
// grad[#05] += grad[#07]
// #05: scale
// grad[#03] += scale(grad[#05],#04)
// #03: sum
// grad[#02] += repeat(grad[#03], #02)
// #02:
// grad[#00] += scale(mul(#00, grad[#02]), 2.0)
//
// substitute and simplify:
// grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0)
// grad[#02] = repeat(grad[#03], #02)
// grad[#02] = repeat(scale(grad[#05],#04), #02)
// grad[#02] = repeat(scale(grad[#07],#04), #02)
// grad[#02] = repeat(scale(mul(grad[#08], div(0.5, #08)),#04), #02)
// grad[#02] = repeat(scale(mul(neg(mul(grad[#09], div(#09,#08))), div(0.5, #08)),#04), #02)
// grad[#02] = repeat(scale(mul(neg(mul(sum(mul(grad[#10],#00)), div(#09,#08))), div(0.5, #08)),#04), #02)
// grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(#09,#08) * div(0.5, #08) * (1/N)), #02)
// grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(div(#01,#08),#08) * div(0.5, #08) * (1/N)), #02)
// grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#08*#08) * div(0.5, #08) * (1/N)), #02)
// grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)
// grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0)
// grad[#00] = scale(grad(#10), #09) + scale(mul(#00, repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)), 2.0)
// grad[#00] = scale(grad(#10), #09) + scale(scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N))), 2.0)
// grad[#00] = scale(grad(#10), #09) + scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(1,#08) * (1/N)))
// grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N))
// grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N))
// grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,mean_eps*rms) * (-1/N))
// grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*mean_eps))
// grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*(sum_xx/N+eps)))
// grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*sum_xx+rms*N*eps))
// grad[#00] = scale(dz, rrms) + scale(x, sum(mul(dz,x)) * div(-1,rms*N*mean_eps))
// grad[#00] = scale(dz, rrms) + scale(x, sum_xdz * div(-1,rms*N*mean_eps))
// a = b*c + d*e
// a = b*c*f/f + d*e*f/f
// a = (b*c*f + d*e*f)*(1/f)
// a = (b*c*(1/c) + d*e*(1/c))*(1/(1/c))
// a = (b + d*e/c)*c
// b = dz, c = rrms, d = x, e = sum_xdz * div(-1,rms*N*mean_eps)
// a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)/rrms)*rrms
// a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)*rms)*rrms
// a = (dz + x*sum_xdz * div(-rms,rms*N*mean_eps))*rrms
// a = (dz + x*sum_xdz * div(-1,N*mean_eps))*rrms
// a = (dz + x*div(-sum_xdz,N*mean_eps))*rrms
// a = (dz + x*div(-mean_xdz,mean_eps))*rrms
// grad[#00] = scale(dz + scale(x, div(-mean_xdz,mean_eps)),rrms)
// grad[#00] = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
// dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
}
// dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
// post-order:
// dx := x
// dx := scale(dx,-mean_xdz/mean_eps)
// dx := add(dx, dz)
// dx := scale(dx, rrms)
float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
ggml_vec_cpy_f32 (ne00, dx, x);
// ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps);
ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps);
ggml_vec_acc_f32 (ne00, dx, dz);
ggml_vec_scale_f32(ne00, dx, rrms);
}
}
}
}
static void ggml_compute_forward_rms_norm_back(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_rms_norm_back_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_group_norm
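// NOTE: group normalization splits the channel dimension (ne02) into dst->op_params[0] groups and
// normalizes each group with the mean/variance taken over all ne00*ne01 elements of every channel
// in the group (per sample i03), using the fixed eps below.
// e.g. n_channels = 10, n_groups = 4 -> n_channels_per_group = (10 + 3)/4 = 3, so the groups cover
// channels [0..2], [3..5], [6..8] and [9] (the last group is clamped to the channel count)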
static void ggml_compute_forward_group_norm_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS
const float eps = 1e-6f; // TODO: make this a parameter
// TODO: optimize
int n_channels = src0->ne[2];
int n_groups = dst->op_params[0];
int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
for (int i = ith; i < n_groups; i += nth) {
int start = i * n_channels_per_group;
int end = start + n_channels_per_group;
if (end > n_channels) {
end = n_channels;
}
int step = end - start;
for (int64_t i03 = 0; i03 < ne03; i03++) {
ggml_float sum = 0.0;
for (int64_t i02 = start; i02 < end; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) {
const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
ggml_float sumr = 0.0;
for (int64_t i00 = 0; i00 < ne00; i00++) {
sumr += (ggml_float)x[i00];
}
sum += sumr;
}
}
const float mean = sum / (ne00 * ne01 * step);
ggml_float sum2 = 0.0;
for (int64_t i02 = start; i02 < end; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) {
const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
ggml_float sumr = 0.0;
for (int64_t i00 = 0; i00 < ne00; i00++) {
float v = x[i00] - mean;
y[i00] = v;
sumr += (ggml_float)(v * v);
}
sum2 += sumr;
}
}
const float variance = sum2 / (ne00 * ne01 * step);
const float scale = 1.0f / sqrtf(variance + eps);
for (int64_t i02 = start; i02 < end; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) {
float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
ggml_vec_scale_f32(ne00, y, scale);
}
}
}
}
}
static void ggml_compute_forward_group_norm(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_group_norm_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_mul_mat
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
// helper function to determine if it is better to use BLAS or not
// for large matrices, BLAS is faster
static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
//const int64_t ne00 = src0->ne[0];
//const int64_t ne01 = src0->ne[1];
const int64_t ne10 = src1->ne[0];
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
// NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
// all the experts for each batch element and the processing would become incredibly slow
// TODO: find the optimal values for these
if (dst->op != GGML_OP_MUL_MAT_ID &&
ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
//src0->type == GGML_TYPE_F32 &&
src1->type == GGML_TYPE_F32 &&
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
/*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
return true;
}
return false;
}
#endif
static void ggml_compute_forward_mul_mat(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith;
const int nth = params->nth;
const enum ggml_type type = src0->type;
const bool src1_cont = ggml_is_contiguous(src1);
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
int64_t const vec_dot_num_rows = type_traits[type].nrows;
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type));
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
// broadcast factors
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
#if defined(GGML_USE_CLBLAST)
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
}
return;
}
#endif
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
const int64_t ne_plane = ne01*ne00;
const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
UNUSED(desired_wsize);
if (params->type == GGML_TASK_TYPE_INIT) {
if (type != GGML_TYPE_F32) {
assert(params->wsize >= desired_wsize);
// parallelize by src0 rows
for (int64_t i13 = 0; i13 < ne13; i13++) {
for (int64_t i12 = 0; i12 < ne12; i12++) {
// broadcast src0 into src1 across 2nd,3rd dimension
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
ggml_to_float_t const to_float = type_traits[type].to_float;
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
}
}
}
}
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
// perform sgemm, parallelization controlled by blas lib
if (ith != 0) {
return;
}
//const int64_t tgemm0 = ggml_perf_time_us();
for (int64_t i13 = 0; i13 < ne13; i13++) {
for (int64_t i12 = 0; i12 < ne12; i12++) {
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
if (type != GGML_TYPE_F32) {
x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
}
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne1, ne01, ne10,
1.0f, y, ne10,
x, ne00,
0.0f, d, ne01);
}
}
//printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
return;
}
#endif
if (params->type == GGML_TASK_TYPE_INIT) {
if (ith != 0) {
return;
}
if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
assert(params->wsize >= ne11*ne12*ne13*row_size);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
wdata += row_size;
}
}
}
}
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = ne1*ne12*ne13; // src1 rows
//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
// distribute the thread work across the inner or outer loop based on which one is larger
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
const int64_t ith0 = ith % nth0;
const int64_t ith1 = ith / nth0;
const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
const int64_t ir010 = dr0*ith0;
const int64_t ir011 = MIN(ir010 + dr0, nr0);
const int64_t ir110 = dr1*ith1;
const int64_t ir111 = MIN(ir110 + dr1, nr1);
//printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
// threads with no work simply yield (not sure if it helps)
if (ir010 >= ir011 || ir110 >= ir111) {
sched_yield();
return;
}
assert(ne12 % ne02 == 0);
assert(ne13 % ne03 == 0);
// block-tiling attempt
const int64_t blck_0 = 16;
const int64_t blck_1 = 16;
// dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
int64_t nrc = vec_dot_num_rows;
// TODO: currently the mmla kernels support only even numbered rows/cols.
// this check can be removed once they are extended to support odd numbered rows/cols too
if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
nrc = 1;
}
const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
// attempt to reduce false-sharing (does not seem to make a difference)
// 16 * 2, accounting for mmla kernels
float tmp[32];
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
const int64_t i13 = (ir1/(ne12*ne1));
const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
// broadcast src0 into src1
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
const int64_t i1 = i11;
const int64_t i2 = i12;
const int64_t i3 = i13;
const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
// the original src1 data pointer, so we should index using the indices directly
// TODO: this is a bit of a hack, we should probably have a better way to handle this
const char * src1_col = (const char *) wdata +
(src1_cont || src1->type != vec_dot_type
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
: (i11*nb11 + i12*nb12 + i13*nb13));
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
//}
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
}
for (int cn = 0; cn < nrc; ++cn) {
memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
}
}
}
}
}
// ggml_compute_forward_mul_mat_id
static void ggml_compute_forward_mul_mat_id(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * ids = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith;
const int nth = params->nth;
const enum ggml_type type = src0->type;
const bool src1_cont = ggml_is_contiguous(src1);
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type));
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
// broadcast factors
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;
// row groups
const int id = ggml_get_op_params_i32(dst, 0);
const int n_as = ggml_get_op_params_i32(dst, 1);
char * wdata_src1_end = (src1->type == vec_dot_type) ?
(char *) params->wdata :
(char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11]
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
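// NOTE: during INIT the src1 rows are grouped by the expert (src0 matrix) they route to:
// matrix_row_counts[a] is the number of rows assigned to expert a and MMID_MATRIX_ROW(a, k) is the
// index of the k-th such row. e.g. (illustrative) with n_as = 4 and ids selecting experts [2, 0, 2]
// for rows 0..2: matrix_row_counts = {1, 0, 2, 0}, MMID_MATRIX_ROW(0, 0) = 1,
// MMID_MATRIX_ROW(2, 0) = 0, MMID_MATRIX_ROW(2, 1) = 2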
if (params->type == GGML_TASK_TYPE_INIT) {
if (ith != 0) {
return;
}
char * wdata = params->wdata;
if (src1->type != vec_dot_type) {
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
assert(params->wsize >= ne11*ne12*ne13*row_size);
assert(src1->type == GGML_TYPE_F32);
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
wdata += row_size;
}
}
}
}
// initialize matrix_row_counts
GGML_ASSERT(wdata == wdata_src1_end);
memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
// group rows by src0 matrix
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(row_id >= 0 && row_id < n_as);
MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
matrix_row_counts[row_id] += 1;
}
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
// compute each matrix multiplication in sequence
for (int cur_a = 0; cur_a < n_as; ++cur_a) {
const int64_t cne1 = matrix_row_counts[cur_a];
if (cne1 == 0) {
continue;
}
const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = cne1*ne12*ne13; // src1 rows
//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
// distribute the thread work across the inner or outer loop based on which one is larger
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
const int64_t ith0 = ith % nth0;
const int64_t ith1 = ith / nth0;
const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
const int64_t ir010 = dr0*ith0;
const int64_t ir011 = MIN(ir010 + dr0, nr0);
const int64_t ir110 = dr1*ith1;
const int64_t ir111 = MIN(ir110 + dr1, nr1);
//printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
// threads with no work simply yield (not sure if it helps)
if (ir010 >= ir011 || ir110 >= ir111) {
sched_yield();
continue;
}
assert(ne12 % ne02 == 0);
assert(ne13 % ne03 == 0);
// block-tiling attempt
const int64_t blck_0 = 16;
const int64_t blck_1 = 16;
// attempt to reduce false-sharing (does not seem to make a difference)
float tmp[16];
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
// broadcast src0 into src1
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
const int64_t i1 = i11;
const int64_t i2 = i12;
const int64_t i3 = i13;
const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
// the original src1 data pointer, so we should index using the indices directly
// TODO: this is a bit of a hack, we should probably have a better way to handle this
const char * src1_col = (const char *) wdata +
(src1_cont || src1->type != vec_dot_type
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
: (i11*nb11 + i12*nb12 + i13*nb13));
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
//}
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
}
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
}
}
}
}
#undef MMID_MATRIX_ROW
}
// ggml_compute_forward_out_prod
static void ggml_compute_forward_out_prod_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
// int64_t t0 = ggml_perf_time_us();
// UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith;
const int nth = params->nth;
GGML_ASSERT(ne0 == ne00);
GGML_ASSERT(ne1 == ne10);
GGML_ASSERT(ne2 == ne02);
GGML_ASSERT(ne02 == ne12);
GGML_ASSERT(ne3 == ne13);
GGML_ASSERT(ne03 == ne13);
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == sizeof(float));
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
// GGML_ASSERT(nb0 <= nb1);
// GGML_ASSERT(nb1 <= nb2);
// GGML_ASSERT(nb2 <= nb3);
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
// TODO: #if defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
bool use_blas = ggml_is_matrix(src0) &&
ggml_is_matrix(src1) &&
ggml_is_contiguous(src0) &&
(ggml_is_contiguous(src1) || ggml_is_transposed(src1));
#endif
if (params->type == GGML_TASK_TYPE_INIT) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
if (use_blas) {
return;
}
#endif
if (ith != 0) {
return;
}
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (use_blas) {
if (params->ith != 0) { // All threads other than the first do no work.
return;
}
// Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
// src0: (k,n)
// src1: (k,m)
// dst: (m,n)
//
// Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
// Also expressed as (major,minor)
// a: (m,k): so src1 transposed
// b: (k,n): so src0
// c: (m,n)
//
// However, if ggml_is_transposed(src1) is true, then
// src1->data already contains a transposed version, so sgemm mustn't
// transpose it further.
int n = src0->ne[0];
int k = src0->ne[1];
int m = src1->ne[0];
int transposeA, lda;
if (!ggml_is_transposed(src1)) {
transposeA = CblasTrans;
lda = m;
} else {
transposeA = CblasNoTrans;
lda = k;
}
float * a = (float *) ((char *) src1->data);
float * b = (float *) ((char *) src0->data);
float * c = (float *) ((char *) dst->data);
cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
return;
}
#endif
// dst[:,:,:,:] = 0
// for i2,i3:
// for i1:
// for i01:
// for i0:
// dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3]
// parallelize by last three dimensions
// total rows in dst
const int64_t nr = ne1*ne2*ne3;
// rows per thread
const int64_t dr = (nr + nth - 1)/nth;
// row range for this thread
const int64_t ir0 = dr*ith;
const int64_t ir1 = MIN(ir0 + dr, nr);
// block-tiling attempt
const int64_t blck_0 = MAX(GGML_VEC_MAD_UNROLL, 32);
const int64_t blck_1 = 16;
for (int64_t bir = ir0; bir < ir1; bir += blck_1) {
const int64_t bir1 = MIN(bir + blck_1, ir1);
for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) {
const int64_t bne01 = MIN(bi01 + blck_0, ne01);
for (int64_t ir = bir; ir < bir1; ++ir) {
// dst indices
const int64_t i3 = ir/(ne2*ne1);
const int64_t i2 = (ir - i3*ne2*ne1)/ne1;
const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1);
const int64_t i02 = i2;
const int64_t i03 = i3;
//const int64_t i10 = i1;
const int64_t i12 = i2;
const int64_t i13 = i3;
#if GGML_VEC_MAD_UNROLL > 2
const int64_t bne01_unroll = bne01 - (bne01 % GGML_VEC_MAD_UNROLL);
for (int64_t i01 = bi01; i01 < bne01_unroll; i01 += GGML_VEC_MAD_UNROLL) {
const int64_t i11 = i01;
float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03));
float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1);
}
for (int64_t i01 = bne01_unroll; i01 < bne01; ++i01) {
const int64_t i11 = i01;
float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03));
float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
ggml_vec_mad_f32(ne0, d, s0, *s1);
}
#else
for (int64_t i01 = bi01; i01 < bne01; ++i01) {
const int64_t i11 = i01;
float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03));
float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
ggml_vec_mad_f32(ne0, d, s0, *s1);
}
#endif
}
}
}
//int64_t t1 = ggml_perf_time_us();
//static int64_t acc = 0;
//acc += t1 - t0;
//if (t1 - t0 > 10) {
// printf("\n");
// printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
// printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
// printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
// printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13);
// printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
//}
}
static void ggml_compute_forward_out_prod_q_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
// int64_t t0 = ggml_perf_time_us();
// UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS;
const int ith = params->ith;
const int nth = params->nth;
const enum ggml_type type = src0->type;
ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
GGML_ASSERT(ne02 == ne12);
GGML_ASSERT(ne03 == ne13);
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);
// we don't support permuted src0 dim0
GGML_ASSERT(nb00 == ggml_type_size(type));
// dst dim0 cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
// GGML_ASSERT(nb0 <= nb1);
// GGML_ASSERT(nb1 <= nb2);
// GGML_ASSERT(nb2 <= nb3);
GGML_ASSERT(ne0 == ne00);
GGML_ASSERT(ne1 == ne10);
GGML_ASSERT(ne2 == ne02);
GGML_ASSERT(ne3 == ne03);
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
if (params->type == GGML_TASK_TYPE_INIT) {
if (ith != 0) {
return;
}
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
// parallelize by last three dimensions
// total rows in dst
const int64_t nr = ne1*ne2*ne3;
// rows per thread
const int64_t dr = (nr + nth - 1)/nth;
// row range for this thread
const int64_t ir0 = dr*ith;
const int64_t ir1 = MIN(ir0 + dr, nr);
// dst[:,:,:,:] = 0
// for i2,i3:
// for i1:
// for i01:
// for i0:
// dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3]
float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith;
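// NOTE: wdata above is a per-thread scratch row used to dequantize src0 rows; the extra
// CACHE_LINE_SIZE_F32 floats between slices presumably pad each slice to reduce false sharing
// between threads writing adjacent scratch rows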
for (int64_t ir = ir0; ir < ir1; ++ir) {
// dst indices
const int64_t i3 = ir/(ne2*ne1);
const int64_t i2 = (ir - i3*ne2*ne1)/ne1;
const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1);
const int64_t i02 = i2;
const int64_t i03 = i3;
//const int64_t i10 = i1;
const int64_t i12 = i2;
const int64_t i13 = i3;
for (int64_t i01 = 0; i01 < ne01; ++i01) {
const int64_t i11 = i01;
float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03));
float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
dequantize_row_q(s0, wdata, ne0);
ggml_vec_mad_f32(ne0, d, wdata, *s1);
}
}
//int64_t t1 = ggml_perf_time_us();
//static int64_t acc = 0;
//acc += t1 - t0;
//if (t1 - t0 > 10) {
// printf("\n");
// printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
// printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
// printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
// printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13);
// printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
//}
}
static void ggml_compute_forward_out_prod(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
{
ggml_compute_forward_out_prod_q_f32(params, dst);
} break;
case GGML_TYPE_F16:
{
GGML_ASSERT(false); // todo
// ggml_compute_forward_out_prod_f16_f32(params, dst);
} break;
case GGML_TYPE_F32:
{
ggml_compute_forward_out_prod_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_scale
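// NOTE: dst = v * src0, with the scalar v read from dst->op_params; rows are split across threads
// and, when dst is not an in-place view of src0, each row is copied first and then scaled in place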
static void ggml_compute_forward_scale_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
// scale factor
float v;
memcpy(&v, dst->op_params, sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
const int nc = src0->ne[0];
const int nr = ggml_nrows(src0);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
const size_t nb01 = src0->nb[1];
const size_t nb1 = dst->nb[1];
for (int i1 = ir0; i1 < ir1; i1++) {
if (dst->data != src0->data) {
// src0 is same shape as dst => same indices
memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
}
ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v);
}
}
static void ggml_compute_forward_scale(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_scale_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_set
static void ggml_compute_forward_set_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
// view src0 and dst with these strides and data offset inbytes during set
// nb0 is implicitly element_size because src0 and dst are contiguous
size_t nb1 = ((int32_t *) dst->op_params)[0];
size_t nb2 = ((int32_t *) dst->op_params)[1];
size_t nb3 = ((int32_t *) dst->op_params)[2];
size_t offset = ((int32_t *) dst->op_params)[3];
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
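// NOTE: conceptually, dst starts out as a copy of src0 (done once in the INIT phase unless inplace)
// and src1 is then written into the region described by (nb1, nb2, nb3, offset), i.e. dst viewed with
// src1's shape starting at byte offset `offset`. For example (illustrative), a single-row src1 with
// offset = k*nb1 overwrites row k of that view.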
if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
if (params->ith != 0) {
return;
}
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
memcpy(
((char *) dst->data),
((char *) src0->data),
ggml_nbytes(dst));
}
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(src1);
const int nc = src1->ne[0];
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
// src0 and dst as viewed during set
const size_t nb0 = ggml_element_size(src0);
const int im0 = (ne10 == 0 ? 0 : ne10-1);
const int im1 = (ne11 == 0 ? 0 : ne11-1);
const int im2 = (ne12 == 0 ? 0 : ne12-1);
const int im3 = (ne13 == 0 ? 0 : ne13-1);
GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst));
GGML_ASSERT(nb10 == sizeof(float));
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int ir = ir0; ir < ir1; ++ir) {
// src0 and dst are viewed with shape of src1 and offset
// => same indices
const int i3 = ir/(ne12*ne11);
const int i2 = (ir - i3*ne12*ne11)/ne11;
const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
ggml_vec_cpy_f32(nc,
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset),
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
}
}
static void ggml_compute_forward_set(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_set_f32(params, dst);
} break;
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_cpy
static void ggml_compute_forward_cpy(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
ggml_compute_forward_dup(params, dst);
}
// ggml_compute_forward_cont
static void ggml_compute_forward_cont(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
ggml_compute_forward_dup(params, dst);
}
// ggml_compute_forward_reshape
static void ggml_compute_forward_reshape(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
// NOP
UNUSED(params);
UNUSED(dst);
}
// ggml_compute_forward_view
static void ggml_compute_forward_view(
const struct ggml_compute_params * params,
const struct ggml_tensor * dst) {
// NOP
UNUSED(params);
UNUSED(dst);
}
// ggml_compute_forward_permute
static void ggml_compute_forward_permute(
const struct ggml_compute_params * params,
const struct ggml_tensor * dst) {
// NOP
UNUSED(params);
UNUSED(dst);
2022-09-25 18:23:15 +00:00
}
// ggml_compute_forward_transpose
static void ggml_compute_forward_transpose(
const struct ggml_compute_params * params,
const struct ggml_tensor * dst) {
// NOP
UNUSED(params);
UNUSED(dst);
}
// ggml_compute_forward_get_rows
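// NOTE: src1 holds int32 row indices; for every index i the selected row of src0 is copied
// (dequantized to f32 when src0 is quantized or f16) into row i of dst.
// e.g. src1 = [3, 3, 0] produces a 3-row dst: src0 row 3, row 3 again, then row 0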
static void ggml_compute_forward_get_rows_q(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_TENSOR_BINARY_OP_LOCALS
const int64_t nc = ne00;
const int64_t nr = ggml_nelements(src1);
const enum ggml_type type = src0->type;
ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
assert(ne0 == nc);
assert(ne02 == ne11);
assert(nb00 == ggml_type_size(type));
assert(ggml_nrows(dst) == nr);
const int ith = params->ith;
const int nth = params->nth;
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int64_t i = ir0; i < ir1; ++i) {
const int64_t i12 = i/(ne11*ne10);
const int64_t i11 = (i - i12*ne11*ne10)/ne10;
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
dequantize_row_q(
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
}
}
static void ggml_compute_forward_get_rows_f16(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_TENSOR_BINARY_OP_LOCALS
const int64_t nc = ne00;
const int64_t nr = ggml_nelements(src1);
assert(ne0 == nc);
assert(ne02 == ne11);
assert(nb00 == sizeof(ggml_fp16_t));
assert(ggml_nrows(dst) == nr);
const int ith = params->ith;
const int nth = params->nth;
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int64_t i = ir0; i < ir1; ++i) {
const int64_t i12 = i/(ne11*ne10);
const int64_t i11 = (i - i12*ne11*ne10)/ne10;
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
ggml_fp16_to_fp32_row(
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
}
}
static void ggml_compute_forward_get_rows_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_TENSOR_BINARY_OP_LOCALS
const int64_t nc = ne00;
const int64_t nr = ggml_nelements(src1);
assert(ne0 == nc);
assert(ne02 == ne11);
assert(nb00 == sizeof(float));
assert(ggml_nrows(dst) == nr);
const int ith = params->ith;
const int nth = params->nth;
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int64_t i = ir0; i < ir1; ++i) {
const int64_t i12 = i/(ne11*ne10);
const int64_t i11 = (i - i12*ne11*ne10)/ne10;
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
ggml_vec_cpy_f32(nc,
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
(float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
}
}
static void ggml_compute_forward_get_rows(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
{
ggml_compute_forward_get_rows_q(params, dst);
} break;
case GGML_TYPE_F16:
{
ggml_compute_forward_get_rows_f16(params, dst);
} break;
case GGML_TYPE_F32:
case GGML_TYPE_I32:
{
ggml_compute_forward_get_rows_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
//static bool first = true;
//printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]);
//if (first) {
// first = false;
//} else {
// for (int k = 0; k < dst->ne[1]; ++k) {
// for (int j = 0; j < dst->ne[0]/16; ++j) {
// for (int i = 0; i < 16; ++i) {
// printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]);
// }
// printf("\n");
// }
// printf("\n");
// }
// printf("\n");
// exit(0);
//}
}
// ggml_compute_forward_get_rows_back
static void ggml_compute_forward_get_rows_back_f32_f16(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(params->ith == 0);
GGML_ASSERT(ggml_is_contiguous(dst));
// ggml_compute_forward_dup_same_cont(params, opt0, dst);
if (params->type == GGML_TASK_TYPE_INIT) {
if (params->ith != 0) {
return;
}
memset(dst->data, 0, ggml_nbytes(dst));
}
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int nc = src0->ne[0];
const int nr = ggml_nelements(src1);
GGML_ASSERT( dst->ne[0] == nc);
GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
for (int i = 0; i < nr; ++i) {
const int r = ((int32_t *) src1->data)[i];
for (int j = 0; j < nc; ++j) {
ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j];
((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v);
}
}
}
static void ggml_compute_forward_get_rows_back_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(params->ith == 0);
GGML_ASSERT(ggml_is_contiguous(dst));
// ggml_compute_forward_dup_same_cont(params, opt0, dst);
if (params->type == GGML_TASK_TYPE_INIT) {
if (params->ith != 0) {
return;
}
memset(dst->data, 0, ggml_nbytes(dst));
}
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int nc = src0->ne[0];
const int nr = ggml_nelements(src1);
GGML_ASSERT( dst->ne[0] == nc);
GGML_ASSERT(src0->nb[0] == sizeof(float));
for (int i = 0; i < nr; ++i) {
const int r = ((int32_t *) src1->data)[i];
ggml_vec_add_f32(nc,
(float *) ((char *) dst->data + r*dst->nb[1]),
(float *) ((char *) dst->data + r*dst->nb[1]),
(float *) ((char *) src0->data + i*src0->nb[1]));
}
}
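// Illustrative note (not part of the kernel): get_rows_back is the gradient of
// get_rows, i.e. a scatter-add. For example, with src1 = {2, 0, 2} and rows
// r0, r1, r2 in src0, dst receives:
//   dst[0] += r1
//   dst[2] += r0 + r2   (index 2 appears twice, so its rows accumulate)
// dst is zeroed in the INIT phase above, which is what makes repeated indices sum correctly.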
static void ggml_compute_forward_get_rows_back(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F16:
{
ggml_compute_forward_get_rows_back_f32_f16(params, dst);
} break;
case GGML_TYPE_F32:
{
ggml_compute_forward_get_rows_back_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
//static bool first = true;
//printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]);
//if (first) {
// first = false;
//} else {
// for (int k = 0; k < dst->ne[1]; ++k) {
// for (int j = 0; j < dst->ne[0]/16; ++j) {
// for (int i = 0; i < 16; ++i) {
// printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]);
// }
// printf("\n");
// }
// printf("\n");
// }
// printf("\n");
// exit(0);
//}
}
// ggml_compute_forward_diag
static void ggml_compute_forward_diag_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
// TODO: handle transposed/permuted matrices
GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT(ne00 == ne0);
GGML_ASSERT(ne00 == ne1);
GGML_ASSERT(ne01 == 1);
GGML_ASSERT(ne02 == ne2);
GGML_ASSERT(ne03 == ne3);
GGML_ASSERT(nb00 == sizeof(float));
GGML_ASSERT(nb0 == sizeof(float));
for (int i3 = 0; i3 < ne3; i3++) {
for (int i2 = 0; i2 < ne2; i2++) {
for (int i1 = 0; i1 < ne1; i1++) {
float * d = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
float * s = (float *)((char *) src0->data + i3*nb03 + i2*nb02);
for (int i0 = 0; i0 < i1; i0++) {
d[i0] = 0;
}
d[i1] = s[i1];
for (int i0 = i1+1; i0 < ne0; i0++) {
d[i0] = 0;
}
}
}
}
}
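// Illustrative example (assumes ne01 == 1, as asserted above): for a src0 row
// {a, b, c}, the result is the 3x3 matrix diag(a, b, c):
//   a 0 0
//   0 b 0
//   0 0 c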
static void ggml_compute_forward_diag(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_diag_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_diag_mask_inf
static void ggml_compute_forward_diag_mask_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst,
const float value) {
const struct ggml_tensor * src0 = dst->src[0];
const int ith = params->ith;
const int nth = params->nth;
const int n_past = ((int32_t *) dst->op_params)[0];
const bool inplace = src0->data == dst->data;
GGML_ASSERT(n_past >= 0);
if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) {
if (ith != 0) {
return;
}
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
memcpy(
((char *) dst->data),
((char *) src0->data),
ggml_nbytes(dst));
}
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
// TODO: handle transposed/permuted matrices
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const int nr = src0->ne[1];
const int nz = n/nr;
GGML_ASSERT( dst->nb[0] == sizeof(float));
GGML_ASSERT(src0->nb[0] == sizeof(float));
for (int k = 0; k < nz; k++) {
for (int j = ith; j < nr; j += nth) {
for (int i = n_past; i < nc; i++) {
if (i > n_past + j) {
*(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value;
}
}
}
}
}
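// Illustrative example: with n_past = 0 and value = -INF, a 3x3 input becomes a
// causal mask (entries with column i > n_past + row j are overwritten):
//   x  -inf -inf
//   x   x   -inf
//   x   x    x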
static void ggml_compute_forward_diag_mask_inf(
2022-09-25 18:23:15 +00:00
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
2022-09-25 18:23:15 +00:00
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_diag_mask_f32(params, dst, -INFINITY);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
static void ggml_compute_forward_diag_mask_zero(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_diag_mask_f32(params, dst, 0);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_soft_max
static void ggml_compute_forward_soft_max_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
const struct ggml_tensor * src2 = dst->src[2];
assert(ggml_is_contiguous(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
// TODO: handle transposed/permuted matrices
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS
const int64_t ne11 = src1 ? src1->ne[1] : 1;
// TODO: is this supposed to be ceil instead of floor?
// https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
const uint32_t n_head_kv = ne02;
const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
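// Illustrative slope values (ALiBi): for n_head_kv = 8 and max_bias = 8.0f,
// n_head_log2 = 8 and m0 = 2^(-8/8) = 0.5, so head h gets slope 0.5^(h+1),
// i.e. 1/2, 1/4, ..., 1/256 - the geometric sequence from the ALiBi paper.
// m1 only comes into play when the head count is not a power of two.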
const int nc = src0->ne[0];
const int nr = ggml_nrows(src0);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
// when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
float * pos = src2 ? (float *) src2->data : src0->data;
for (int i1 = ir0; i1 < ir1; i1++) {
float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
// broadcast the mask across rows
float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
ggml_vec_cpy_f32 (nc, wp, sp);
ggml_vec_scale_f32(nc, wp, scale);
if (mp) {
ggml_vec_acc_f32(nc, wp, mp);
}
// ALiBi bias
if (max_bias > 0.0f) {
const uint32_t h = (i1/ne01)%ne02; // head
const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
for (int i = 0; i < nc; i++) {
wp[i] = wp[i] + slope*pos[i];
}
}
#ifndef NDEBUG
for (int i = 0; i < nc; ++i) {
//printf("p[%d] = %f\n", i, p[i]);
assert(!isnan(wp[i]));
}
#endif
float max = -INFINITY;
ggml_vec_max_f32(nc, &max, wp);
ggml_float sum = 0.0;
uint16_t scvt;
for (int i = 0; i < nc; i++) {
if (wp[i] == -INFINITY) {
dp[i] = 0.0f;
} else {
// const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
memcpy(&scvt, &s, sizeof(scvt));
const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
sum += (ggml_float)val;
dp[i] = val;
}
}
assert(sum > 0.0);
sum = 1.0/sum;
ggml_vec_scale_f32(nc, dp, sum);
#ifndef NDEBUG
for (int i = 0; i < nc; ++i) {
assert(!isnan(dp[i]));
assert(!isinf(dp[i]));
}
#endif
}
}
static void ggml_compute_forward_soft_max(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_soft_max_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_soft_max_back
static void ggml_compute_forward_soft_max_back_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_are_same_shape(src1, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
// TODO: handle transposed/permuted matrices
const int ith = params->ith;
const int nth = params->nth;
const int nc = src0->ne[0];
const int nr = ggml_nrows(src0);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int i1 = ir0; i1 < ir1; i1++) {
float *dy = (float *)((char *) src0->data + i1*src0->nb[1]);
float *y = (float *)((char *) src1->data + i1*src1->nb[1]);
float *dx = (float *)((char *) dst->data + i1*dst->nb[1]);
#ifndef NDEBUG
for (int i = 0; i < nc; ++i) {
//printf("p[%d] = %f\n", i, p[i]);
assert(!isnan(dy[i]));
assert(!isnan(y[i]));
}
#endif
// Jii = yi - yi*yi
// Jij = -yi*yj
// J = diag(y)-y.T*y
// dx = J * dy
// dxk = sum_i(Jki * dyi)
// dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
// dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
// dxk = sum_i(-yk*yi * dyi) + yk*dyk
// dxk = -yk * sum_i(yi * dyi) + yk*dyk
// dxk = -yk * dot(y, dy) + yk*dyk
// dxk = yk * (- dot(y, dy) + dyk)
// dxk = yk * (dyk - dot(y, dy))
//
// post-order:
// dot_y_dy := dot(y, dy)
// dx := dy
// dx := dx - dot_y_dy
// dx := dx * y
// linear runtime, no additional memory
float dot_y_dy = 0;
ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
ggml_vec_cpy_f32 (nc, dx, dy);
ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
ggml_vec_mul_f32 (nc, dx, dx, y);
#ifndef NDEBUG
for (int i = 0; i < nc; ++i) {
assert(!isnan(dx[i]));
assert(!isinf(dx[i]));
}
#endif
}
}
static void ggml_compute_forward_soft_max_back(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_soft_max_back_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_alibi
static void ggml_compute_forward_alibi_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
const int64_t ne1 = src0->ne[1]; // seq_len_without_past
const int64_t ne2 = src0->ne[2]; // n_head -> this is k
//const int64_t ne3 = src0->ne[3]; // 1 -> bsz
const int64_t n = ggml_nrows(src0);
const int64_t ne2_ne3 = n/ne1; // ne2*ne3
const size_t nb0 = src0->nb[0];
const size_t nb1 = src0->nb[1];
const size_t nb2 = src0->nb[2];
//const int nb3 = src0->nb[3];
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(n_head == ne2);
// add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
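// Illustrative example for a non-power-of-two head count: with n_head = 12 and
// max_bias = 8.0f, n_heads_log2_floor = 8, m0 = 2^(-1) = 0.5 and m1 = 2^(-0.5) ~= 0.707.
// Heads 0..7 get slopes 0.5^(k+1); heads 8..11 get 0.707^(2*(k-8)+1), i.e. the odd
// powers ~0.707, ~0.354, ~0.177, ~0.088 interleaved between the first set.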
for (int64_t k = 0; k < ne2_ne3; k++) {
// TODO: k*nb2 or k*nb3
float m_k;
if (k < n_heads_log2_floor) {
m_k = powf(m0, k + 1);
} else {
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
for (int64_t i = 0; i < ne0; i++) {
for (int64_t j = 0; j < ne1; j++) {
float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
pdst[0] = i * m_k + src[0];
}
}
}
}
static void ggml_compute_forward_alibi_f16(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
const int ne1 = src0->ne[1]; // seq_len_without_past
const int ne2 = src0->ne[2]; // n_head -> this is k
//const int ne3 = src0->ne[3]; // 1 -> bsz
const int n = ggml_nrows(src0);
const int ne2_ne3 = n/ne1; // ne2*ne3
const int nb0 = src0->nb[0];
const int nb1 = src0->nb[1];
const int nb2 = src0->nb[2];
//const int nb3 = src0->nb[3];
GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
//GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
GGML_ASSERT(n_head == ne2);
// add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
for (int k = 0; k < ne2_ne3; k++) {
// TODO: k*nb2 or k*nb3
float m_k;
if (k < n_heads_log2_floor) {
m_k = powf(m0, k + 1);
} else {
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
for (int i = 0; i < ne0; i++) {
for (int j = 0; j < ne1; j++) {
ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
// we return F32
pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
}
}
}
}
static void ggml_compute_forward_alibi(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F16:
{
ggml_compute_forward_alibi_f16(params, dst);
} break;
case GGML_TYPE_F32:
{
ggml_compute_forward_alibi_f32(params, dst);
} break;
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q8_K:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_I64:
case GGML_TYPE_F64:
case GGML_TYPE_COUNT:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_clamp
static void ggml_compute_forward_clamp_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
float min;
float max;
memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const size_t nb00 = src0->nb[0];
const size_t nb01 = src0->nb[1];
const size_t nb0 = dst->nb[0];
const size_t nb1 = dst->nb[1];
GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float));
for (int j = ith; j < n; j += nth) {
float * dst_ptr = (float *) ((char *) dst->data + j*nb1);
float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
for (int i = 0; i < nc; i++) {
dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
}
}
}
static void ggml_compute_forward_clamp(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_clamp_f32(params, dst);
} break;
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q8_K:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_I64:
case GGML_TYPE_F64:
case GGML_TYPE_COUNT:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_rope
static float rope_yarn_ramp(const float low, const float high, const int i0) {
const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
return 1 - MIN(1, MAX(0, y));
}
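// Example of the ramp (illustrative values): with corr_dims = {20, 46} and i0 = 60,
// y = (30 - 20)/26 ~= 0.38 and the ramp returns ~0.62. Dimensions with i0/2 <= low
// return 1 (theta stays extrapolated in rope_yarn below), dimensions with
// i0/2 >= high return 0 (theta is fully interpolated).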
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
static void rope_yarn(
float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
float * cos_theta, float * sin_theta
) {
// Get n-d rotational scaling corrected for extrapolation
float theta_interp = freq_scale * theta_extrap;
float theta = theta_interp;
if (ext_factor != 0.0f) {
float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
// Get n-d magnitude scaling corrected for interpolation
mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
}
*cos_theta = cosf(theta) * mscale;
*sin_theta = sinf(theta) * mscale;
}
// Solving `n_rot = n_orig_ctx / (2*pi * base^(2*dim / n_dims))` for dim, we get
// `corr_dim(n_rot) = n_dims * log(n_orig_ctx / (n_rot * 2*pi)) / (2 * log(base))`
static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) {
return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
}
static void ggml_rope_cache_init(
float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
float * cache, float sin_sign, float theta_scale
) {
float theta = theta_base;
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
rope_yarn(
theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
);
cache[i0 + 1] *= sin_sign;
theta *= theta_scale;
}
}
GGML_CALL void ggml_rope_yarn_corr_dims(
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
) {
// start and end correction dims
float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base));
float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base));
dims[0] = MAX(0, start);
dims[1] = MIN(n_dims - 1, end);
}
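// Worked example (approximate, illustrative values only): with n_dims = 128,
// n_orig_ctx = 4096, freq_base = 10000, beta_fast = 32 and beta_slow = 1:
//   corr_dim(32) = 128 * ln(4096/(32*2*pi)) / (2*ln(10000)) ~= 20.9
//   corr_dim( 1) = 128 * ln(4096/( 1*2*pi)) / (2*ln(10000)) ~= 45.0
// so after the floor/ceil and clamping above, dims = {20, 46}.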
static void ggml_compute_forward_rope_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst,
const bool forward) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
// these two only relevant for xPos RoPE:
float xpos_base;
bool xpos_down;
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
const int n_ctx = ((int32_t *) dst->op_params)[3];
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
memcpy(&xpos_base, (int32_t *) dst->op_params + 11, sizeof(float));
memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool));
GGML_TENSOR_UNARY_OP_LOCALS
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
//printf("n_past = %d, ne2 = %d\n", n_past, ne2);
GGML_ASSERT(nb00 == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(dst);
GGML_ASSERT(n_dims <= ne0);
GGML_ASSERT(n_dims % 2 == 0);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
// row index used to determine which thread to use
int ir = 0;
const float theta_scale = powf(freq_base, -2.0f/n_dims);
const float inv_ndims = -1.f/n_dims;
float corr_dims[2];
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
const bool is_neox = mode & 2;
const bool is_glm = mode & 4;
// backward process uses inverse rotation by cos and sin.
// cos and sin build a rotation matrix, where the inverse is the transpose.
// this essentially just switches the sign of sin.
const float sin_sign = forward ? 1.0f : -1.0f;
const int32_t * pos = (const int32_t *) src1->data;
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {
const int64_t p = pos[i2];
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
}
for (int64_t i1 = 0; i1 < ne1; i1++) {
if (ir++ < ir0) continue;
if (ir > ir1) break;
float theta_base = (float)p;
if (is_glm) {
theta_base = MIN(p, n_ctx - 2);
float block_theta = MAX(p - (n_ctx - 2), 0);
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
const float cos_theta = cosf(theta_base);
const float sin_theta = sinf(theta_base) * sin_sign;
const float cos_block_theta = cosf(block_theta);
const float sin_block_theta = sinf(block_theta) * sin_sign;
theta_base *= theta_scale;
block_theta *= theta_scale;
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = src[0];
const float x1 = src[n_dims/2];
const float x2 = src[n_dims];
const float x3 = src[n_dims/2*3];
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
}
} else if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1];
// zeta scaling for xPos only:
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
if (xpos_down) zeta = 1.0f / zeta;
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = src[0];
const float x1 = src[1];
dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta;
dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
}
} else {
// TODO: this might be wrong for ne0 != n_dims - need double check
// it seems we have to rope just the first n_dims elements and do nothing with the rest
// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
theta_base *= freq_scale;
for (int64_t ic = 0; ic < ne0; ic += 2) {
if (ic < n_dims) {
const int64_t ib = 0;
// simplified from `(ib * n_dims + ic) * inv_ndims`
float cur_rot = inv_ndims * ic - ib;
float cos_theta, sin_theta;
rope_yarn(
theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
&cos_theta, &sin_theta
);
sin_theta *= sin_sign;
theta_base *= theta_scale;
const int64_t i0 = ib*n_dims + ic/2;
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = src[0];
const float x1 = src[n_dims/2];
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
} else {
const int64_t i0 = ic;
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_data[0] = src[0];
dst_data[1] = src[1];
}
}
}
}
}
}
}
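// Note on the two rotation layouts above (descriptive only): in the default mode the
// rotated pair is two adjacent elements (x[i0], x[i0+1]), while in neox mode the pair
// is (x[i0], x[i0 + n_dims/2]), i.e. the first and second halves of the head are
// rotated against each other. The GLM branch additionally rotates a second block of
// elements with its own angle (block_theta).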
static void ggml_compute_forward_rope_f16(
const struct ggml_compute_params * params,
struct ggml_tensor * dst,
const bool forward) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
const int n_ctx = ((int32_t *) dst->op_params)[3];
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
GGML_TENSOR_UNARY_OP_LOCALS
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
//printf("n_past = %d, ne2 = %d\n", n_past, ne2);
GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(dst);
GGML_ASSERT(n_dims <= ne0);
GGML_ASSERT(n_dims % 2 == 0);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
// row index used to determine which thread to use
int ir = 0;
const float theta_scale = powf(freq_base, -2.0f/n_dims);
const float inv_ndims = -1.f/n_dims;
float corr_dims[2];
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
const bool is_neox = mode & 2;
const bool is_glm = mode & 4;
// backward process uses inverse rotation by cos and sin.
// cos and sin build a rotation matrix, where the inverse is the transpose.
// this essentially just switches the sign of sin.
const float sin_sign = forward ? 1.0f : -1.0f;
const int32_t * pos = (const int32_t *) src1->data;
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {
const int64_t p = pos[i2];
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
}
for (int64_t i1 = 0; i1 < ne1; i1++) {
if (ir++ < ir0) continue;
if (ir > ir1) break;
float theta_base = (float)p;
if (is_glm) {
theta_base = MIN(p, n_ctx - 2);
float block_theta = MAX(p - (n_ctx - 2), 0);
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
const float cos_theta = cosf(theta_base);
const float sin_theta = sinf(theta_base) * sin_sign;
const float cos_block_theta = cosf(block_theta);
const float sin_block_theta = sinf(block_theta) * sin_sign;
theta_base *= theta_scale;
block_theta *= theta_scale;
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = GGML_FP16_TO_FP32(src[0]);
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
}
} else if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cache[i0 + 0];
const float sin_theta = cache[i0 + 1];
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = GGML_FP16_TO_FP32(src[0]);
const float x1 = GGML_FP16_TO_FP32(src[1]);
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
}
} else {
// TODO: this might be wrong for ne0 != n_dims - need double check
// it seems we have to rope just the first n_dims elements and do nothing with the rest
// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
theta_base *= freq_scale;
for (int64_t ic = 0; ic < ne0; ic += 2) {
if (ic < n_dims) {
const int64_t ib = 0;
// simplified from `(ib * n_dims + ic) * inv_ndims`
float cur_rot = inv_ndims * ic - ib;
float cos_theta, sin_theta;
rope_yarn(
theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
&cos_theta, &sin_theta
);
sin_theta *= sin_sign;
theta_base *= theta_scale;
const int64_t i0 = ib*n_dims + ic/2;
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = GGML_FP16_TO_FP32(src[0]);
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
} else {
const int64_t i0 = ic;
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_data[0] = src[0];
dst_data[1] = src[1];
}
}
}
}
}
}
}
static void ggml_compute_forward_rope(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F16:
{
ggml_compute_forward_rope_f16(params, dst, true);
} break;
case GGML_TYPE_F32:
{
ggml_compute_forward_rope_f32(params, dst, true);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_rope_back
static void ggml_compute_forward_rope_back(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F16:
{
ggml_compute_forward_rope_f16(params, dst, false);
} break;
case GGML_TYPE_F32:
{
ggml_compute_forward_rope_f32(params, dst, false);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_conv_transpose_1d
static void ggml_compute_forward_conv_transpose_1d_f16_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith;
const int nth = params->nth;
const int nk = ne00*ne01*ne02;
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_TYPE_INIT) {
if (ith != 0) {
return;
}
memset(params->wdata, 0, params->wsize);
// permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
{
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) {
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
for (int64_t i00 = 0; i00 < ne00; i00++) {
dst_data[i00*ne02 + i02] = src[i00];
}
}
}
}
// permute source data (src1) from (L x Cin) to (Cin x L)
{
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
ggml_fp16_t * dst_data = wdata;
for (int64_t i11 = 0; i11 < ne11; i11++) {
const float * const src = (float *)((char *) src1->data + i11*nb11);
for (int64_t i10 = 0; i10 < ne10; i10++) {
dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
}
}
}
// need to zero dst since we are accumulating into it
memset(dst->data, 0, ggml_nbytes(dst));
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
// total rows in dst
const int nr = ne1;
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
ggml_fp16_t * const wdata_src = wdata + nk;
for (int i1 = ir0; i1 < ir1; i1++) {
float * dst_data = (float *)((char *) dst->data + i1*nb1);
ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00;
for (int i10 = 0; i10 < ne10; i10++) {
const int i1n = i10*ne11;
for (int i00 = 0; i00 < ne00; i00++) {
float v = 0;
ggml_vec_dot_f16(ne02, &v, 0,
(ggml_fp16_t *) wdata_src + i1n, 0,
(ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
dst_data[i10*s0 + i00] += v;
}
}
}
}
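// Descriptive note: for transposed 1D convolution with stride s0, every input
// position i10 contributes to the window of outputs dst[i10*s0 + i00], i00 = 0..ne00-1,
// which is why dst is zeroed in INIT and accumulated into here. E.g. with a kernel of
// length 3 and s0 = 2, input position 1 adds into outputs 2, 3 and 4.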
static void ggml_compute_forward_conv_transpose_1d_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith;
const int nth = params->nth;
const int nk = ne00*ne01*ne02;
GGML_ASSERT(nb00 == sizeof(float));
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_TYPE_INIT) {
if (ith != 0) {
return;
}
memset(params->wdata, 0, params->wsize);
// prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
{
float * const wdata = (float *) params->wdata + 0;
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = 0; i01 < ne01; i01++) {
const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
float * dst_data = wdata + i01*ne00*ne02;
for (int64_t i00 = 0; i00 < ne00; i00++) {
dst_data[i00*ne02 + i02] = src[i00];
}
}
}
}
// permute source data (src1) from (L x Cin) to (Cin x L)
{
float * const wdata = (float *) params->wdata + nk;
float * dst_data = wdata;
for (int64_t i11 = 0; i11 < ne11; i11++) {
const float * const src = (float *)((char *) src1->data + i11*nb11);
for (int64_t i10 = 0; i10 < ne10; i10++) {
dst_data[i10*ne11 + i11] = src[i10];
}
}
}
// need to zero dst since we are accumulating into it
memset(dst->data, 0, ggml_nbytes(dst));
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
// total rows in dst
const int nr = ne1;
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
float * const wdata = (float *) params->wdata + 0;
float * const wdata_src = wdata + nk;
for (int i1 = ir0; i1 < ir1; i1++) {
float * dst_data = (float *)((char *) dst->data + i1*nb1);
float * wdata_kernel = wdata + i1*ne02*ne00;
for (int i10 = 0; i10 < ne10; i10++) {
const int i1n = i10*ne11;
for (int i00 = 0; i00 < ne00; i00++) {
float v = 0;
ggml_vec_dot_f32(ne02, &v, 0,
wdata_src + i1n, 0,
wdata_kernel + i00*ne02, 0, 1);
dst_data[i10*s0 + i00] += v;
}
}
}
}
static void ggml_compute_forward_conv_transpose_1d(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F16:
{
ggml_compute_forward_conv_transpose_1d_f16_f32(params, dst);
} break;
case GGML_TYPE_F32:
{
ggml_compute_forward_conv_transpose_1d_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// src0: kernel [OC, IC, KH, KW]
// src1: image [N, IC, IH, IW]
// dst: result [N, OH, OW, IC*KH*KW]
static void ggml_compute_forward_im2col_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS;
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
const int ith = params->ith;
const int nth = params->nth;
const int64_t N = is_2D ? ne13 : ne12;
const int64_t IC = is_2D ? ne12 : ne11;
const int64_t IH = is_2D ? ne11 : 1;
const int64_t IW = ne10;
const int64_t KH = is_2D ? ne01 : 1;
const int64_t KW = ne00;
const int64_t OH = is_2D ? ne2 : 1;
const int64_t OW = ne1;
int ofs0 = is_2D ? nb13 : nb12;
int ofs1 = is_2D ? nb12 : nb11;
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_TYPE_INIT) {
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
{
float * const wdata = (float *) dst->data;
for (int64_t in = 0; in < N; in++) {
for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
for (int64_t iow = 0; iow < OW; iow++) {
for (int64_t iic = ith; iic < IC; iic += nth) {
// micro kernel
float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
for (int64_t ikw = 0; ikw < KW; ikw++) {
const int64_t iiw = iow*s0 + ikw*d0 - p0;
const int64_t iih = ioh*s1 + ikh*d1 - p1;
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
} else {
dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]);
}
}
}
}
}
}
}
}
}
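// Illustrative shape example (assumes the common 2D case): with IC = 1, IH = IW = 4,
// KH = KW = 3, stride 1, no padding and no dilation, OH = OW = 2 and each of the 4
// output positions stores the 9 pixels of its 3x3 patch, so dst is [N, 2, 2, 9].
// The convolution then reduces to a matrix multiplication of this buffer with the
// kernel reshaped to [OC, IC*KH*KW].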
// src0: kernel [OC, IC, KH, KW]
// src1: image [N, IC, IH, IW]
// dst: result [N, OH, OW, IC*KH*KW]
static void ggml_compute_forward_im2col_f16(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F16);
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS;
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
const int ith = params->ith;
const int nth = params->nth;
const int64_t N = is_2D ? ne13 : ne12;
const int64_t IC = is_2D ? ne12 : ne11;
const int64_t IH = is_2D ? ne11 : 1;
const int64_t IW = ne10;
const int64_t KH = is_2D ? ne01 : 1;
const int64_t KW = ne00;
const int64_t OH = is_2D ? ne2 : 1;
const int64_t OW = ne1;
int ofs0 = is_2D ? nb13 : nb12;
int ofs1 = is_2D ? nb12 : nb11;
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_TYPE_INIT) {
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
{
ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
for (int64_t in = 0; in < N; in++) {
for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
for (int64_t iow = 0; iow < OW; iow++) {
for (int64_t iic = ith; iic < IC; iic += nth) {
// micro kernel
ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
for (int64_t ikw = 0; ikw < KW; ikw++) {
const int64_t iiw = iow*s0 + ikw*d0 - p0;
const int64_t iih = ioh*s1 + ikh*d1 - p1;
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
} else {
dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
}
}
}
}
}
}
}
}
}
static void ggml_compute_forward_im2col(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
switch (dst->type) {
case GGML_TYPE_F16:
{
ggml_compute_forward_im2col_f16(params, dst);
} break;
case GGML_TYPE_F32:
{
ggml_compute_forward_im2col_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_conv_transpose_2d
static void ggml_compute_forward_conv_transpose_2d(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith;
const int nth = params->nth;
const int nk = ne00*ne01*ne02*ne03;
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_TYPE_INIT) {
if (ith != 0) {
return;
}
memset(params->wdata, 0, params->wsize);
// permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
{
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
for (int64_t i01 = 0; i01 < ne01; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
}
}
}
}
}
// permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
{
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
for (int i12 = 0; i12 < ne12; i12++) {
for (int i11 = 0; i11 < ne11; i11++) {
const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
for (int i10 = 0; i10 < ne10; i10++) {
dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]);
}
}
}
}
memset(dst->data, 0, ggml_nbytes(dst));
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int32_t stride = ggml_get_op_params_i32(dst, 0);
// total patches in dst
const int np = ne2;
// patches per thread
const int dp = (np + nth - 1)/nth;
// patch range for this thread
const int ip0 = dp*ith;
const int ip1 = MIN(ip0 + dp, np);
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
ggml_fp16_t * const wdata_src = wdata + nk;
for (int i2 = ip0; i2 < ip1; i2++) { // Cout
float * dst_data = (float *)((char *) dst->data + i2*nb2);
ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
for (int i11 = 0; i11 < ne11; i11++) {
for (int i10 = 0; i10 < ne10; i10++) {
const int i1n = i11*ne10*ne12 + i10*ne12;
for (int i01 = 0; i01 < ne01; i01++) {
for (int i00 = 0; i00 < ne00; i00++) {
float v = 0;
ggml_vec_dot_f16(ne03, &v, 0,
wdata_src + i1n, 0,
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
}
}
}
}
}
}
// ggml_compute_forward_pool_1d_sk_p0
static void ggml_compute_forward_pool_1d_sk_p0(
const struct ggml_compute_params * params,
const enum ggml_op_pool op,
const int k,
struct ggml_tensor * dst) {
const struct ggml_tensor * src = dst->src[0];
assert(src->type == GGML_TYPE_F32);
assert(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const char * cdata = (const char *)src->data;
const char * const data_end = cdata + ggml_nbytes(src);
float * drow = (float *)dst->data;
const int64_t rs = dst->ne[0];
while (cdata < data_end) {
const float * const srow = (const float *)cdata;
int j = 0;
for (int64_t i = 0; i < rs; ++i) {
switch (op) {
case GGML_OP_POOL_AVG: drow[i] = 0; break;
case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
for (int ki = 0; ki < k; ++ki) {
switch (op) {
case GGML_OP_POOL_AVG: drow[i] += srow[j]; break;
case GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
++j;
}
switch (op) {
case GGML_OP_POOL_AVG: drow[i] /= k; break;
case GGML_OP_POOL_MAX: break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
}
cdata += src->nb[1];
drow += rs;
}
}
// ggml_compute_forward_pool_1d
static void ggml_compute_forward_pool_1d(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const int32_t * opts = (const int32_t *)dst->op_params;
enum ggml_op_pool op = opts[0];
const int k0 = opts[1];
const int s0 = opts[2];
const int p0 = opts[3];
GGML_ASSERT(p0 == 0); // padding not supported
GGML_ASSERT(k0 == s0); // only s = k supported
ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst);
}
// ggml_compute_forward_pool_2d
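// General 2D pooling with kernel (k0,k1), stride (s0,s1) and padding (p0,p1).
// Kernel taps that fall outside the source plane are skipped; note that for AVG
// the divisor is always k0*k1, so padded positions effectively count as zeros.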
static void ggml_compute_forward_pool_2d(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src = dst->src[0];
GGML_ASSERT(src->type == GGML_TYPE_F32);
GGML_ASSERT(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int32_t * opts = (const int32_t *)dst->op_params;
enum ggml_op_pool op = opts[0];
const int k0 = opts[1];
const int k1 = opts[2];
const int s0 = opts[3];
const int s1 = opts[4];
const int p0 = opts[5];
const int p1 = opts[6];
const char * cdata = (const char*)src->data;
const char * const data_end = cdata + ggml_nbytes(src);
const int64_t px = dst->ne[0];
const int64_t py = dst->ne[1];
const int64_t pa = px * py;
float * dplane = (float *)dst->data;
const int ka = k0 * k1;
const int offset0 = -p0;
const int offset1 = -p1;
while (cdata < data_end) {
for (int oy = 0; oy < py; ++oy) {
float * const drow = dplane + oy * px;
for (int ox = 0; ox < px; ++ox) {
float * const out = drow + ox;
switch (op) {
case GGML_OP_POOL_AVG: *out = 0; break;
case GGML_OP_POOL_MAX: *out = -FLT_MAX; break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
const int ix = offset0 + ox * s0;
const int iy = offset1 + oy * s1;
for (int ky = 0; ky < k1; ++ky) {
if (iy + ky < 0 || iy + ky >= src->ne[1]) continue;
const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
for (int kx = 0; kx < k0; ++kx) {
int j = ix + kx;
if (j < 0 || j >= src->ne[0]) continue;
switch (op) {
case GGML_OP_POOL_AVG: *out += srow[j]; break;
case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
}
}
switch (op) {
case GGML_OP_POOL_AVG: *out /= ka; break;
case GGML_OP_POOL_MAX: break;
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
}
}
}
cdata += src->nb[2];
dplane += pa;
}
}
// ggml_compute_forward_upscale
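// Nearest-neighbour upscaling by an integer factor: destination element (i0,i1)
// reads source element (i0/scale_factor, i1/scale_factor). Work is split across
// threads along the ne2 dimension.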
static void ggml_compute_forward_upscale_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS
const int scale_factor = dst->op_params[0];
// TODO: optimize
for (int64_t i3 = 0; i3 < ne3; i3++) {
const int64_t i03 = i3;
for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
const int64_t i02 = i2;
for (int64_t i1 = 0; i1 < ne1; i1++) {
const int64_t i01 = i1 / scale_factor;
for (int64_t i0 = 0; i0 < ne0; i0++) {
const int64_t i00 = i0 / scale_factor;
const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
*y = *x;
}
}
}
}
}
static void ggml_compute_forward_upscale(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_upscale_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_pad
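// Zero padding: dst is written contiguously (dst_idx is derived from ne0..ne2),
// values inside the src0 extent are copied and everything else is set to 0.0f.
// Rows (ne1) are distributed across threads.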
static void ggml_compute_forward_pad_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_ASSERT(src0->nb[0] == sizeof(float));
GGML_ASSERT( dst->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS
float * dst_ptr = (float *) dst->data;
// TODO: optimize
for (int64_t i2 = 0; i2 < ne2; ++i2) {
for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
for (int64_t i0 = 0; i0 < ne0; ++i0) {
for (int64_t i3 = 0; i3 < ne3; ++i3) {
const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
dst_ptr[dst_idx] = *src_ptr;
} else {
dst_ptr[dst_idx] = 0;
}
}
}
}
}
}
static void ggml_compute_forward_pad(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_pad_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_arange
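// Fills dst with the sequence start, start + step, ... strictly below stop.
// For example, start = 0, stop = 5, step = 2 gives ceil(5/2) = 3 elements: 0, 2, 4.
// Elements are written in an interleaved fashion across threads (i += nth).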
static void ggml_compute_forward_arange_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_ASSERT(dst->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
const float start = ggml_get_op_params_f32(dst, 0);
const float stop = ggml_get_op_params_f32(dst, 1);
const float step = ggml_get_op_params_f32(dst, 2);
const int64_t steps = (int64_t) ceilf((stop - start) / step);
GGML_ASSERT(ggml_nelements(dst) == steps);
for (int64_t i = ith; i < steps; i+= nth) {
float value = start + step * i;
((float *)dst->data)[i] = value;
}
}
static void ggml_compute_forward_arange(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
switch (dst->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_arange_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
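// ggml_compute_forward_timestep_embedding
// Sinusoidal timestep embedding (commonly used in diffusion models): for each
// input timestep t and each j in [0, half), freq = exp(-log(max_period)*j/half)
// and the output row stores cos(t*freq) in its first half and sin(t*freq) in
// its second half; with an odd dim the extra padding element is zeroed.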
static void ggml_compute_forward_timestep_embedding_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS
const int dim = ggml_get_op_params_i32(dst, 0);
const int max_period = ggml_get_op_params_i32(dst, 1);
int half = dim / 2;
for (int64_t i = 0; i < ne00; i++) {
float * embed_data = (float *)((char *) dst->data + i*nb1);
for (int64_t j = ith; j < half; j += nth) {
float timestep = ((float *)src0->data)[i];
float freq = (float)expf(-logf(max_period) * j / half);
float arg = timestep * freq;
embed_data[j] = cosf(arg);
embed_data[j + half] = sinf(arg);
}
if (dim % 2 != 0 && ith == 0) {
embed_data[dim] = 0.f;
}
}
}
static void ggml_compute_forward_timestep_embedding(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_timestep_embedding_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_argsort
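// Row-wise argsort: each thread takes whole rows, initialises the destination
// with the identity permutation and then orders the indices by comparing the
// referenced src values, ascending or descending as requested in op_params.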
static void ggml_compute_forward_argsort_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT(nb0 == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
const int64_t nr = ggml_nrows(src0);
enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
for (int64_t i = ith; i < nr; i += nth) {
int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
const float * src_data = (float *)((char *) src0->data + i*nb01);
for (int64_t j = 0; j < ne0; j++) {
dst_data[j] = j;
}
// C doesn't have a sort that can capture src_data, so we do a simple O(n^2) exchange sort instead
for (int64_t j = 0; j < ne0; j++) {
for (int64_t k = j + 1; k < ne0; k++) {
if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
(order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
int32_t tmp = dst_data[j];
dst_data[j] = dst_data[k];
dst_data[k] = tmp;
}
}
}
}
}
static void ggml_compute_forward_argsort(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_argsort_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_flash_attn
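// Fused attention over f32 tensors. Roughly, for each query row q_i a thread
// computes S = scale * (K @ q_i) in a scratch buffer in params->wdata, applies
// the causal mask when `masked` is set, takes a softmax of S in place, and then
// writes each of the D output values as a dot product of the softmax weights
// with the corresponding row of V (V is stored with the kv dimension innermost).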
static void ggml_compute_forward_flash_attn_f32(
const struct ggml_compute_params * params,
const bool masked,
struct ggml_tensor * dst) {
const struct ggml_tensor * q = dst->src[0];
const struct ggml_tensor * k = dst->src[1];
const struct ggml_tensor * v = dst->src[2];
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
const int ith = params->ith;
const int nth = params->nth;
const int64_t D = neq0;
const int64_t N = neq1;
const int64_t P = nek1 - N;
const int64_t M = P + N;
const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
GGML_ASSERT(ne0 == D);
GGML_ASSERT(ne1 == N);
GGML_ASSERT(P >= 0);
GGML_ASSERT(nbq0 == sizeof(float));
GGML_ASSERT(nbk0 == sizeof(float));
GGML_ASSERT(nbv0 == sizeof(float));
GGML_ASSERT(neq0 == D);
GGML_ASSERT(nek0 == D);
GGML_ASSERT(nev1 == D);
GGML_ASSERT(neq1 == N);
GGML_ASSERT(nek1 == N + P);
GGML_ASSERT(nev1 == D);
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
if (params->type == GGML_TASK_TYPE_INIT) {
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
// parallelize by q rows using ggml_vec_dot_f32
// total rows in q
const int nr = neq1*neq2*neq3;
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
const float scale = 1.0f/sqrtf(D);
//printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
for (int ir = ir0; ir < ir1; ++ir) {
// q indices
const int iq3 = ir/(neq2*neq1);
const int iq2 = (ir - iq3*neq2*neq1)/neq1;
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32);
for (int i = M; i < Mup; ++i) {
S[i] = -INFINITY;
}
const int64_t masked_begin = masked ? (P + iq1 + 1) : M;
for (int64_t ic = 0; ic < masked_begin; ++ic) {
// k indices
const int ik3 = iq3;
const int ik2 = iq2 % nek2;
const int ik1 = ic;
// S indices
const int i1 = ik1;
ggml_vec_dot_f32(neq0,
S + i1, 0,
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
}
// scale
ggml_vec_scale_f32(masked_begin, S, scale);
for (int64_t i = masked_begin; i < M; i++) {
S[i] = -INFINITY;
}
// softmax
// exclude known -INF S[..] values from max and loop
// don't forget to set their S values to zero
{
float max = -INFINITY;
ggml_vec_max_f32(masked_begin, &max, S);
ggml_float sum = 0.0;
{
#ifdef GGML_SOFT_MAX_ACCELERATE
max = -max;
vDSP_vsadd(S, 1, &max, S, 1, Mup);
vvexpf(S, S, &Mup);
ggml_vec_sum_f32(Mup, &sum, S);
#else
uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
if (i >= masked_begin) {
break;
}
float * SS = S + i;
for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
if (i + j >= masked_begin) {
break;
} else if (SS[j] == -INFINITY) {
SS[j] = 0.0f;
} else {
#ifndef GGML_FLASH_ATTN_EXP_FP16
const float val = expf(SS[j] - max);
#else
ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
memcpy(&scvt[j], &s, sizeof(uint16_t));
const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
#endif
sump[j] += (ggml_float)val;
SS[j] = val;
}
}
}
for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
sum += sump[i];
}
#endif
}
assert(sum > 0.0);
sum = 1.0/sum;
ggml_vec_scale_f32(masked_begin, S, sum);
#ifndef NDEBUG
for (int i = 0; i < masked_begin; ++i) {
assert(!isnan(S[i]));
assert(!isinf(S[i]));
}
#endif
}
for (int64_t ic = 0; ic < nev1; ++ic) {
// dst indices
const int i1 = iq1;
const int i2 = iq2;
const int i3 = iq3;
// v indices
const int iv2 = iq2 % nev2;
const int iv3 = iq3;
ggml_vec_dot_f32(masked_begin,
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
(float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
S, 0, 1);
}
}
}
static void ggml_compute_forward_flash_attn_f16(
const struct ggml_compute_params * params,
const bool masked,
struct ggml_tensor * dst) {
const struct ggml_tensor * q = dst->src[0];
const struct ggml_tensor * k = dst->src[1];
const struct ggml_tensor * v = dst->src[2];
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
const int ith = params->ith;
const int nth = params->nth;
const int64_t D = neq0;
const int64_t N = neq1;
const int64_t P = nek1 - N;
const int64_t M = P + N;
const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
GGML_ASSERT(ne0 == D);
GGML_ASSERT(ne1 == N);
GGML_ASSERT(P >= 0);
GGML_ASSERT(nbq0 == sizeof(ggml_fp16_t));
GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t));
GGML_ASSERT(neq0 == D);
GGML_ASSERT(nek0 == D);
GGML_ASSERT(nev1 == D);
GGML_ASSERT(neq1 == N);
GGML_ASSERT(nek1 == N + P);
GGML_ASSERT(nev1 == D);
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
if (params->type == GGML_TASK_TYPE_INIT) {
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
// parallelize by q rows using ggml_vec_dot_f16
// total rows in q
const int nr = neq1*neq2*neq3;
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
const float scale = 1.0f/sqrtf(D);
//printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
for (int ir = ir0; ir < ir1; ++ir) {
// q indices
const int iq3 = ir/(neq2*neq1);
const int iq2 = (ir - iq3*neq2*neq1)/neq1;
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32);
for (int i = M; i < Mup; ++i) {
S[i] = -INFINITY;
}
if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) {
for (int64_t ic = 0; ic < nek1; ++ic) {
// k indices
const int ik3 = iq3;
const int ik2 = iq2 % nek2;
const int ik1 = ic;
// S indices
const int i1 = ik1;
ggml_vec_dot_f16(neq0,
S + i1, 0,
(ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
(ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
}
} else {
for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
// k indices
const int ik3 = iq3;
const int ik2 = iq2 % nek2;
const int ik1 = ic;
// S indices
const int i1 = ik1;
ggml_vec_dot_f16_unroll(neq0, nbk1,
S + i1,
((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
(ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
}
}
// scale
ggml_vec_scale_f32(nek1, S, scale);
if (masked) {
for (int64_t i = P; i < M; i++) {
if (i > P + iq1) {
S[i] = -INFINITY;
}
}
}
// softmax
// todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero.
// don't forget to set their S values to zero
{
float max = -INFINITY;
ggml_vec_max_f32(M, &max, S);
ggml_float sum = 0.0;
{
#ifdef GGML_SOFT_MAX_ACCELERATE
max = -max;
vDSP_vsadd(S, 1, &max, S, 1, Mup);
vvexpf(S, S, &Mup);
ggml_vec_sum_f32(Mup, &sum, S);
#else
uint16_t scvt[GGML_SOFT_MAX_UNROLL];
ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
float * SS = S + i;
for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
if (SS[j] == -INFINITY) {
SS[j] = 0.0f;
} else {
ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
memcpy(&scvt[j], &s, sizeof(uint16_t));
const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
sump[j] += (ggml_float)val;
SS[j] = val;
}
}
}
for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
sum += sump[i];
}
#endif
}
assert(sum > 0.0);
sum = 1.0/sum;
ggml_vec_scale_f32(M, S, sum);
#ifndef NDEBUG
for (int i = 0; i < M; ++i) {
assert(!isnan(S[i]));
assert(!isinf(S[i]));
}
#endif
}
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup);
for (int64_t i = 0; i < M; i++) {
S16[i] = GGML_FP32_TO_FP16(S[i]);
}
// todo: exclude known zero S[..] values from dot (reducing nev0 and increasing begin of v and S16).
if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) {
for (int64_t ic = 0; ic < nev1; ++ic) {
// dst indices
const int i1 = iq1;
const int i2 = iq2;
const int i3 = iq3;
// v indices
const int iv2 = iq2 % nev2;
const int iv3 = iq3;
ggml_vec_dot_f16(nev0,
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
(ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
S16, 0, 1);
}
} else {
for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
// dst indices
const int i1 = iq1;
const int i2 = iq2;
const int i3 = iq3;
// v indices
const int iv2 = iq2 % nev2;
const int iv3 = iq3;
ggml_vec_dot_f16_unroll(nev0, nbv1,
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
S16);
}
}
}
}
static void ggml_compute_forward_flash_attn(
const struct ggml_compute_params * params,
const bool masked,
struct ggml_tensor * dst) {
const struct ggml_tensor * q = dst->src[0];
switch (q->type) {
case GGML_TYPE_F16:
{
ggml_compute_forward_flash_attn_f16(params, masked, dst);
} break;
case GGML_TYPE_F32:
{
ggml_compute_forward_flash_attn_f32(params, masked, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_flash_ff
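// Fused feed-forward block over f16 weights. For each input row a_i, roughly:
//   S   = b0 @ a_i + b1     (f16 mat-vec plus f32 bias)
//   S16 = gelu(S)           (computed in f16)
//   dst = c0 @ S16 + c1     (projection plus f32 bias)
// with S and S16 living in a per-thread scratch area of params->wdata.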
static void ggml_compute_forward_flash_ff_f16(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * a = dst->src[0]; // F16
const struct ggml_tensor * b0 = dst->src[1]; // F16 fc_w
const struct ggml_tensor * b1 = dst->src[2]; // F32 fc_b
const struct ggml_tensor * c0 = dst->src[3]; // F16 proj_w
const struct ggml_tensor * c1 = dst->src[4]; // F32 proj_b
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
GGML_TENSOR_LOCALS(int64_t, nea, a, ne)
GGML_TENSOR_LOCALS(size_t, nba, a, nb)
GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne)
GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb)
GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne)
GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb)
GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne)
GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb)
GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne)
GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
const int ith = params->ith;
const int nth = params->nth;
const int64_t D = nea0;
//const int64_t N = nea1;
const int64_t M = neb01;
GGML_ASSERT(ne0 == nea0);
GGML_ASSERT(ne1 == nea1);
GGML_ASSERT(ne2 == nea2);
GGML_ASSERT(nba0 == sizeof(ggml_fp16_t));
GGML_ASSERT(nbb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nbb10 == sizeof(float));
GGML_ASSERT(nbc00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nbc10 == sizeof(float));
GGML_ASSERT(neb00 == D);
GGML_ASSERT(neb01 == M);
GGML_ASSERT(neb10 == M);
GGML_ASSERT(neb11 == 1);
GGML_ASSERT(nec00 == M);
GGML_ASSERT(nec01 == D);
GGML_ASSERT(nec10 == D);
GGML_ASSERT(nec11 == 1);
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
if (params->type == GGML_TASK_TYPE_INIT) {
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
// parallelize by a rows using ggml_vec_dot_f16
// total rows in a
const int nr = nea1*nea2*nea3;
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int ir = ir0; ir < ir1; ++ir) {
// a indices
const int ia3 = ir/(nea2*nea1);
const int ia2 = (ir - ia3*nea2*nea1)/nea1;
const int ia1 = (ir - ia3*nea2*nea1 - ia2*nea1);
float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
for (int64_t ic = 0; ic < neb01; ++ic) {
// b0 indices
const int ib03 = ia3;
const int ib02 = ia2;
const int ib01 = ic;
// S indices
const int i1 = ib01;
ggml_vec_dot_f16(nea0,
S + i1, 0,
(ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0,
(ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1);
}
ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
//ggml_vec_gelu_f32(neb01, S, S);
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
for (int64_t i = 0; i < M; i++) {
S16[i] = GGML_FP32_TO_FP16(S[i]);
}
ggml_vec_gelu_f16(neb01, S16, S16);
{
// dst indices
const int i1 = ia1;
const int i2 = ia2;
const int i3 = ia3;
for (int64_t ic = 0; ic < nec01; ++ic) {
ggml_vec_dot_f16(neb01,
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
(ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0,
S16, 0, 1);
}
ggml_vec_add_f32(nec01,
(float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),
(float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),
(float *) c1->data);
}
}
}
static void ggml_compute_forward_flash_ff(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * b0 = dst->src[1];
switch (b0->type) {
case GGML_TYPE_F16:
{
ggml_compute_forward_flash_ff_f16(params, dst);
} break;
case GGML_TYPE_F32:
{
GGML_ASSERT(false); // TODO
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_flash_attn_back
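// Backward pass of flash attention. The gradients for q, k and v are packed
// back-to-back into dst->data at offsets offs_q, offs_k and offs_v (each region
// padded to GGML_MEM_ALIGN); INIT zeroes the whole buffer and the per-row math
// is documented step by step inside the loop below.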
static void ggml_compute_forward_flash_attn_back_f32(
const struct ggml_compute_params * params,
const bool masked,
struct ggml_tensor * dst) {
const struct ggml_tensor * q = dst->src[0];
const struct ggml_tensor * k = dst->src[1];
const struct ggml_tensor * v = dst->src[2];
const struct ggml_tensor * d = dst->src[3];
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
GGML_TENSOR_LOCALS(int64_t, ned, d, ne)
GGML_TENSOR_LOCALS(size_t, nbd, d, nb)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
const int ith = params->ith;
const int nth = params->nth;
const int64_t D = neq0;
const int64_t N = neq1;
const int64_t P = nek1 - N;
const int64_t M = P + N;
const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
const int mxDM = MAX(D, Mup);
// GGML_ASSERT(ne0 == D);
// GGML_ASSERT(ne1 == N);
GGML_ASSERT(P >= 0);
GGML_ASSERT(nbq0 == sizeof(float));
GGML_ASSERT(nbk0 == sizeof(float));
GGML_ASSERT(nbv0 == sizeof(float));
GGML_ASSERT(neq0 == D);
GGML_ASSERT(nek0 == D);
GGML_ASSERT(nev1 == D);
GGML_ASSERT(ned0 == D);
GGML_ASSERT(neq1 == N);
GGML_ASSERT(nek1 == N + P);
GGML_ASSERT(nev1 == D);
GGML_ASSERT(ned1 == N);
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
if (params->type == GGML_TASK_TYPE_INIT) {
if (ith == 0) {
memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
}
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int64_t elem_q = ggml_nelements(q);
const int64_t elem_k = ggml_nelements(k);
enum ggml_type result_type = dst->type;
GGML_ASSERT(ggml_blck_size(result_type) == 1);
const size_t tsize = ggml_type_size(result_type);
const size_t offs_q = 0;
const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN);
const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN);
void * grad_q = (char *) dst->data;
void * grad_k = (char *) dst->data + offs_k;
void * grad_v = (char *) dst->data + offs_v;
const size_t nbgq1 = nb0*neq0;
const size_t nbgq2 = nb0*neq0*neq1;
const size_t nbgq3 = nb0*neq0*neq1*neq2;
const size_t nbgk1 = nb0*nek0;
const size_t nbgk2 = nb0*nek0*nek1;
const size_t nbgk3 = nb0*nek0*nek1*neq2;
const size_t nbgv1 = nb0*nev0;
const size_t nbgv2 = nb0*nev0*nev1;
const size_t nbgv3 = nb0*nev0*nev1*neq2;
// parallelize by k rows using ggml_vec_dot_f32
// total rows in k
const int nr = nek2*nek3;
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
const float scale = 1.0f/sqrtf(D);
//printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
// how often k2 (and v2) is repeated in q2
int nrep = neq2/nek2;
for (int ir = ir0; ir < ir1; ++ir) {
// k indices
const int ik3 = ir/(nek2);
const int ik2 = ir - ik3*nek2;
const int iq3 = ik3;
const int id3 = ik3;
const int iv3 = ik3;
const int iv2 = ik2;
for (int irep = 0; irep < nrep; ++irep) {
const int iq2 = ik2 + irep*nek2;
const int id2 = iq2;
// (ik2 + irep*nek2) % nek2 == ik2
for (int iq1 = 0; iq1 < neq1; ++iq1) {
const int id1 = iq1;
// not sure about CACHE_LINE_SIZE_F32..
// - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset?
float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32);
float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32);
for (int i = M; i < Mup; ++i) {
S[i] = -INFINITY;
}
const int64_t masked_begin = masked ? (P + iq1 + 1) : M;
for (int64_t ic = 0; ic < masked_begin; ++ic) {
// k indices
const int ik1 = ic;
// S indices
const int i1 = ik1;
ggml_vec_dot_f32(neq0,
S + i1, 0,
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
}
// scale
ggml_vec_scale_f32(masked_begin, S, scale);
for (int64_t i = masked_begin; i < M; i++) {
S[i] = -INFINITY;
}
// softmax
// exclude known -INF S[..] values from max and loop
// don't forget to set their SM values to zero
{
float max = -INFINITY;
ggml_vec_max_f32(masked_begin, &max, S);
ggml_float sum = 0.0;
{
#ifdef GGML_SOFT_MAX_ACCELERATE
max = -max;
vDSP_vsadd(SM, 1, &max, SM, 1, Mup);
vvexpf(SM, SM, &Mup);
ggml_vec_sum_f32(Mup, &sum, SM);
#else
uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
if (i >= masked_begin) {
break;
}
float * SR = S + i;
float * SW = SM + i;
for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
if (i + j >= masked_begin) {
break;
} else if (SR[j] == -INFINITY) {
SW[j] = 0.0f;
} else {
#ifndef GGML_FLASH_ATTN_EXP_FP16
const float val = expf(SR[j] - max);
#else
ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
memcpy(&scvt[j], &s, sizeof(uint16_t));
const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
#endif
sump[j] += (ggml_float)val;
SW[j] = val;
}
}
}
for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
sum += sump[i];
}
#endif
}
assert(sum > 0.0);
sum = 1.0/sum;
ggml_vec_scale_f32(masked_begin, SM, sum);
}
// step-by-step explanation
{
// forward-process shape grads from backward process
// parallel_for ik2,ik3:
// for irep:
// iq2 = ik2 + irep*nek2
// k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,ik2,ik3] += grad[kcur]
// q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur]
// v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iv2,iv3] += grad[vcur]
// for iq1:
// kcur = k[:D,:M,ik2,ik3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur
// qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur
// vcur = v[:M,:D,iv2,iv3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4
// S0 = -Inf [D,1,1,1]
// ~S1[i] = dot(kcur[:D,i], qcur)
// S1 = qcur @ kcur.T [M,1,1,1] grad[S1] = grad[S2] * scale
// S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P)
// S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4]))
// S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur
// ~S5[i] = dot(vcur[:,i], S4)
// S5 = S4 @ vcur.T [D,1,1,1] grad[S5] = d[:D,id1,id2,id3]
// ~dst[i,iq1,iq2,iq3] = S5[i] ^
// dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,id1,id2,id3]
// dst backward-/ grad[dst] = d
//
// output gradients with their dependencies:
//
// grad[kcur] = grad[S1].T @ qcur
// grad[S1] = diag_mask_zero(grad[S3], P) * scale
// grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4]))
// grad[S4] = grad[S5] @ vcur
// grad[S4] = d[:D,id1,id2,id3] @ vcur
// grad[qcur] = grad[S1] @ kcur
// grad[vcur] = grad[S5].T @ S4
// grad[vcur] = d[:D,id1,id2,id3].T @ S4
//
// in post-order:
//
// S1 = qcur @ kcur.T
// S2 = S1 * scale
// S3 = diag_mask_inf(S2, P)
// S4 = softmax(S3)
// grad[S4] = d[:D,id1,id2,id3] @ vcur
// grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4]))
// grad[S1] = diag_mask_zero(grad[S3], P) * scale
// grad[qcur] = grad[S1] @ kcur
// grad[kcur] = grad[S1].T @ qcur
// grad[vcur] = d[:D,id1,id2,id3].T @ S4
//
// using less variables (SM=S4):
//
// S = diag_mask_inf(qcur @ kcur.T * scale, P)
// SM = softmax(S)
// S = d[:D,iq1,iq2,iq3] @ vcur
// dot_SM_gradSM = dot(SM, S)
// S = SM * (S - dot(SM, S))
// S = diag_mask_zero(S, P) * scale
//
// grad[q][:D,iq1,iq2,iq3] += S @ kcur
// grad[k][:D,:M,ik2,ik3] += S.T @ qcur
// grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T @ SM
}
// S = gradSM = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3]
// S = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3]
// for ic:
// S[:M] += vcur[:M,ic,iv2,iv3] * d[ic,id1,id2,id3]
// exclude known future zero S[..] values from operation
ggml_vec_set_f32(masked_begin, S, 0);
for (int64_t ic = 0; ic < D; ++ic) {
ggml_vec_mad_f32(masked_begin,
S,
(float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
*(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3)));
}
// S = SM * (S - dot(SM, S))
float dot_SM_gradSM = 0;
ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
ggml_vec_mul_f32 (masked_begin, S, S, SM);
// S = diag_mask_zero(S, P) * scale
// already done by above ggml_vec_set_f32
// exclude known zero S[..] values from operation
ggml_vec_scale_f32(masked_begin, S, scale);
// S shape [M,1]
// SM shape [M,1]
// kcur shape [D,M]
// qcur shape [D,1]
// vcur shape [M,D]
// grad[q][:D,iq1,iq2,iq3] += S @ kcur
// grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M]
// for ic:
// grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic,ik2,ik3]
// exclude known zero S[..] values from loop
for (int64_t ic = 0; ic < masked_begin; ++ic) {
ggml_vec_mad_f32(D,
(float *) ((char *) grad_q + (iq1*nbgq1 + iq2*nbgq2 + iq3*nbgq3)),
(float *) ((char *) k->data + (ic*nbk1 + ik2*nbk2 + ik3*nbk3)),
S[ic]);
}
// grad[k][:D,:M,iq2,iq3] += S.T @ qcur
// for ic:
// grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0]
// grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0]
// exclude known zero S[..] values from loop
for (int64_t ic = 0; ic < masked_begin; ++ic) {
ggml_vec_mad_f32(D,
(float *) ((char *) grad_k + (ic*nbgk1 + ik2*nbgk2 + ik3*nbgk3)),
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)),
S[ic]);
}
// grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T @ SM
// for ic:
// grad[v][:M,ic,iv2,iv3] += d[:D,id1,id2,id3].T[0,ic] * SM[:M]
// grad[v][:M,ic,iv2,iv3] += d[ic,id1,id2,id3] * SM[:M]
// exclude known zero SM[..] values from mad
for (int64_t ic = 0; ic < D; ++ic) {
ggml_vec_mad_f32(masked_begin,
(float *) ((char *) grad_v + ( ic*nbgv1 + iv2*nbgv2 + iv3*nbgv3)),
SM,
*(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3)));
}
}
}
}
}
static void ggml_compute_forward_flash_attn_back(
const struct ggml_compute_params * params,
const bool masked,
struct ggml_tensor * dst) {
const struct ggml_tensor * q = dst->src[0];
switch (q->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_flash_attn_back_f32(params, masked, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_ssm_conv
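// Convolution step of the Mamba SSM block: for every token, the per-channel
// conv state (the previous d_conv - 1 columns) is shifted left by one, the new
// x value is appended as the last column, and the output is the row-wise dot
// product of the updated window with the conv1d weights. Explicit state copies
// handle tokens that belong to several sequences (via the state_seq indices).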
static void ggml_compute_forward_ssm_conv_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const struct ggml_tensor * src0 = dst->src[0]; // conv_state
const struct ggml_tensor * src1 = dst->src[1]; // x
const struct ggml_tensor * src2 = dst->src[2]; // conv1d.weight
const struct ggml_tensor * src3 = dst->src[3]; // state_seq
const int ith = params->ith;
const int nth = params->nth;
const int nc = src2->ne[0]; // d_conv
const int nr = src0->ne[1]; // d_inner
const int n_t = src1->ne[1]; // n_tokens
const int n_kv = src0->ne[2]; // max number of sequences in the batch
GGML_ASSERT((nr*n_t) + (nc*nr*n_kv) == ggml_nelements(dst));
GGML_ASSERT(src0->nb[0] == sizeof(float));
GGML_ASSERT(src1->nb[0] == sizeof(float));
GGML_ASSERT(src2->nb[0] == sizeof(float));
GGML_ASSERT(src3->nb[0] == sizeof(int32_t));
GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
// for use with the destination state offset between sequences
GGML_ASSERT(src2->nb[2] == src2->ne[1]*src2->ne[0]*sizeof(float));
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
const int ir = ir1 - ir0;
if (n_kv > 1) {
// multiple sequences means it's hard to know when it's the first time a state is read,
// so copy them all over to the destination, just to be sure.
for (int i3 = 0; i3 < n_kv; ++i3) {
float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]));
float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + i3*(src2->nb[2]) + nr*n_t*sizeof(float));
// can't use memcpy because of d_conv vs d_conv - 1
for (int i1 = 0; i1 < ir; ++i1) {
for (int i0 = 0; i0 < nc - 1; ++i0) {
// copy s0 to last (d_conv - 1) columns of s
s[1 + i0 + i1*nc] = s0[i0 + i1*(nc - 1)];
}
}
}
}
for (int i2 = 0; i2 < n_t; ++i2) {
int32_t * sq = (int32_t *) ((char *) src3->data + i2*(src3->nb[1])); // {n_kv, n_tokens}
float * x = (float *) ((char *) dst->data + ir0*sizeof(float) + i2*(nr*sizeof(float))); // {d_inner, n_tokens}
float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + sq[0]*(src2->nb[2]) + nr*n_t*sizeof(float)); // {d_conv, d_inner, n_kv}
float * s0; // {d_conv - 1, d_inner, n_kv}
float * x0 = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
float * c = (float *) ((char *) src2->data + ir0*(src2->nb[1])); // {d_conv, d_inner}
int ne0s0;
GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv);
// avoid needing to copy the state for the first token
if (i2 == 0) {
s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_conv - 1, d_inner, n_kv}
ne0s0 = src0->ne[0];
} else {
// the source is the last (d_conv - 1) columns of the destination
s0 = s + 1;
ne0s0 = nc;
}
// d_inner
for (int i1 = 0; i1 < ir; ++i1) {
// shift state left
for (int i0 = 0; i0 < nc - 1; ++i0) {
s[i0 + i1*nc] = s0[i0 + i1*ne0s0];
}
// insert x on the last column
s[(nc - 1) + i1*nc] = x0[i1];
}
// handle copies when there are multiple output states
for (int i3 = 1; i3 < n_kv; ++i3) {
int32_t seq = sq[i3];
if (0 <= seq && seq < n_kv) {
float * s1 = s + (seq - sq[0])*nc*nr;
memcpy(s1, s, nc*ir*sizeof(float));
} else {
// stop at negative or too big seq_ids
break;
}
}
// it seems a little faster when this is separate from the state shift
for (int i1 = 0; i1 < ir; ++i1) {
// rowwise dot product
float sumf = 0.0f;
for (int i0 = 0; i0 < nc; ++i0) {
int i = i0 + i1*nc;
sumf += s[i] * c[i];
}
x[i1] = sumf;
}
}
}
static void ggml_compute_forward_ssm_conv(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
switch (dst->src[0]->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_ssm_conv_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_ssm_scan
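// Selective-scan step of the Mamba SSM block. Per channel and per token:
//   dt' = softplus(dt)   (with a linear fallback above 20.0f)
//   state = state * exp(dt' * A) + B * (x * dt')
//   y = dot(state, C)
// The y values go at the start of dst and the updated states follow at offset
// src1->nb[2]; as in ssm_conv, extra copies handle multi-sequence tokens.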
static void ggml_compute_forward_ssm_scan_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const struct ggml_tensor * src0 = dst->src[0]; // s
const struct ggml_tensor * src1 = dst->src[1]; // x
const struct ggml_tensor * src2 = dst->src[2]; // dt
const struct ggml_tensor * src3 = dst->src[3]; // A
const struct ggml_tensor * src4 = dst->src[4]; // B
const struct ggml_tensor * src5 = dst->src[5]; // C
const struct ggml_tensor * src6 = dst->src[6]; // sq
const int ith = params->ith;
const int nth = params->nth;
const int64_t nc = src0->ne[0]; // d_state
const int64_t nr = src0->ne[1]; // d_inner
const int64_t n_t = src1->ne[1]; // number of tokens in the batch
const int64_t n_kv = src0->ne[2]; // max number of sequences in the batch
GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst));
GGML_ASSERT(src0->nb[0] == sizeof(float));
GGML_ASSERT(src1->nb[0] == sizeof(float));
GGML_ASSERT(src2->nb[0] == sizeof(float));
GGML_ASSERT(src3->nb[0] == sizeof(float));
GGML_ASSERT(src4->nb[0] == sizeof(float));
GGML_ASSERT(src5->nb[0] == sizeof(float));
// required for the dot product between s and C, and when copying the states
GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
// required for per-sequence offsets for states
GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float));
// required to get correct offset for state destination (i.e. src1->nb[2])
GGML_ASSERT(src1->nb[2] == src1->ne[0]*src1->ne[1]*sizeof(float));
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
const int ir = ir1 - ir0;
if (n_kv > 1) {
// it's hard to know whether the source states have already been copied
// when there are multiple sequences, so copy them all upfront, just to be sure.
for (int i3 = 0; i3 < n_kv; ++i3) {
float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]));
float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[2]);
memcpy(s, s0, nc*ir*sizeof(float));
}
}
for (int i2 = 0; i2 < n_t; ++i2) {
int32_t * sq = (int32_t *) ((char *) src6->data + i2*(src6->nb[1])); // {n_kv, n_tokens}
float * y = (float *) ((char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2]) + src1->nb[2]); // {d_state, d_inner, n_kv}
float * s0;
float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1])); // {d_inner, n_tokens}
float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
float * B = (float *) ((char *) src4->data + i2*(src4->nb[1])); // {d_state, n_tokens}
float * C = (float *) ((char *) src5->data + i2*(src5->nb[1])); // {d_state, n_tokens}
GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv);
// avoid needing to copy the state for the first token
if (i2 == 0) {
s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_state, d_inner, n_kv}
} else {
// otherwise the source is the same as the destination
s0 = s;
}
// d_inner
for (int i1 = 0; i1 < ir; ++i1) {
// ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
float x_dt = x[i1] * dt_soft_plus;
float sumf = 0.0f;
// d_state
for (int i0 = 0; i0 < nc; ++i0) {
int i = i0 + i1*nc;
// state = prev_state * dA + dB * x
float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
// y = rowwise_dotprod(state, C)
sumf += state * C[i0];
s[i] = state;
}
y[i1] = sumf;
}
// handle copies when there are multiple output states
for (int i3 = 1; i3 < n_kv; ++i3) {
int32_t seq = sq[i3];
if (0 <= seq && seq < n_kv) {
float * s1 = s + (seq - sq[0])*nc*nr;
memcpy(s1, s, nc*ir*sizeof(float));
} else {
// stop at negative or too big seq_ids
break;
}
}
}
}
static void ggml_compute_forward_ssm_scan(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
switch (dst->src[0]->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_ssm_scan_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_win_part
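// Partitions the input into nep0 x nep1 non-overlapping w x w windows along the
// spatial dimensions; positions that fall past the input extent are zero-filled.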
static void ggml_compute_forward_win_part_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
const int32_t nep0 = ((const int32_t *)(dst->op_params))[0];
const int32_t nep1 = ((const int32_t *)(dst->op_params))[1];
const int32_t w = ((const int32_t *)(dst->op_params))[2];
assert(ne00 == ne0);
assert(ne3 == nep0*nep1);
// TODO: optimize / multi-thread
for (int py = 0; py < nep1; ++py) {
for (int px = 0; px < nep0; ++px) {
const int64_t i3 = py*nep0 + px;
for (int64_t i2 = 0; i2 < ne2; ++i2) {
for (int64_t i1 = 0; i1 < ne1; ++i1) {
for (int64_t i0 = 0; i0 < ne0; ++i0) {
const int64_t i02 = py*w + i2;
const int64_t i01 = px*w + i1;
const int64_t i00 = i0;
const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0;
const int64_t j = i02*ne01*ne00 + i01*ne00 + i00;
if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
((float *) dst->data)[i] = 0.0f;
} else {
((float *) dst->data)[i] = ((float *) src0->data)[j];
}
}
}
}
}
}
}
static void ggml_compute_forward_win_part(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_win_part_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_win_unpart
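// inverse of win_part: merges the w x w windows back into a single tensor, dropping the zero padding added during partitioning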
static void ggml_compute_forward_win_unpart_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
const int32_t w = ((const int32_t *)(dst->op_params))[0];
// padding
const int px = (w - ne1%w)%w;
//const int py = (w - ne2%w)%w;
const int npx = (px + ne1)/w;
//const int npy = (py + ne2)/w;
assert(ne0 == ne00);
// TODO: optimize / multi-thread
for (int64_t i2 = 0; i2 < ne2; ++i2) {
for (int64_t i1 = 0; i1 < ne1; ++i1) {
for (int64_t i0 = 0; i0 < ne0; ++i0) {
const int ip2 = i2/w;
const int ip1 = i1/w;
const int64_t i02 = i2%w;
const int64_t i01 = i1%w;
const int64_t i00 = i0;
const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00;
const int64_t j = i2*ne1*ne0 + i1*ne0 + i0;
((float *) dst->data)[j] = ((float *) src0->data)[i];
}
}
}
}
static void ggml_compute_forward_win_unpart(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
2023-06-25 11:22:21 +00:00
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_win_unpart_f32(params, dst);
2023-06-25 11:22:21 +00:00
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_unary
static void ggml_compute_forward_unary(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const enum ggml_unary_op op = ggml_get_unary_op(dst);
switch (op) {
case GGML_UNARY_OP_ABS:
{
ggml_compute_forward_abs(params, dst);
} break;
case GGML_UNARY_OP_SGN:
{
ggml_compute_forward_sgn(params, dst);
} break;
case GGML_UNARY_OP_NEG:
{
ggml_compute_forward_neg(params, dst);
} break;
case GGML_UNARY_OP_STEP:
{
ggml_compute_forward_step(params, dst);
} break;
case GGML_UNARY_OP_TANH:
{
ggml_compute_forward_tanh(params, dst);
} break;
case GGML_UNARY_OP_ELU:
{
ggml_compute_forward_elu(params, dst);
} break;
case GGML_UNARY_OP_RELU:
{
ggml_compute_forward_relu(params, dst);
} break;
case GGML_UNARY_OP_GELU:
{
ggml_compute_forward_gelu(params, dst);
} break;
case GGML_UNARY_OP_GELU_QUICK:
{
ggml_compute_forward_gelu_quick(params, dst);
} break;
case GGML_UNARY_OP_SILU:
{
ggml_compute_forward_silu(params, dst);
} break;
case GGML_UNARY_OP_HARDSWISH:
{
ggml_compute_forward_hardswish(params, dst);
} break;
case GGML_UNARY_OP_HARDSIGMOID:
{
ggml_compute_forward_hardsigmoid(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_get_rel_pos
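// gathers rows of relative positional embeddings from src0: for each (i1, i2) the row at relative position (w - i1 - 1) + i2 is copied into dst
// (see the segment-anything reference linked inside the function)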
static void ggml_compute_forward_get_rel_pos_f16(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
// ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
GGML_TENSOR_UNARY_OP_LOCALS
const int64_t w = ne1;
ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data;
ggml_fp16_t * dst_data = (ggml_fp16_t *) dst->data;
for (int64_t i2 = 0; i2 < ne2; ++i2) {
for (int64_t i1 = 0; i1 < ne1; ++i1) {
const int64_t pos = (w - i1 - 1) + i2;
for (int64_t i0 = 0; i0 < ne0; ++i0) {
dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0];
}
}
}
}
static void ggml_compute_forward_get_rel_pos(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F16:
{
ggml_compute_forward_get_rel_pos_f16(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_add_rel_pos
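// adds the two relative-position terms from src1 and src2 into the attention scores in dst, optionally in place (op_params[0]);
// cf. add_decomposed_rel_pos in the segment-anything reference linked inside the function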
static void ggml_compute_forward_add_rel_pos_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
const struct ggml_tensor * src2 = dst->src[2];
const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
if (!inplace && params->type == GGML_TASK_TYPE_INIT) {
if (params->ith != 0) {
return;
}
memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
return;
}
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
// ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
float * src1_data = (float *) src1->data;
float * src2_data = (float *) src2->data;
float * dst_data = (float *) dst->data;
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
const int ith = params->ith;
const int nth = params->nth;
// total patches in dst
const int np = ne13;
// patches per thread
const int dp = (np + nth - 1)/nth;
// patch range for this thread
const int ip0 = dp*ith;
const int ip1 = MIN(ip0 + dp, np);
for (int64_t i13 = ip0; i13 < ip1; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10;
for (int64_t i10 = 0; i10 < ne10; ++i10) {
const int64_t jp0 = jp1 + i10;
const float src1_e = src1_data[jp0];
const float src2_e = src2_data[jp0];
const int64_t jdh = jp0 * ne10;
const int64_t jdw = jdh - (ne10 - 1) * i10;
for (int64_t j = 0; j < ne10; ++j) {
dst_data[jdh + j ] += src2_e;
dst_data[jdw + j*ne10] += src1_e;
}
}
}
}
}
}
static void ggml_compute_forward_add_rel_pos(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_add_rel_pos_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_map_unary
static void ggml_compute_forward_map_unary_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst,
const ggml_unary_op_f32_t fun) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert( dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
fun(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_map_unary(
const struct ggml_compute_params * params,
struct ggml_tensor * dst,
const ggml_unary_op_f32_t fun) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_map_unary_f32(params, dst, fun);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_map_binary
static void ggml_compute_forward_map_binary_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst,
const ggml_binary_op_f32_t fun) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert( dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
assert(src1->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
fun(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])),
(float *) ((char *) src1->data + i*(src1->nb[1])));
}
}
static void ggml_compute_forward_map_binary(
const struct ggml_compute_params * params,
struct ggml_tensor * dst,
const ggml_binary_op_f32_t fun) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_map_binary_f32(params, dst, fun);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_map_custom1
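// note: the *_f32 map_custom variants below run the user callback on the whole tensors from a single thread (params->ith == 0)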
static void ggml_compute_forward_map_custom1_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst,
const ggml_custom1_op_f32_t fun) {
const struct ggml_tensor * a = dst->src[0];
assert(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
fun(dst, a);
}
// ggml_compute_forward_map_custom2
static void ggml_compute_forward_map_custom2_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst,
const ggml_custom2_op_f32_t fun) {
const struct ggml_tensor * a = dst->src[0];
const struct ggml_tensor * b = dst->src[1];
assert(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
fun(dst, a, b);
}
// ggml_compute_forward_map_custom3
static void ggml_compute_forward_map_custom3_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst,
const ggml_custom3_op_f32_t fun) {
const struct ggml_tensor * a = dst->src[0];
const struct ggml_tensor * b = dst->src[1];
const struct ggml_tensor * c = dst->src[2];
assert(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
fun(dst, a, b, c);
}
// ggml_compute_forward_map_custom1
static void ggml_compute_forward_map_custom1(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * a = dst->src[0];
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
struct ggml_map_custom1_op_params p;
memcpy(&p, dst->op_params, sizeof(p));
p.fun(dst, a, params->ith, params->nth, p.userdata);
}
// ggml_compute_forward_map_custom2
static void ggml_compute_forward_map_custom2(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * a = dst->src[0];
const struct ggml_tensor * b = dst->src[1];
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
struct ggml_map_custom2_op_params p;
memcpy(&p, dst->op_params, sizeof(p));
p.fun(dst, a, b, params->ith, params->nth, p.userdata);
}
// ggml_compute_forward_map_custom3
static void ggml_compute_forward_map_custom3(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * a = dst->src[0];
const struct ggml_tensor * b = dst->src[1];
const struct ggml_tensor * c = dst->src[2];
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
struct ggml_map_custom3_op_params p;
memcpy(&p, dst->op_params, sizeof(p));
p.fun(dst, a, b, c, params->ith, params->nth, p.userdata);
}
// ggml_compute_forward_cross_entropy_loss
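// loss = -1/nr * sum over rows of sum_i src1[i] * log(softmax(src0)[i]), with the softmax rescaled from [0..1] to [eps..1] to avoid log(0);
// per-thread partial sums are accumulated in params->wdata and reduced into dst during FINALIZE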
static void ggml_compute_forward_cross_entropy_loss_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
GGML_ASSERT(ggml_is_scalar(dst));
GGML_ASSERT(ggml_are_same_shape(src0, src1));
const int ith = params->ith;
const int nth = params->nth;
float * sums = (float *) params->wdata;
// TODO: handle transposed/permuted matrices
const int nc = src0->ne[0];
const int nr = ggml_nrows(src0);
GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
if (params->type == GGML_TASK_TYPE_INIT) {
if (ith == 0) {
memset(sums, 0, sizeof(float) * (nth + nth * nc));
}
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
if (ith == 0) {
float * dp = (float *) dst->data;
ggml_vec_sum_f32(nth, dp, sums);
dp[0] *= -1.0f / (float) nr;
}
return;
}
const double eps = 1e-9;
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int i1 = ir0; i1 < ir1; i1++) {
float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
float * st = ((float *) params->wdata) + nth + ith*nc;
#ifndef NDEBUG
for (int i = 0; i < nc; ++i) {
//printf("p[%d] = %f\n", i, p[i]);
assert(!isnan(s0[i]));
assert(!isnan(s1[i]));
}
#endif
// soft_max
ggml_float sum = 0.0;
{
float max = -INFINITY;
ggml_vec_max_f32(nc, &max, s0);
uint16_t scvt; UNUSED(scvt);
for (int i = 0; i < nc; i++) {
if (s0[i] == -INFINITY) {
st[i] = 0.0f;
} else {
#ifndef GGML_CROSS_ENTROPY_EXP_FP16
const float s = s0[i] - max;
const float val = expf(s);
#else
ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
memcpy(&scvt, &s, sizeof(scvt));
const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
#endif
sum += (ggml_float)val;
st[i] = val;
}
}
assert(sum > 0.0);
// sum = 1.0/sum;
}
// avoid log(0) by rescaling from [0..1] to [eps..1]
sum = (1.0 - eps) / sum;
ggml_vec_scale_f32(nc, st, sum);
ggml_vec_add1_f32(nc, st, st, eps);
ggml_vec_log_f32(nc, st, st);
ggml_vec_mul_f32(nc, st, st, s1);
float st_sum = 0;
ggml_vec_sum_f32(nc, &st_sum, st);
sums[ith] += st_sum;
#ifndef NDEBUG
for (int i = 0; i < nc; ++i) {
assert(!isnan(st[i]));
assert(!isinf(st[i]));
}
#endif
}
}
static void ggml_compute_forward_cross_entropy_loss(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_cross_entropy_loss_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_cross_entropy_loss_back
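// backward pass of the above: writes grad(src0) into dst from softmax(src0), the target distribution src1 and the incoming loss gradient opt0
// (the exact formula is given in the comment inside the row loop below)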
static void ggml_compute_forward_cross_entropy_loss_back_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
const struct ggml_tensor * opt0 = dst->src[2];
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
GGML_ASSERT(ggml_is_contiguous(opt0));
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
const int64_t ith = params->ith;
const int64_t nth = params->nth;
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
const double eps = 1e-9;
// TODO: handle transposed/permuted matrices
const int64_t nc = src0->ne[0];
const int64_t nr = ggml_nrows(src0);
// rows per thread
const int64_t dr = (nr + nth - 1)/nth;
// row range for this thread
const int64_t ir0 = dr*ith;
const int64_t ir1 = MIN(ir0 + dr, nr);
float * d = (float *) opt0->data;
for (int64_t i1 = ir0; i1 < ir1; i1++) {
float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
#ifndef NDEBUG
for (int i = 0; i < nc; ++i) {
//printf("p[%d] = %f\n", i, p[i]);
assert(!isnan(s0[i]));
assert(!isnan(s1[i]));
}
#endif
// soft_max
ggml_float sum = 0.0;
{
float max = -INFINITY;
ggml_vec_max_f32(nc, &max, s0);
uint16_t scvt; UNUSED(scvt);
for (int i = 0; i < nc; i++) {
if (s0[i] == -INFINITY) {
ds0[i] = 0.0f;
} else {
#ifndef GGML_CROSS_ENTROPY_EXP_FP16
const float s = s0[i] - max;
const float val = expf(s);
#else
ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
memcpy(&scvt, &s, sizeof(scvt));
const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
#endif
sum += (ggml_float)val;
ds0[i] = val;
}
}
assert(sum > 0.0);
sum = (1.0 - eps)/sum;
}
// grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
ggml_vec_scale_f32(nc, ds0, sum);
ggml_vec_add1_f32(nc, ds0, ds0, eps);
ggml_vec_sub_f32(nc, ds0, ds0, s1);
ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
#ifndef NDEBUG
for (int i = 0; i < nc; ++i) {
assert(!isnan(ds0[i]));
assert(!isinf(ds0[i]));
}
#endif
}
}
static void ggml_compute_forward_cross_entropy_loss_back(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_cross_entropy_loss_back_f32(params, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
/////////////////////////////////
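// dispatches a single graph node to the CPU kernel implementing its op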
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
GGML_ASSERT(params);
if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
return;
}
#if defined(GGML_USE_VULKAN)
const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
#ifdef GGML_VULKAN_CHECK_RESULTS
if (skip_cpu) {
ggml_vk_check_results_1_cpu_assist(params, tensor);
}
#endif
if (skip_cpu) {
return;
}
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
#endif // GGML_USE_VULKAN
switch (tensor->op) {
case GGML_OP_DUP:
{
ggml_compute_forward_dup(params, tensor);
} break;
case GGML_OP_ADD:
{
ggml_compute_forward_add(params, tensor);
} break;
case GGML_OP_ADD1:
{
ggml_compute_forward_add1(params, tensor);
} break;
case GGML_OP_ACC:
{
ggml_compute_forward_acc(params, tensor);
} break;
case GGML_OP_SUB:
{
ggml_compute_forward_sub(params, tensor);
} break;
case GGML_OP_MUL:
{
ggml_compute_forward_mul(params, tensor);
} break;
case GGML_OP_DIV:
{
ggml_compute_forward_div(params, tensor);
} break;
case GGML_OP_SQR:
{
ggml_compute_forward_sqr(params, tensor);
} break;
case GGML_OP_SQRT:
{
ggml_compute_forward_sqrt(params, tensor);
} break;
case GGML_OP_LOG:
{
ggml_compute_forward_log(params, tensor);
} break;
case GGML_OP_SUM:
{
ggml_compute_forward_sum(params, tensor);
} break;
case GGML_OP_SUM_ROWS:
{
ggml_compute_forward_sum_rows(params, tensor);
} break;
case GGML_OP_MEAN:
{
ggml_compute_forward_mean(params, tensor);
} break;
case GGML_OP_ARGMAX:
{
ggml_compute_forward_argmax(params, tensor);
} break;
case GGML_OP_REPEAT:
{
ggml_compute_forward_repeat(params, tensor);
} break;
case GGML_OP_REPEAT_BACK:
{
ggml_compute_forward_repeat_back(params, tensor);
} break;
case GGML_OP_CONCAT:
{
ggml_compute_forward_concat(params, tensor);
} break;
case GGML_OP_SILU_BACK:
{
ggml_compute_forward_silu_back(params, tensor);
} break;
case GGML_OP_NORM:
{
ggml_compute_forward_norm(params, tensor);
} break;
case GGML_OP_RMS_NORM:
{
ggml_compute_forward_rms_norm(params, tensor);
} break;
case GGML_OP_RMS_NORM_BACK:
{
ggml_compute_forward_rms_norm_back(params, tensor);
} break;
case GGML_OP_GROUP_NORM:
{
ggml_compute_forward_group_norm(params, tensor);
} break;
case GGML_OP_MUL_MAT:
{
ggml_compute_forward_mul_mat(params, tensor);
} break;
case GGML_OP_MUL_MAT_ID:
{
ggml_compute_forward_mul_mat_id(params, tensor);
} break;
case GGML_OP_OUT_PROD:
{
ggml_compute_forward_out_prod(params, tensor);
} break;
case GGML_OP_SCALE:
{
ggml_compute_forward_scale(params, tensor);
} break;
case GGML_OP_SET:
{
ggml_compute_forward_set(params, tensor);
} break;
case GGML_OP_CPY:
{
ggml_compute_forward_cpy(params, tensor);
} break;
case GGML_OP_CONT:
{
ggml_compute_forward_cont(params, tensor);
} break;
case GGML_OP_RESHAPE:
{
ggml_compute_forward_reshape(params, tensor);
} break;
case GGML_OP_VIEW:
{
ggml_compute_forward_view(params, tensor);
} break;
case GGML_OP_PERMUTE:
{
ggml_compute_forward_permute(params, tensor);
} break;
case GGML_OP_TRANSPOSE:
{
ggml_compute_forward_transpose(params, tensor);
} break;
case GGML_OP_GET_ROWS:
{
ggml_compute_forward_get_rows(params, tensor);
} break;
case GGML_OP_GET_ROWS_BACK:
{
ggml_compute_forward_get_rows_back(params, tensor);
} break;
case GGML_OP_DIAG:
{
ggml_compute_forward_diag(params, tensor);
} break;
case GGML_OP_DIAG_MASK_INF:
{
ggml_compute_forward_diag_mask_inf(params, tensor);
} break;
case GGML_OP_DIAG_MASK_ZERO:
{
ggml_compute_forward_diag_mask_zero(params, tensor);
} break;
case GGML_OP_SOFT_MAX:
{
ggml_compute_forward_soft_max(params, tensor);
} break;
case GGML_OP_SOFT_MAX_BACK:
{
ggml_compute_forward_soft_max_back(params, tensor);
} break;
case GGML_OP_ROPE:
{
ggml_compute_forward_rope(params, tensor);
} break;
case GGML_OP_ROPE_BACK:
{
ggml_compute_forward_rope_back(params, tensor);
} break;
case GGML_OP_ALIBI:
{
ggml_compute_forward_alibi(params, tensor);
} break;
case GGML_OP_CLAMP:
{
ggml_compute_forward_clamp(params, tensor);
} break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
ggml_compute_forward_conv_transpose_1d(params, tensor);
} break;
case GGML_OP_IM2COL:
{
ggml_compute_forward_im2col(params, tensor);
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
ggml_compute_forward_conv_transpose_2d(params, tensor);
} break;
case GGML_OP_POOL_1D:
{
ggml_compute_forward_pool_1d(params, tensor);
} break;
case GGML_OP_POOL_2D:
{
ggml_compute_forward_pool_2d(params, tensor);
} break;
case GGML_OP_UPSCALE:
{
ggml_compute_forward_upscale(params, tensor);
} break;
case GGML_OP_PAD:
{
ggml_compute_forward_pad(params, tensor);
} break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
} break;
case GGML_OP_TIMESTEP_EMBEDDING:
{
ggml_compute_forward_timestep_embedding(params, tensor);
} break;
case GGML_OP_ARGSORT:
{
ggml_compute_forward_argsort(params, tensor);
} break;
case GGML_OP_LEAKY_RELU:
{
ggml_compute_forward_leaky_relu(params, tensor);
} break;
case GGML_OP_FLASH_ATTN:
{
const int32_t t = ggml_get_op_params_i32(tensor, 0);
GGML_ASSERT(t == 0 || t == 1);
const bool masked = t != 0;
ggml_compute_forward_flash_attn(params, masked, tensor);
} break;
case GGML_OP_FLASH_FF:
{
ggml_compute_forward_flash_ff(params, tensor);
} break;
case GGML_OP_FLASH_ATTN_BACK:
{
int32_t t = ggml_get_op_params_i32(tensor, 0);
GGML_ASSERT(t == 0 || t == 1);
bool masked = t != 0;
ggml_compute_forward_flash_attn_back(params, masked, tensor);
} break;
case GGML_OP_SSM_CONV:
{
ggml_compute_forward_ssm_conv(params, tensor);
} break;
case GGML_OP_SSM_SCAN:
{
ggml_compute_forward_ssm_scan(params, tensor);
} break;
case GGML_OP_WIN_PART:
{
ggml_compute_forward_win_part(params, tensor);
} break;
case GGML_OP_WIN_UNPART:
{
ggml_compute_forward_win_unpart(params, tensor);
} break;
case GGML_OP_UNARY:
{
ggml_compute_forward_unary(params, tensor);
} break;
case GGML_OP_GET_REL_POS:
{
ggml_compute_forward_get_rel_pos(params, tensor);
} break;
case GGML_OP_ADD_REL_POS:
{
ggml_compute_forward_add_rel_pos(params, tensor);
} break;
case GGML_OP_MAP_UNARY:
{
ggml_unary_op_f32_t fun;
memcpy(&fun, tensor->op_params, sizeof(fun));
ggml_compute_forward_map_unary(params, tensor, fun);
}
break;
case GGML_OP_MAP_BINARY:
{
ggml_binary_op_f32_t fun;
memcpy(&fun, tensor->op_params, sizeof(fun));
ggml_compute_forward_map_binary(params, tensor, fun);
}
break;
case GGML_OP_MAP_CUSTOM1_F32:
{
ggml_custom1_op_f32_t fun;
memcpy(&fun, tensor->op_params, sizeof(fun));
ggml_compute_forward_map_custom1_f32(params, tensor, fun);
}
break;
case GGML_OP_MAP_CUSTOM2_F32:
{
ggml_custom2_op_f32_t fun;
memcpy(&fun, tensor->op_params, sizeof(fun));
ggml_compute_forward_map_custom2_f32(params, tensor, fun);
}
break;
case GGML_OP_MAP_CUSTOM3_F32:
{
ggml_custom3_op_f32_t fun;
memcpy(&fun, tensor->op_params, sizeof(fun));
ggml_compute_forward_map_custom3_f32(params, tensor, fun);
}
break;
case GGML_OP_MAP_CUSTOM1:
{
ggml_compute_forward_map_custom1(params, tensor);
}
break;
case GGML_OP_MAP_CUSTOM2:
{
ggml_compute_forward_map_custom2(params, tensor);
}
break;
case GGML_OP_MAP_CUSTOM3:
{
ggml_compute_forward_map_custom3(params, tensor);
}
break;
case GGML_OP_CROSS_ENTROPY_LOSS:
{
ggml_compute_forward_cross_entropy_loss(params, tensor);
}
break;
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
{
ggml_compute_forward_cross_entropy_loss_back(params, tensor);
}
break;
case GGML_OP_NONE:
{
// nop
} break;
case GGML_OP_COUNT:
{
GGML_ASSERT(false);
} break;
}
}
////////////////////////////////////////////////////////////////////////////////
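// small open-addressing hash set keyed on tensor pointers, used for graph bookkeeping (visited nodes, replacement maps, ...);
// rough usage sketch: ggml_hash_set_new(n), then ggml_hash_insert()/ggml_hash_contains() on tensor pointers, then free the keys array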
static size_t ggml_hash_size(size_t min_sz) {
// next primes after powers of two
static const size_t primes[] = {
2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
2053, 4099, 8209, 16411, 32771, 65537, 131101,
262147, 524309, 1048583, 2097169, 4194319, 8388617,
16777259, 33554467, 67108879, 134217757, 268435459,
536870923, 1073741827, 2147483659
};
static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);
// find the smallest prime that is larger or equal to min_sz
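// (binary search over the table; e.g. min_sz = 1000 -> 1031; sizes past the largest prime fall back to min_sz | 1, i.e. forced odd)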
size_t l = 0;
size_t r = n_primes;
while (l < r) {
size_t m = (l + r)/2;
if (primes[m] < min_sz) {
l = m + 1;
} else {
r = m;
}
}
size_t sz = l < n_primes ? primes[l] : min_sz | 1;
return sz;
}
static size_t ggml_hash(const void * p) {
return (size_t)p;
}
size_t ggml_hash_find(const struct ggml_hash_set hash_set, struct ggml_tensor * key) {
size_t h = ggml_hash(key) % hash_set.size;
// linear probing
size_t i = h;
while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) {
i = (i + 1) % hash_set.size;
if (i == h) {
// visited all hash table entries -> not found
return GGML_HASHTABLE_FULL;
}
}
return i;
}
bool ggml_hash_contains(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
size_t i = ggml_hash_find(hash_set, key);
return i != GGML_HASHTABLE_FULL && hash_set.keys[i] == key;
}
size_t ggml_hash_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
size_t i = ggml_hash_find(hash_set, key);
GGML_ASSERT(i != GGML_HASHTABLE_FULL);
if (hash_set.keys[i] == key) {
return GGML_HASHTABLE_ALREADY_EXISTS;
}
// insert
GGML_ASSERT(hash_set.keys[i] == NULL);
hash_set.keys[i] = key;
return i;
}
size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
size_t i = ggml_hash_find(hash_set, key);
GGML_ASSERT(i != GGML_HASHTABLE_FULL);
hash_set.keys[i] = key;
return i;
}
struct ggml_hash_set ggml_hash_set_new(size_t size) {
size = ggml_hash_size(size);
struct ggml_hash_set result;
result.size = size;
result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
return result;
}
static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
GGML_FREE(hash_set.keys);
}
struct hash_map {
struct ggml_hash_set set;
struct ggml_tensor ** vals;
};
static struct hash_map * ggml_new_hash_map(size_t size) {
struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map));
result->set = ggml_hash_set_new(size);
result->vals = GGML_MALLOC(sizeof(struct ggml_tensor *) * result->set.size);
memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
return result;
}
static void ggml_hash_map_free(struct hash_map * map) {
ggml_hash_set_free(map->set);
GGML_FREE(map->vals);
GGML_FREE(map);
}
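// hash_map is a minimal open-addressing map built on top of ggml_hash_set: keys live in
// set.keys[] and the value for the key stored at slot i lives in vals[i]. the gradient
// checkpointing pass below uses it to remember which original node has already been
// replaced by a recomputed clone.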
// gradient checkpointing
static struct ggml_tensor * ggml_recompute_graph_node(
struct ggml_context * ctx,
struct ggml_cgraph * graph,
struct hash_map * replacements,
struct ggml_tensor * node) {
if (node == NULL) {
return NULL;
}
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
return node;
}
if (!ggml_hash_contains(graph->visited_hash_table, node)) {
return node;
}
int count_children = 0;
for (int k = 0; k < GGML_MAX_SRC; ++k) {
if (node->src[k]) {
++count_children;
}
}
if (count_children == 0) {
return node;
}
size_t i = ggml_hash_find(replacements->set, node);
GGML_ASSERT(i != GGML_HASHTABLE_FULL); // assert that the table is not full
if (replacements->set.keys[i] == node) {
return replacements->vals[i];
}
struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne);
// insert clone into replacements
GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
replacements->set.keys[i] = node;
replacements->vals[i] = clone;
clone->op = node->op;
clone->grad = node->grad;
clone->flags = node->flags;
clone->extra = node->extra;
for (int k = 0; k < GGML_MAX_DIMS; ++k) {
clone->nb[k] = node->nb[k];
}
for (int k = 0; k < GGML_MAX_SRC; ++k) {
clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]);
}
if (node->view_src != NULL) {
clone->data = (node->view_src->data == NULL)
? NULL // view_src not yet allocated
: (char *) node->view_src->data // view_src already allocated
+ node->view_offs;
clone->view_src = node->view_src;
clone->view_offs = node->view_offs;
}
GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t)));
GGML_ASSERT(sizeof(node->name) == GGML_MAX_NAME);
memcpy(clone->op_params, node->op_params, sizeof(node->op_params));
ggml_format_name(clone, "%s (clone)", ggml_get_name(node));
return clone;
}
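// the function above rebuilds a node from its sources instead of reusing the value from
// the forward graph: parameters, nodes outside the graph and source-less leafs are
// returned unchanged, everything else is cloned with its sources replaced recursively.
// clones are memoized in `replacements`, so shared subgraphs are recomputed only once,
// and the recursion bottoms out at the checkpoints pre-inserted as identity mappings.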
void ggml_build_backward_gradient_checkpointing(
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
struct ggml_cgraph * gb_tmp,
struct ggml_tensor * * checkpoints,
int n_checkpoints) {
ggml_graph_cpy(gf, gb_tmp);
ggml_build_backward_expand(ctx, gf, gb_tmp, true);
if (n_checkpoints <= 0) {
ggml_graph_cpy(gb_tmp, gb);
return;
}
struct hash_map * replacements = ggml_new_hash_map(gf->n_nodes + gf->n_leafs + n_checkpoints);
// insert checkpoints in replacements
for (int i = 0; i < n_checkpoints; ++i) {
size_t k = ggml_hash_find(replacements->set, checkpoints[i]);
GGML_ASSERT(k != GGML_HASHTABLE_FULL); // assert that the table is not full
GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite
replacements->set.keys[k] = checkpoints[i];
replacements->vals[k] = checkpoints[i];
}
ggml_graph_cpy(gf, gb);
// rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes],
// replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]),
// by recomputing them from checkpoints
for (int i = gf->n_nodes; i<gb_tmp->n_nodes; ++i) {
struct ggml_tensor * node = gb_tmp->nodes[i];
for (int k = 0; k < GGML_MAX_SRC; ++k) {
// insert new tensors that recompute the src, reusing replacements that were already made;
// remember the replacements: map each corresponding gf node to its newly created tensor;
// recurse into the input tensors, terminating when an input already has a replacement
// (such as the checkpoints inserted above)
node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]);
}
// insert rewritten backward node with replacements made into resulting backward graph gb
ggml_build_forward_expand(gb, node);
}
ggml_hash_map_free(replacements);
}
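// rough usage sketch for the checkpointing entry point above (tensor names such as
// `loss` and `checkpoints` are hypothetical; all three graphs must be created with
// grads enabled):
//
//   struct ggml_cgraph * gf     = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
//   struct ggml_cgraph * gb     = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
//   struct ggml_cgraph * gb_tmp = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
//   ggml_build_forward_expand(gf, loss);
//   ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints, n_checkpoints);
//
// only the activations listed in `checkpoints` are kept from the forward pass; every other
// intermediate needed by the backward pass is recomputed from them when gb is evaluated.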
// functions to change gradients, handling the case where input a may still be the initial zero-valued gradient
static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
if (ggml_hash_contains(zero_table, a)) {
return b;
} else {
return ggml_add_impl(ctx, a, b, false);
}
}
static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
if (ggml_hash_contains(zero_table, a)) {
struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
} else {
return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
}
}
static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
if (ggml_hash_contains(zero_table, a)) {
return ggml_repeat(ctx, b, a);
} else {
return ggml_add1_impl(ctx, a, b, false);
}
}
static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
if (ggml_hash_contains(zero_table, a)) {
return ggml_neg(ctx, b);
} else {
return ggml_sub_impl(ctx, a, b, false);
}
}
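// the *_or_set helpers above exist because gradients start out as placeholder zero-valued
// tensors (collected in `zero_table` by ggml_build_backward_expand below). while `a` is
// still such a zero gradient, accumulating into it can be replaced by the new value
// directly (0 + b == b, 0 - b == -b), which keeps the dummy zero tensors out of the graph.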
static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
struct ggml_tensor * src0 = tensor->src[0];
struct ggml_tensor * src1 = tensor->src[1];
switch (tensor->op) {
case GGML_OP_DUP:
{
if (src0->grad) {
src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
}
} break;
case GGML_OP_ADD:
{
if (src0->grad) {
src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
}
if (src1->grad) {
src1->grad = ggml_add_or_set(ctx, src1->grad, tensor->grad, zero_table);
}
} break;
case GGML_OP_ADD1:
{
if (src0->grad) {
src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
}
if (src1->grad) {
src1->grad = ggml_add_or_set(ctx,
src1->grad,
ggml_mean(ctx, tensor->grad), // TODO: should probably be sum instead of mean
zero_table);
}
} break;
case GGML_OP_ACC:
{
if (src0->grad) {
src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
}
if (src1->grad) {
const size_t nb1 = ((int32_t *) tensor->op_params)[0];
const size_t nb2 = ((int32_t *) tensor->op_params)[1];
const size_t nb3 = ((int32_t *) tensor->op_params)[2];
const size_t offset = ((int32_t *) tensor->op_params)[3];
struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
tensor->grad,
src1->grad->ne[0],
src1->grad->ne[1],
src1->grad->ne[2],
src1->grad->ne[3],
nb1, nb2, nb3, offset);
src1->grad =
ggml_add_or_set(ctx,
src1->grad,
ggml_reshape(ctx,
ggml_cont(ctx, tensor_grad_view),
src1->grad),
zero_table);
}
} break;
case GGML_OP_SUB:
{
if (src0->grad) {
src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
}
if (src1->grad) {
src1->grad = ggml_sub_or_set(ctx, src1->grad, tensor->grad, zero_table);
}
} break;
case GGML_OP_MUL:
{
if (src0->grad) {
src0->grad =
ggml_add_or_set(ctx,
src0->grad,
ggml_mul(ctx, src1, tensor->grad),
zero_table);
}
if (src1->grad) {
src1->grad =
ggml_add_or_set(ctx,
src1->grad,
ggml_mul(ctx, src0, tensor->grad),
zero_table);
}
} break;
case GGML_OP_DIV:
{
if (src0->grad) {
src0->grad =
ggml_add_or_set(ctx,
src0->grad,
ggml_div(ctx, tensor->grad, src1),
zero_table);
}
if (src1->grad) {
src1->grad =
ggml_sub_or_set(ctx,
src1->grad,
ggml_mul(ctx,
tensor->grad,
ggml_div(ctx, tensor, src1)),
zero_table);
}
} break;
case GGML_OP_SQR:
{
if (src0->grad) {
src0->grad =
ggml_add_or_set(ctx,
src0->grad,
ggml_scale(ctx,
ggml_mul(ctx, src0, tensor->grad),
2.0f),
zero_table);
}
} break;
case GGML_OP_SQRT:
{
if (src0->grad) {
src0->grad =
ggml_add_or_set(ctx,
src0->grad,
ggml_scale(ctx,
ggml_div(ctx,
tensor->grad,
tensor),
0.5f),
zero_table);
}
} break;
case GGML_OP_LOG:
{
if (src0->grad) {
src0->grad =
ggml_add_or_set(ctx,
src0->grad,
ggml_div(ctx,
tensor->grad,
src0),
zero_table);
}
} break;
case GGML_OP_SUM:
{
if (src0->grad) {
src0->grad =
ggml_add1_or_set(ctx,
src0->grad,
tensor->grad,
zero_table);
}
} break;
case GGML_OP_SUM_ROWS:
{
if (src0->grad) {
src0->grad =
ggml_add_or_set(ctx,
src0->grad,
ggml_repeat(ctx,
tensor->grad,
src0->grad),
zero_table);
}
} break;
case GGML_OP_MEAN:
case GGML_OP_ARGMAX:
{
GGML_ASSERT(false); // TODO: implement
} break;
case GGML_OP_REPEAT:
{
// necessary for llama
if (src0->grad) {
src0->grad = ggml_add_or_set(ctx,
src0->grad,
ggml_repeat_back(ctx, tensor->grad, src0->grad),
zero_table);
}
} break;
case GGML_OP_REPEAT_BACK:
{
if (src0->grad) {
// TODO: test this
src0->grad = ggml_add_or_set(ctx,
src0->grad,
ggml_repeat(ctx, tensor->grad, src0->grad),
zero_table);
}
} break;
case GGML_OP_CONCAT:
{
GGML_ASSERT(false); // TODO: implement
} break;
case GGML_OP_SILU_BACK:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_NORM:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_RMS_NORM:
{
// necessary for llama
if (src0->grad) {
float eps;
memcpy(&eps, tensor->op_params, sizeof(float));
src0->grad = ggml_add_or_set(ctx,
src0->grad,
ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
zero_table);
}
} break;
case GGML_OP_RMS_NORM_BACK:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_GROUP_NORM:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_MUL_MAT:
{
// https://cs231n.github.io/optimization-2/#staged
// # forward pass
// s0 = np.random.randn(5, 10)
// s1 = np.random.randn(10, 3)
// t = s0.dot(s1)
// # now suppose we had the gradient on t from above in the circuit
// dt = np.random.randn(*t.shape) # same shape as t
// ds0 = dt.dot(s1.T) #.T gives the transpose of the matrix
// ds1 = s0.T.dot(dt)
// tensor.shape [m,p,qq,rr]
// src0.shape [n,m,q1,r1]
// src1.shape [n,p,qq,rr]
// necessary for llama
if (src0->grad) {
struct ggml_tensor * s1_tg =
ggml_out_prod(ctx, // [n,m,qq,rr]
src1, // [n,p,qq,rr]
tensor->grad); // [m,p,qq,rr]
const int64_t qq = s1_tg->ne[2];
const int64_t rr = s1_tg->ne[3];
const int64_t q1 = src0->ne[2];
const int64_t r1 = src0->ne[3];
const bool ne2_broadcasted = qq > q1;
const bool ne3_broadcasted = rr > r1;
if (ne2_broadcasted || ne3_broadcasted) {
// sum broadcast repetitions of s1_tg into shape of src0
s1_tg = ggml_repeat_back(ctx, s1_tg, src0);
}
src0->grad =
ggml_add_or_set(ctx,
src0->grad, // [n,m,q1,r1]
s1_tg, // [n,m,q1,r1]
zero_table);
}
if (src1->grad) {
src1->grad =
ggml_add_or_set(ctx,
src1->grad, // [n,p,qq,rr]
// ggml_mul_mat(ctx, // [n,p,qq,rr]
// ggml_cont(ctx, // [m,n,q1,r1]
// ggml_transpose(ctx, src0)), // [m,n,q1,r1]
// tensor->grad), // [m,p,qq,rr]
// // when src0 is bigger than tensor->grad (this is mostly the case in llama),
// // avoid transpose of src0, rather transpose smaller tensor->grad
// // and then use ggml_out_prod
ggml_out_prod(ctx, // [n,p,qq,rr]
src0, // [n,m,q1,r1]
ggml_transpose(ctx, // [p,m,qq,rr]
tensor->grad)), // [m,p,qq,rr]
zero_table);
}
} break;
case GGML_OP_MUL_MAT_ID:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_OUT_PROD:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_SCALE:
{
// necessary for llama
if (src0->grad) {
float s;
memcpy(&s, tensor->op_params, sizeof(float));
src0->grad =
ggml_add_or_set(ctx,
src0->grad,
ggml_scale_impl(ctx, tensor->grad, s, false),
zero_table);
}
} break;
case GGML_OP_SET:
{
const size_t nb1 = ((int32_t *) tensor->op_params)[0];
const size_t nb2 = ((int32_t *) tensor->op_params)[1];
const size_t nb3 = ((int32_t *) tensor->op_params)[2];
const size_t offset = ((int32_t *) tensor->op_params)[3];
struct ggml_tensor * tensor_grad_view = NULL;
if (src0->grad || src1->grad) {
GGML_ASSERT(src0->type == tensor->type);
GGML_ASSERT(tensor->grad->type == tensor->type);
GGML_ASSERT(tensor->grad->type == src1->grad->type);
tensor_grad_view = ggml_view_4d(ctx,
tensor->grad,
src1->grad->ne[0],
src1->grad->ne[1],
src1->grad->ne[2],
src1->grad->ne[3],
nb1, nb2, nb3, offset);
}
if (src0->grad) {
src0->grad = ggml_add_or_set(ctx,
src0->grad,
ggml_acc_impl(ctx,
tensor->grad,
ggml_neg(ctx, tensor_grad_view),
nb1, nb2, nb3, offset, false),
zero_table);
}
if (src1->grad) {
src1->grad =
ggml_add_or_set(ctx,
src1->grad,
ggml_reshape(ctx,
ggml_cont(ctx, tensor_grad_view),
src1->grad),
zero_table);
}
} break;
case GGML_OP_CPY:
{
// necessary for llama
// cpy overwrites the value of src1 with src0 and returns view(src1)
// the overwriting is mathematically equivalent to:
// tensor = src0 * 1 + src1 * 0
if (src0->grad) {
// dsrc0 = dtensor * 1
src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
}
if (src1->grad) {
// dsrc1 = dtensor * 0 -> noop
}
} break;
case GGML_OP_CONT:
{
// same as cpy
if (src0->grad) {
GGML_ASSERT(ggml_is_contiguous(src0->grad));
GGML_ASSERT(ggml_is_contiguous(tensor->grad));
src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
}
} break;
case GGML_OP_RESHAPE:
{
// necessary for llama
if (src0->grad) {
src0->grad =
ggml_add_or_set(ctx, src0->grad,
ggml_reshape(ctx,
ggml_is_contiguous(tensor->grad)
? tensor->grad
: ggml_cont(ctx, tensor->grad),
src0->grad),
zero_table);
}
} break;
case GGML_OP_VIEW:
{
// necessary for llama
if (src0->grad) {
size_t offset;
memcpy(&offset, tensor->op_params, sizeof(offset));
size_t nb1 = tensor->nb[1];
size_t nb2 = tensor->nb[2];
size_t nb3 = tensor->nb[3];
if (src0->type != src0->grad->type) {
// gradient is typically F32, but src0 could be other type
size_t ng = ggml_element_size(src0->grad);
size_t n0 = ggml_element_size(src0);
GGML_ASSERT(offset % n0 == 0);
GGML_ASSERT(nb1 % n0 == 0);
GGML_ASSERT(nb2 % n0 == 0);
GGML_ASSERT(nb3 % n0 == 0);
offset = (offset / n0) * ng;
nb1 = (nb1 / n0) * ng;
nb2 = (nb2 / n0) * ng;
nb3 = (nb3 / n0) * ng;
}
src0->grad = ggml_acc_or_set(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, zero_table);
}
} break;
case GGML_OP_PERMUTE:
{
// necessary for llama
if (src0->grad) {
int32_t * axes = (int32_t *) tensor->op_params;
int axis0 = axes[0] & 0x3;
int axis1 = axes[1] & 0x3;
int axis2 = axes[2] & 0x3;
int axis3 = axes[3] & 0x3;
int axes_backward[4] = {0,0,0,0};
axes_backward[axis0] = 0;
axes_backward[axis1] = 1;
axes_backward[axis2] = 2;
axes_backward[axis3] = 3;
src0->grad =
ggml_add_or_set(ctx, src0->grad,
ggml_permute(ctx,
tensor->grad,
axes_backward[0],
axes_backward[1],
axes_backward[2],
axes_backward[3]),
zero_table);
}
} break;
case GGML_OP_TRANSPOSE:
{
// necessary for llama
if (src0->grad) {
src0->grad =
ggml_add_or_set(ctx, src0->grad,
ggml_transpose(ctx, tensor->grad),
zero_table);
}
} break;
case GGML_OP_GET_ROWS:
{
// necessary for llama (only for tokenizer)
if (src0->grad) {
src0->grad =
ggml_add_or_set(ctx, src0->grad,
// last ggml_get_rows_back argument src0->grad is only
// necessary to setup correct output shape
ggml_get_rows_back(ctx, tensor->grad, src1, src0->grad),
zero_table);
}
if (src1->grad) {
// noop
}
} break;
case GGML_OP_GET_ROWS_BACK:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_DIAG:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_DIAG_MASK_INF:
{
// necessary for llama
if (src0->grad) {
const int n_past = ((int32_t *) tensor->op_params)[0];
src0->grad =
ggml_add_or_set(ctx, src0->grad,
/* ggml_diag_mask_inf_impl() shouldn't be here */
/* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
zero_table);
}
} break;
case GGML_OP_DIAG_MASK_ZERO:
{
// necessary for llama
if (src0->grad) {
const int n_past = ((int32_t *) tensor->op_params)[0];
src0->grad =
ggml_add_or_set(ctx, src0->grad,
ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
zero_table);
}
} break;
case GGML_OP_SOFT_MAX:
{
// necessary for llama
if (src0->grad) {
src0->grad =
ggml_add_or_set(ctx, src0->grad,
ggml_soft_max_back(ctx, tensor->grad, tensor),
zero_table);
}
} break;
case GGML_OP_SOFT_MAX_BACK:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_ROPE:
{
// necessary for llama
if (src0->grad) {
//const int n_past = ((int32_t *) tensor->op_params)[0];
const int n_dims = ((int32_t *) tensor->op_params)[1];
const int mode = ((int32_t *) tensor->op_params)[2];
const int n_ctx = ((int32_t *) tensor->op_params)[3];
const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float));
memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
src0->grad = ggml_add_or_set(ctx,
src0->grad,
ggml_rope_back(ctx,
tensor->grad,
src1,
n_dims,
mode,
n_ctx,
n_orig_ctx,
freq_base,
freq_scale,
ext_factor,
attn_factor,
beta_fast,
beta_slow,
xpos_base,
xpos_down),
zero_table);
}
} break;
case GGML_OP_ROPE_BACK:
{
if (src0->grad) {
//const int n_past = ((int32_t *) tensor->op_params)[0];
const int n_dims = ((int32_t *) tensor->op_params)[1];
const int mode = ((int32_t *) tensor->op_params)[2];
const int n_ctx = ((int32_t *) tensor->op_params)[3];
const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float));
memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
src0->grad = ggml_add_or_set(ctx,
src0->grad,
ggml_rope_impl(ctx,
tensor->grad,
src1,
n_dims,
mode,
n_ctx,
n_orig_ctx,
freq_base,
freq_scale,
ext_factor,
attn_factor,
beta_fast,
beta_slow,
xpos_base,
xpos_down,
false),
zero_table);
}
} break;
case GGML_OP_ALIBI:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_CLAMP:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_IM2COL:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_POOL_1D:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_POOL_2D:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_UPSCALE:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_PAD:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_ARANGE:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_TIMESTEP_EMBEDDING:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_ARGSORT:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_LEAKY_RELU:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_FLASH_ATTN:
{
struct ggml_tensor * flash_grad = NULL;
if (src0->grad || src1->grad || tensor->src[2]->grad) {
int32_t t = ggml_get_op_params_i32(tensor, 0);
GGML_ASSERT(t == 0 || t == 1);
bool masked = t != 0;
flash_grad =
ggml_flash_attn_back(ctx,
src0,
src1,
tensor->src[2],
tensor->grad,
masked);
}
struct ggml_tensor * src2 = tensor->src[2];
const int64_t elem_q = ggml_nelements(src0);
const int64_t elem_k = ggml_nelements(src1);
const int64_t elem_v = ggml_nelements(src2);
enum ggml_type result_type = flash_grad->type;
GGML_ASSERT(ggml_blck_size(result_type) == 1);
const size_t tsize = ggml_type_size(result_type);
const size_t offs_q = 0;
const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN);
const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN);
if (src0->grad) {
struct ggml_tensor * view_q = ggml_view_1d(ctx, flash_grad, elem_q, offs_q);
struct ggml_tensor * grad_q = ggml_reshape(ctx, view_q, src0);
src0->grad = ggml_add_or_set(ctx,
src0->grad,
grad_q,
zero_table);
}
if (src1->grad) {
struct ggml_tensor * view_k = ggml_view_1d(ctx, flash_grad, elem_k, offs_k);
struct ggml_tensor * grad_k = ggml_reshape(ctx, view_k, src1);
src1->grad = ggml_add_or_set(ctx,
src1->grad,
grad_k,
zero_table);
}
if (src2->grad) {
struct ggml_tensor * view_v = ggml_view_1d(ctx, flash_grad, elem_v, offs_v);
struct ggml_tensor * grad_v = ggml_reshape(ctx, view_v, src2);
src2->grad = ggml_add_or_set(ctx,
src2->grad,
grad_v,
zero_table);
}
} break;
case GGML_OP_FLASH_FF:
{
GGML_ASSERT(false); // not supported
} break;
case GGML_OP_FLASH_ATTN_BACK:
{
GGML_ASSERT(false); // not supported
} break;
case GGML_OP_SSM_CONV:
case GGML_OP_SSM_SCAN:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_WIN_PART:
case GGML_OP_WIN_UNPART:
case GGML_OP_UNARY:
{
switch (ggml_get_unary_op(tensor)) {
case GGML_UNARY_OP_ABS:
{
if (src0->grad) {
src0->grad =
ggml_add_or_set(ctx,
src0->grad,
ggml_mul(ctx,
ggml_sgn(ctx, src0),
tensor->grad),
zero_table);
}
} break;
case GGML_UNARY_OP_SGN:
{
if (src0->grad) {
// noop
}
} break;
case GGML_UNARY_OP_NEG:
{
if (src0->grad) {
src0->grad = ggml_sub_or_set(ctx, src0->grad, tensor->grad, zero_table);
}
} break;
case GGML_UNARY_OP_STEP:
{
if (src0->grad) {
// noop
}
} break;
case GGML_UNARY_OP_TANH:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_UNARY_OP_ELU:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_UNARY_OP_RELU:
{
if (src0->grad) {
src0->grad = ggml_add_or_set(ctx,
src0->grad,
ggml_mul(ctx,
ggml_step(ctx, src0),
tensor->grad),
zero_table);
}
} break;
case GGML_UNARY_OP_GELU:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_UNARY_OP_GELU_QUICK:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_UNARY_OP_SILU:
{
// necessary for llama
if (src0->grad) {
src0->grad = ggml_add_or_set(ctx,
src0->grad,
ggml_silu_back(ctx, src0, tensor->grad),
zero_table);
}
} break;
default:
GGML_ASSERT(false);
}
} break;
case GGML_OP_GET_REL_POS:
case GGML_OP_ADD_REL_POS:
case GGML_OP_MAP_UNARY:
case GGML_OP_MAP_BINARY:
case GGML_OP_MAP_CUSTOM1_F32:
case GGML_OP_MAP_CUSTOM2_F32:
case GGML_OP_MAP_CUSTOM3_F32:
case GGML_OP_MAP_CUSTOM1:
case GGML_OP_MAP_CUSTOM2:
case GGML_OP_MAP_CUSTOM3:
{
GGML_ASSERT(false); // not supported
} break;
case GGML_OP_CROSS_ENTROPY_LOSS:
{
if (src0->grad) {
src0->grad = ggml_add_or_set(ctx,
src0->grad,
ggml_cross_entropy_loss_back(ctx,
src0,
src1,
tensor->grad),
zero_table);
}
} break;
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
{
GGML_ASSERT(false); // not supported
} break;
case GGML_OP_NONE:
{
// nop
} break;
case GGML_OP_COUNT:
{
GGML_ASSERT(false);
} break;
}
for (int i = 0; i < GGML_MAX_SRC; ++i) {
if (tensor->src[i] && tensor->src[i]->grad) {
GGML_ASSERT(ggml_are_same_shape(tensor->src[i], tensor->src[i]->grad));
}
}
}
static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
if (node->grad == NULL) {
// this usually happens when we generate intermediate nodes from constants in the backward pass
// it can also happen during forward pass, if the user performs computations with constants
if (node->op != GGML_OP_NONE) {
//GGML_PRINT_DEBUG("%s: warning: node %p has no grad, but op %d\n", __func__, (void *) node, node->op);
}
}
// check if already visited
if (ggml_hash_insert(cgraph->visited_hash_table, node) == GGML_HASHTABLE_ALREADY_EXISTS) {
return;
}
for (int i = 0; i < GGML_MAX_SRC; ++i) {
const int k =
(cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i :
(cgraph->order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? (GGML_MAX_SRC-1-i) :
/* unknown order, just fall back to using i */ i;
if (node->src[k]) {
ggml_visit_parents(cgraph, node->src[k]);
}
}
if (node->op == GGML_OP_NONE && node->grad == NULL) {
// reached a leaf node, not part of the gradient graph (e.g. a constant)
GGML_ASSERT(cgraph->n_leafs < cgraph->size);
if (strlen(node->name) == 0) {
ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
}
cgraph->leafs[cgraph->n_leafs] = node;
cgraph->n_leafs++;
} else {
GGML_ASSERT(cgraph->n_nodes < cgraph->size);
if (strlen(node->name) == 0) {
ggml_format_name(node, "node_%d", cgraph->n_nodes);
}
cgraph->nodes[cgraph->n_nodes] = node;
if (cgraph->grads) {
cgraph->grads[cgraph->n_nodes] = node->grad;
}
cgraph->n_nodes++;
}
}
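// ggml_visit_parents() above is a depth-first post-order traversal: a node's sources are
// pushed before the node itself, so cgraph->nodes ends up in a valid evaluation order
// (every operand precedes the operation that consumes it). the visited hash set keeps
// the traversal linear even when tensors are shared between branches.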
static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
if (!expand) {
// TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
ggml_graph_clear(cgraph);
}
const int n0 = cgraph->n_nodes;
UNUSED(n0);
ggml_visit_parents(cgraph, tensor);
const int n_new = cgraph->n_nodes - n0;
GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new);
if (n_new > 0) {
// the last added node should always be the starting point
GGML_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor);
}
}
void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
ggml_build_forward_impl(cgraph, tensor, true);
}
void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
GGML_ASSERT(gf->n_nodes > 0);
// if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
if (keep) {
for (int i = 0; i < gf->n_nodes; i++) {
struct ggml_tensor * node = gf->nodes[i];
if (node->grad) {
node->grad = ggml_dup_tensor(ctx, node);
gf->grads[i] = node->grad;
}
}
}
// remember original gradients which start with zero values
struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size);
for (int i = 0; i < gf->n_nodes; i++) {
if (gf->grads[i]) {
ggml_hash_insert(zero_table, gf->grads[i]);
}
}
for (int i = gf->n_nodes - 1; i >= 0; i--) {
struct ggml_tensor * node = gf->nodes[i];
// inplace operations to add gradients are not created by ggml_compute_backward
// use allocator to automatically make inplace operations
if (node->grad) {
ggml_compute_backward(ctx, node, zero_table);
}
}
for (int i = 0; i < gf->n_nodes; i++) {
struct ggml_tensor * node = gf->nodes[i];
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
ggml_build_forward_expand(gb, node->grad);
}
}
ggml_hash_set_free(zero_table);
}
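// in short, building the backward graph above has three steps: (1) optionally duplicate
// the gradient tensors so gf keeps its own copies, (2) record in `zero_table` which
// gradients are still the initial zero placeholders, and (3) walk the forward nodes in
// reverse so ggml_compute_backward() can accumulate into the src gradients, finally
// expanding gb from the gradients of the parameter nodes.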
static size_t ggml_graph_nbytes(size_t size, bool grads) {
size_t nbytes = sizeof(struct ggml_cgraph);
nbytes += size * sizeof(struct ggml_tensor *) * 2; // leafs + nodes
if (grads) {
nbytes += size * sizeof(struct ggml_tensor *); // grads
}
nbytes += ggml_hash_size(size * 2) * sizeof(struct ggml_tensor *); // hash set
return nbytes;
}
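// everything for a graph lives in one contiguous allocation: the struct itself, the node
// and leaf pointer arrays, optionally a grads array, and a visited hash set sized to
// ggml_hash_size(2*size); ggml_graph_overhead_custom() below adds GGML_OBJECT_SIZE and
// pads the total to GGML_MEM_ALIGN.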
size_t ggml_graph_overhead_custom(size_t size, bool grads) {
return GGML_OBJECT_SIZE + GGML_PAD(ggml_graph_nbytes(size, grads), GGML_MEM_ALIGN);
}
size_t ggml_graph_overhead(void) {
return ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, false);
}
struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
const size_t obj_size = ggml_graph_nbytes(size, grads);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size);
struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
size_t hash_size = ggml_hash_size(size * 2);
struct ggml_tensor ** nodes_ptr = data_start;
struct ggml_tensor ** leafs_ptr = nodes_ptr + size;
struct ggml_tensor ** hash_keys_ptr = leafs_ptr + size;
struct ggml_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL;
// check that we allocated the correct amount of memory
assert(obj_size == (size_t) (
(grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph));
memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_tensor *));
*cgraph = (struct ggml_cgraph) {
/*.size =*/ size,
/*.n_nodes =*/ 0,
/*.n_leafs =*/ 0,
/*.nodes =*/ nodes_ptr,
/*.grads =*/ grads_ptr,
/*.leafs =*/ leafs_ptr,
/*.hash_table =*/ { hash_size, hash_keys_ptr },
/*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
/*.perf_runs =*/ 0,
/*.perf_cycles =*/ 0,
/*.perf_time_us =*/ 0,
};
return cgraph;
}
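// layout of the block initialized above (grads[] is present only when grads == true):
//
//   [ struct ggml_cgraph | nodes[size] | leafs[size] | hash_keys[hash_size] | grads[size] ]
//
// hash_size is ggml_hash_size(2*size), so the visited set stays sparse enough for the
// linear probing in ggml_hash_find() to terminate quickly.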
struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
}
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
struct ggml_cgraph cgraph = {
/*.size =*/ 0,
/*.n_nodes =*/ i1 - i0,
/*.n_leafs =*/ 0,
/*.nodes =*/ cgraph0->nodes + i0,
/*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
/*.leafs =*/ NULL,
/*.hash_table =*/ { 0, NULL },
/*.order =*/ cgraph0->order,
/*.perf_runs =*/ 0,
/*.perf_cycles =*/ 0,
/*.perf_time_us =*/ 0,
};
return cgraph;
}
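// note: ggml_graph_view() copies nothing - the returned struct aliases the nodes (and
// grads, if any) of cgraph0 for the range [i0, i1) and has no leafs or hash table, so it
// is meant for iterating/evaluating an existing range, not for building new nodes into.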
void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
GGML_ASSERT(dst->size >= src->n_leafs);
GGML_ASSERT(dst->size >= src->n_nodes);
GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size);
dst->n_leafs = src->n_leafs;
dst->n_nodes = src->n_nodes;
dst->order = src->order;
for (int i = 0; i < src->n_leafs; ++i) {
dst->leafs[i] = src->leafs[i];
}
for (int i = 0; i < src->n_nodes; ++i) {
dst->nodes[i] = src->nodes[i];
}
if (src->grads) {
GGML_ASSERT(dst->grads != NULL);
for (int i = 0; i < src->n_nodes; ++i) {
dst->grads[i] = src->grads[i];
}
}
for (size_t i = 0; i < src->visited_hash_table.size; ++i) {
if (src->visited_hash_table.keys[i]) {
ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]);
}
}
}
struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL);
ggml_graph_cpy(cgraph, result);
return result;
}
void ggml_graph_reset(struct ggml_cgraph * cgraph) {
GGML_ASSERT(cgraph->grads != NULL);
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * grad = cgraph->grads[i];
if (grad) {
ggml_set_zero(grad);
}
}
}
void ggml_graph_clear(struct ggml_cgraph * cgraph) {
cgraph->n_leafs = 0;
cgraph->n_nodes = 0;
memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
}
//
// thread data
//
// synchronization is done via busy loops
// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops
//
#ifdef __APPLE__
//#include <os/lock.h>
//
//typedef os_unfair_lock ggml_lock_t;
//
//#define ggml_lock_init(x) UNUSED(x)
//#define ggml_lock_destroy(x) UNUSED(x)
//#define ggml_lock_lock os_unfair_lock_lock
//#define ggml_lock_unlock os_unfair_lock_unlock
//
//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT
typedef int ggml_lock_t;
#define ggml_lock_init(x) UNUSED(x)
#define ggml_lock_destroy(x) UNUSED(x)
#define ggml_lock_lock(x) UNUSED(x)
#define ggml_lock_unlock(x) UNUSED(x)
#define GGML_LOCK_INITIALIZER 0
typedef pthread_t ggml_thread_t;
#define ggml_thread_create pthread_create
#define ggml_thread_join pthread_join
#else
//typedef pthread_spinlock_t ggml_lock_t;
//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE)
//#define ggml_lock_destroy pthread_spin_destroy
//#define ggml_lock_lock pthread_spin_lock
//#define ggml_lock_unlock pthread_spin_unlock
typedef int ggml_lock_t;
#define ggml_lock_init(x) UNUSED(x)
#define ggml_lock_destroy(x) UNUSED(x)
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
#define ggml_lock_lock(x) _mm_pause()
#else
#define ggml_lock_lock(x) UNUSED(x)
#endif
#define ggml_lock_unlock(x) UNUSED(x)
#define GGML_LOCK_INITIALIZER 0
typedef pthread_t ggml_thread_t;
#define ggml_thread_create pthread_create
#define ggml_thread_join pthread_join
#endif
// Android's libc implementation "bionic" does not support setting affinity
#if defined(__gnu_linux__)
static void set_numa_thread_affinity(int thread_n) {
if (!ggml_is_numa()) {
return;
}
int node_num;
int rv;
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
switch(g_state.numa.numa_strategy) {
case GGML_NUMA_STRATEGY_DISTRIBUTE:
            // spread threads round-robin across the available NUMA nodes
node_num = thread_n % g_state.numa.n_nodes;
break;
case GGML_NUMA_STRATEGY_ISOLATE:
// run thread on current_node
node_num = g_state.numa.current_node;
break;
case GGML_NUMA_STRATEGY_NUMACTL:
// use the cpuset that numactl gave us
rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
if (rv) {
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
}
return;
default:
return;
}
struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
CPU_ZERO_S(setsize, cpus);
for (size_t i = 0; i < node->n_cpus; ++i) {
CPU_SET_S(node->cpus[i], setsize, cpus);
}
rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
if (rv) {
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
}
CPU_FREE(cpus);
}
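// reset the calling thread's affinity mask so it may run on every CPU again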
static void clear_numa_thread_affinity(void) {
if (!ggml_is_numa()) {
return;
}
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
CPU_ZERO_S(setsize, cpus);
for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
CPU_SET_S(i, setsize, cpus);
}
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
if (rv) {
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
}
CPU_FREE(cpus);
}
#else
// TODO: Windows etc.
// (the linux implementation may also work on BSD, someone should test)
static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
static void clear_numa_thread_affinity(void) {}
#endif
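// Multi-threaded graph execution: a pool of n_threads workers shares one
// ggml_compute_state_shared and coordinates purely through the atomic fields
// n_active (threads still busy in the current phase), node_n (index of the
// graph node being processed) and node_task (INIT / COMPUTE / FINALIZE phase).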
struct ggml_compute_state_shared {
const struct ggml_cgraph * cgraph;
const struct ggml_cplan * cplan;
int64_t perf_node_start_cycles;
int64_t perf_node_start_time_us;
const int n_threads;
// synchronization primitives
atomic_int n_active; // num active threads
atomic_int node_n; // active graph node
atomic_int node_task; // active graph node task phase
ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
void * abort_callback_data;
};
struct ggml_compute_state {
ggml_thread_t thrd;
int ith;
struct ggml_compute_state_shared * shared;
enum ggml_status ec;
};
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
node->perf_runs++;
node->perf_cycles += cycles_cur;
node->perf_time_us += time_us_cur;
}
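// decide how many threads should cooperate on a given node: cheap or not yet
// parallelized ops run as a single task, while heavy ops (matrix
// multiplication, normalization, softmax, rope, ...) use all n_threads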
static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
int n_tasks = 0;
if (ggml_is_empty(node)) {
// no need to multi-thread a no-op
n_tasks = 1;
return n_tasks;
}
switch (node->op) {
case GGML_OP_CPY:
case GGML_OP_DUP:
case GGML_OP_ADD:
case GGML_OP_ADD1:
case GGML_OP_ACC:
{
n_tasks = n_threads;
} break;
case GGML_OP_SUB:
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_LOG:
case GGML_OP_SUM:
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
case GGML_OP_ARGMAX:
case GGML_OP_REPEAT:
case GGML_OP_REPEAT_BACK:
case GGML_OP_LEAKY_RELU:
{
n_tasks = 1;
} break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(node)) {
case GGML_UNARY_OP_ABS:
case GGML_UNARY_OP_SGN:
case GGML_UNARY_OP_NEG:
case GGML_UNARY_OP_STEP:
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_ELU:
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
{
n_tasks = 1;
} break;
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
{
n_tasks = n_threads;
} break;
default:
GGML_ASSERT(false);
}
break;
case GGML_OP_SILU_BACK:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_RMS_NORM_BACK:
case GGML_OP_GROUP_NORM:
case GGML_OP_CONCAT:
{
n_tasks = n_threads;
} break;
case GGML_OP_MUL_MAT:
{
n_tasks = n_threads;
// TODO: use different scheduling for different matrix sizes
//const int nr0 = ggml_nrows(node->src[0]);
//const int nr1 = ggml_nrows(node->src[1]);
//n_tasks = MIN(n_threads, MAX(1, nr0/128));
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
} break;
case GGML_OP_MUL_MAT_ID:
{
n_tasks = n_threads;
} break;
case GGML_OP_OUT_PROD:
{
n_tasks = n_threads;
} break;
case GGML_OP_GET_ROWS:
{
// FIXME: the cost of launching additional threads decreases performance with GPU offloading
//n_tasks = MIN(n_threads, ggml_nelements(node->src[1]));
n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1]));
} break;
case GGML_OP_SCALE:
case GGML_OP_SET:
case GGML_OP_CONT:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_GET_ROWS_BACK:
case GGML_OP_DIAG:
{
n_tasks = 1;
} break;
case GGML_OP_DIAG_MASK_ZERO:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX_BACK:
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK:
case GGML_OP_ADD_REL_POS:
{
n_tasks = n_threads;
} break;
case GGML_OP_ALIBI:
{
n_tasks = 1; //TODO
} break;
case GGML_OP_CLAMP:
{
n_tasks = 1; //TODO
} break;
case GGML_OP_SOFT_MAX:
{
n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
} break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
n_tasks = n_threads;
} break;
case GGML_OP_IM2COL:
{
n_tasks = n_threads;
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
n_tasks = n_threads;
} break;
case GGML_OP_POOL_1D:
case GGML_OP_POOL_2D:
{
n_tasks = 1;
} break;
case GGML_OP_UPSCALE:
{
n_tasks = n_threads;
} break;
case GGML_OP_PAD:
{
n_tasks = n_threads;
} break;
case GGML_OP_ARANGE:
{
n_tasks = n_threads;
} break;
case GGML_OP_TIMESTEP_EMBEDDING:
{
n_tasks = n_threads;
} break;
case GGML_OP_ARGSORT:
{
n_tasks = n_threads;
} break;
case GGML_OP_FLASH_ATTN:
{
n_tasks = n_threads;
} break;
case GGML_OP_FLASH_FF:
{
n_tasks = n_threads;
} break;
case GGML_OP_FLASH_ATTN_BACK:
{
n_tasks = n_threads;
} break;
case GGML_OP_SSM_CONV:
case GGML_OP_SSM_SCAN:
{
n_tasks = n_threads;
} break;
case GGML_OP_WIN_PART:
case GGML_OP_WIN_UNPART:
case GGML_OP_GET_REL_POS:
case GGML_OP_MAP_UNARY:
case GGML_OP_MAP_BINARY:
case GGML_OP_MAP_CUSTOM1_F32:
case GGML_OP_MAP_CUSTOM2_F32:
case GGML_OP_MAP_CUSTOM3_F32:
{
n_tasks = 1;
} break;
case GGML_OP_MAP_CUSTOM1:
{
struct ggml_map_custom1_op_params p;
memcpy(&p, node->op_params, sizeof(p));
if (p.n_tasks == GGML_N_TASKS_MAX) {
n_tasks = n_threads;
} else {
n_tasks = MIN(p.n_tasks, n_threads);
}
} break;
case GGML_OP_MAP_CUSTOM2:
{
struct ggml_map_custom2_op_params p;
memcpy(&p, node->op_params, sizeof(p));
if (p.n_tasks == GGML_N_TASKS_MAX) {
n_tasks = n_threads;
} else {
n_tasks = MIN(p.n_tasks, n_threads);
}
} break;
case GGML_OP_MAP_CUSTOM3:
{
struct ggml_map_custom3_op_params p;
memcpy(&p, node->op_params, sizeof(p));
if (p.n_tasks == GGML_N_TASKS_MAX) {
n_tasks = n_threads;
} else {
n_tasks = MIN(p.n_tasks, n_threads);
}
} break;
case GGML_OP_CROSS_ENTROPY_LOSS:
{
n_tasks = n_threads;
} break;
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
{
n_tasks = n_threads;
} break;
case GGML_OP_NONE:
{
n_tasks = 1;
} break;
case GGML_OP_COUNT:
{
GGML_ASSERT(false);
} break;
default:
{
fprintf(stderr, "%s: op not implemented: ", __func__);
if (node->op < GGML_OP_COUNT) {
fprintf(stderr, "%s\n", ggml_op_name(node->op));
} else {
fprintf(stderr, "%d\n", node->op);
}
GGML_ASSERT(false);
} break;
}
assert(n_tasks > 0);
return n_tasks;
}
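// spin-wait helpers: each thread busy-waits (optionally yielding the CPU)
// until another thread advances the shared node index or task phase past the
// value it last observed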
static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
// wait for other threads to finish
const int last_node_n = * node_n;
while (true) {
if (do_yield) {
sched_yield();
}
* node_n = atomic_load(&state->shared->node_n);
if (* node_n != last_node_n) break;
}
}
static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
// wait for other threads to finish
const int last_task_phase = * task_phase;
while (true) {
if (do_yield) {
sched_yield();
}
* task_phase = atomic_load(&state->shared->node_task);
if (* task_phase != last_task_phase) break;
}
}
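// Worker thread entry point. The last thread to reach a synchronization point
// (atomic_fetch_sub(&n_active, 1) == 1) finalizes the previous node, selects
// the next one and publishes it through node_n/node_task; the other threads
// spin in the helpers above until the new values become visible.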
static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
const struct ggml_cgraph * cgraph = state->shared->cgraph;
const struct ggml_cplan * cplan = state->shared->cplan;
const int n_threads = state->shared->n_threads;
set_numa_thread_affinity(state->ith);
int node_n = -1;
int task_phase = GGML_TASK_TYPE_FINALIZE;
while (true) {
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
state->shared->node_n += 1;
state->ec = GGML_STATUS_ABORTED;
return 0;
}
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
// all other threads are finished and spinning
            // do finalize and init here so we don't have to synchronize again
struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_TYPE_FINALIZE,
/*.ith =*/ 0,
/*.nth =*/ 0,
/*.wsize =*/ cplan->work_size,
/*.wdata =*/ cplan->work_data,
};
if (node_n != -1) {
/* FINALIZE */
struct ggml_tensor * node = cgraph->nodes[node_n];
if (GGML_OP_HAS_FINALIZE[node->op]) {
params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
ggml_compute_forward(&params, node);
}
ggml_graph_compute_perf_stats_node(node, state->shared);
}
            // distribute new work, or execute it directly if there is only one task (1T)
while (++node_n < cgraph->n_nodes) {
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
struct ggml_tensor * node = cgraph->nodes[node_n];
const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
state->shared->perf_node_start_cycles = ggml_perf_cycles();
state->shared->perf_node_start_time_us = ggml_perf_time_us();
params.nth = n_tasks;
if (n_tasks == 1) {
/* INIT */
if (GGML_OP_HAS_INIT[node->op]) {
params.type = GGML_TASK_TYPE_INIT;
ggml_compute_forward(&params, node);
}
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
// they do something more efficient than spinning (?)
params.type = GGML_TASK_TYPE_COMPUTE;
ggml_compute_forward(&params, node);
if (GGML_OP_HAS_FINALIZE[node->op]) {
params.type = GGML_TASK_TYPE_FINALIZE;
ggml_compute_forward(&params, node);
}
ggml_graph_compute_perf_stats_node(node, state->shared);
} else {
break;
}
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
break;
}
}
task_phase = GGML_TASK_TYPE_INIT;
atomic_store(&state->shared->n_active, n_threads);
atomic_store(&state->shared->node_n, node_n);
atomic_store(&state->shared->node_task, task_phase);
} else {
ggml_graph_compute_thread_sync_node(&node_n, state, false);
ggml_graph_compute_thread_sync_task(&task_phase, state, false);
}
// check if we should stop
if (node_n >= cgraph->n_nodes) break;
/* INIT & COMPUTE */
struct ggml_tensor * node = cgraph->nodes[node_n];
const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_TYPE_INIT,
/*.ith =*/ state->ith,
/*.nth =*/ n_tasks,
/*.wsize =*/ cplan->work_size,
/*.wdata =*/ cplan->work_data,
};
if (state->ith < n_tasks) {
if (GGML_OP_HAS_INIT[node->op]) {
ggml_compute_forward(&params, node);
}
}
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
task_phase = GGML_TASK_TYPE_COMPUTE;
atomic_store(&state->shared->n_active, n_threads);
atomic_store(&state->shared->node_task, task_phase);
}
else {
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
// depending on the workload and the operating system.
// since it is not clear what is the best approach, it should potentially become user-configurable
// ref: https://github.com/ggerganov/ggml/issues/291
// UPD: adding the do_yield flag seems to resolve the issue universally
const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
}
if (state->ith < n_tasks) {
params.type = GGML_TASK_TYPE_COMPUTE;
ggml_compute_forward(&params, node);
}
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
task_phase = GGML_TASK_TYPE_FINALIZE;
atomic_store(&state->shared->n_active, n_threads);
atomic_store(&state->shared->node_task, task_phase);
}
else {
ggml_graph_compute_thread_sync_task(&task_phase, state, false);
}
}
return 0;
}
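// ggml_graph_plan() walks the graph once to pick the number of threads and to
// estimate the size of the scratch ("work") buffer; allocating work_data is
// left to the caller. A minimal usage sketch (error handling omitted, `gf` is
// assumed to be a previously built graph):
//
//     struct ggml_cplan cplan = ggml_graph_plan(gf, /*n_threads =*/ 4);
//     if (cplan.work_size > 0) {
//         cplan.work_data = malloc(cplan.work_size);
//     }
//     ggml_graph_compute(gf, &cplan);
//     free(cplan.work_data);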
struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
if (n_threads <= 0) {
n_threads = GGML_DEFAULT_N_THREADS;
}
size_t work_size = 0;
struct ggml_cplan cplan;
memset(&cplan, 0, sizeof(struct ggml_cplan));
int max_tasks = 1;
// thread scheduling for the different operations + work buffer size estimation
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
const int n_tasks = ggml_get_n_tasks(node, n_threads, 1);
max_tasks = MAX(max_tasks, n_tasks);
size_t cur = 0;
switch (node->op) {
case GGML_OP_CPY:
case GGML_OP_DUP:
{
if (ggml_is_quantized(node->type)) {
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
}
} break;
case GGML_OP_ADD:
case GGML_OP_ADD1:
{
if (ggml_is_quantized(node->src[0]->type)) {
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
}
} break;
case GGML_OP_ACC:
{
if (ggml_is_quantized(node->src[0]->type)) {
cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
}
} break;
case GGML_OP_MUL_MAT:
{
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
#if defined(GGML_USE_CLBLAST)
if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
} else
#endif
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node)) {
if (node->src[0]->type != GGML_TYPE_F32) {
// here we need memory for fully dequantized matrix from src0
// take into account that src0 can be broadcasted into src1[2,3]
cur = ggml_type_size(GGML_TYPE_F32)
* node->src[0]->ne[0]*node->src[0]->ne[1]
* node->src[1]->ne[2]*node->src[1]->ne[3];
}
} else
#endif
if (node->src[1]->type != vec_dot_type) {
cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
}
} break;
case GGML_OP_MUL_MAT_ID:
{
cur = 0;
const struct ggml_tensor * src0 = node->src[2];
const struct ggml_tensor * src1 = node->src[1];
const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
if (src1->type != vec_dot_type) {
cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
}
const int n_as = ggml_get_op_params_i32(node, 1);
cur += GGML_PAD(cur, sizeof(int64_t)); // align
cur += n_as * sizeof(int64_t); // matrix_row_counts
cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
} break;
case GGML_OP_OUT_PROD:
{
if (ggml_is_quantized(node->src[0]->type)) {
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
}
} break;
case GGML_OP_SOFT_MAX:
case GGML_OP_ROPE:
{
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
} break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
GGML_ASSERT(node->src[0]->ne[3] == 1);
GGML_ASSERT(node->src[1]->ne[2] == 1);
GGML_ASSERT(node->src[1]->ne[3] == 1);
const int64_t ne00 = node->src[0]->ne[0]; // K
const int64_t ne01 = node->src[0]->ne[1]; // Cout
const int64_t ne02 = node->src[0]->ne[2]; // Cin
const int64_t ne10 = node->src[1]->ne[0]; // L
const int64_t ne11 = node->src[1]->ne[1]; // Cin
if (node->src[0]->type == GGML_TYPE_F16 &&
node->src[1]->type == GGML_TYPE_F32) {
cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
cur += sizeof(ggml_fp16_t)*ne10*ne11;
} else if (node->src[0]->type == GGML_TYPE_F32 &&
node->src[1]->type == GGML_TYPE_F32) {
cur += sizeof(float)*ne00*ne01*ne02;
cur += sizeof(float)*ne10*ne11;
} else {
GGML_ASSERT(false);
}
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
const int64_t ne00 = node->src[0]->ne[0]; // W
const int64_t ne01 = node->src[0]->ne[1]; // H
const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
const int64_t ne03 = node->src[0]->ne[3]; // Channels In
const int64_t ne10 = node->src[1]->ne[0]; // W
const int64_t ne11 = node->src[1]->ne[1]; // H
const int64_t ne12 = node->src[1]->ne[2]; // Channels In
cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
} break;
case GGML_OP_FLASH_ATTN:
{
const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
if (node->src[1]->type == GGML_TYPE_F32) {
cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
} else if (node->src[1]->type == GGML_TYPE_F16) {
cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
}
} break;
case GGML_OP_FLASH_FF:
{
if (node->src[1]->type == GGML_TYPE_F32) {
cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
} else if (node->src[1]->type == GGML_TYPE_F16) {
cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
}
} break;
case GGML_OP_FLASH_ATTN_BACK:
{
const int64_t D = node->src[0]->ne[0];
const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
if (node->src[1]->type == GGML_TYPE_F32) {
cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
} else if (node->src[1]->type == GGML_TYPE_F16) {
cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
}
} break;
case GGML_OP_CROSS_ENTROPY_LOSS:
{
cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
} break;
case GGML_OP_COUNT:
{
GGML_ASSERT(false);
} break;
default:
break;
}
work_size = MAX(work_size, cur);
}
if (work_size > 0) {
work_size += CACHE_LINE_SIZE*(n_threads - 1);
}
cplan.n_threads = MIN(max_tasks, n_threads);
cplan.work_size = work_size;
cplan.work_data = NULL;
return cplan;
}
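// execute a graph according to the plan produced by ggml_graph_plan(); the
// asserts below require the caller to have filled in work_data whenever the
// plan requested a non-zero work_size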
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
{
GGML_ASSERT(cplan);
GGML_ASSERT(cplan->n_threads > 0);
if (cplan->work_size > 0) {
GGML_ASSERT(cplan->work_data);
}
}
#ifdef GGML_USE_VULKAN
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
}
ggml_vk_preallocate_buffers_cpu_assist();
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
}
#endif
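// NOTE: with GGML_USE_VULKAN, the block above runs the CPU-assist offload path:
// buffers are first sized per node, then allocated in a single pass, and the
// command buffers for the whole graph are pre-recorded before any CPU worker starts.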
const int n_threads = cplan->n_threads;
struct ggml_compute_state_shared state_shared = {
/*.cgraph =*/ cgraph,
/*.cgraph_plan =*/ cplan,
/*.perf_node_start_cycles =*/ 0,
/*.perf_node_start_time_us =*/ 0,
/*.n_threads =*/ n_threads,
/*.n_active =*/ n_threads,
/*.node_n =*/ -1,
/*.node_task =*/ GGML_TASK_TYPE_FINALIZE,
/*.abort_callback =*/ NULL,
/*.abort_callback_data =*/ NULL,
};
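// The atomic fields above coordinate the workers in ggml_graph_compute_thread:
// n_active counts threads still busy with the current node, while node_n and
// node_task track which node and which task phase (INIT/COMPUTE/FINALIZE) is in
// flight; node_n = -1 means no node has been dispatched yet.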
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
// create thread pool
if (n_threads > 1) {
for (int j = 1; j < n_threads; ++j) {
workers[j] = (struct ggml_compute_state) {
.thrd = 0,
.ith = j,
.shared = &state_shared,
.ec = GGML_STATUS_SUCCESS,
};
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
GGML_ASSERT(rc == 0);
UNUSED(rc);
}
}
workers[0].ith = 0;
workers[0].shared = &state_shared;
workers[0].ec = GGML_STATUS_SUCCESS;
const int64_t perf_start_cycles = ggml_perf_cycles();
const int64_t perf_start_time_us = ggml_perf_time_us();
// this is a work thread too
ggml_graph_compute_thread(&workers[0]);
enum ggml_status compute_status = workers[0].ec;
// don't leave affinity set on the main thread
clear_numa_thread_affinity();
// join or kill thread pool
if (n_threads > 1) {
for (int j = 1; j < n_threads; j++) {
const int rc = ggml_thread_join(workers[j].thrd, NULL);
GGML_ASSERT(rc == 0);
if (workers[j].ec != GGML_STATUS_SUCCESS)
compute_status = workers[j].ec;
}
}
#ifdef GGML_USE_VULKAN
ggml_vk_graph_cleanup_cpu_assist();
#endif
// performance stats (graph)
{
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us;
cgraph->perf_runs++;
cgraph->perf_cycles += perf_cycles_cur;
cgraph->perf_time_us += perf_time_us_cur;
GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n",
__func__, cgraph->perf_runs,
(double) perf_cycles_cur / (double) ggml_cycles_per_ms(),
(double) cgraph->perf_cycles / (double) ggml_cycles_per_ms() / (double) cgraph->perf_runs,
(double) perf_time_us_cur / 1000.0,
(double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs);
}
return compute_status;
}
enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
return ggml_graph_compute(cgraph, &cplan);
}
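// A minimal usage sketch for the helper above (illustrative only, error handling
// omitted; tensor sizes and the thread count are arbitrary examples):
//
//   struct ggml_init_params params = {
//       /*.mem_size   =*/ 16*1024*1024,
//       /*.mem_buffer =*/ NULL,
//       /*.no_alloc   =*/ false,
//   };
//   struct ggml_context * ctx = ggml_init(params);
//
//   struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
//   struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
//   struct ggml_tensor * c = ggml_add(ctx, a, b);
//
//   struct ggml_cgraph * gf = ggml_new_graph(ctx);
//   ggml_build_forward_expand(gf, c);
//
//   // ... fill a->data and b->data ...
//
//   ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 4);
//
//   ggml_free(ctx);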
struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
for (int i = 0; i < cgraph->n_leafs; i++) {
struct ggml_tensor * leaf = cgraph->leafs[i];
if (strcmp(leaf->name, name) == 0) {
return leaf;
}
}
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
if (strcmp(node->name, name) == 0) {
return node;
}
}
return NULL;
}
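// Illustrative lookup (the tensor name "embd" is a hypothetical example):
//
//   struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
//   if (embd == NULL) { /* no leaf or node carries that name */ }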
static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) {
const int64_t * ne = tensor->ne;
const size_t * nb = tensor->nb;
fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
ggml_type_name(tensor->type),
ggml_op_name (tensor->op),
ggml_n_dims(tensor),
ne[0], ne[1], ne[2], ne[3],
nb[0], nb[1], nb[2], nb[3],
tensor->data,
tensor->name);
}
static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
const int64_t * ne = tensor->ne;
const size_t * nb = tensor->nb;
fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
arg,
ggml_type_name(tensor->type),
ggml_op_name (tensor->op),
ggml_n_dims(tensor),
ne[0], ne[1], ne[2], ne[3],
nb[0], nb[1], nb[2], nb[3],
tensor->data,
tensor->name);
}
void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
uint64_t size_eval = 0;
// compute size of intermediate results
// TODO: does not take into account scratch buffers !!!!
for (int i = 0; i < cgraph->n_nodes; ++i) {
size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
}
// print
{
FILE * fout = stdout;
fprintf(fout, "\n");
fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
fprintf(fout, "%-16s %" PRIu64 "\n", "eval", size_eval);
// header
fprintf(fout, "\n");
fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n",
"TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME");
for (int i = 0; i < cgraph->n_leafs; ++i) {
ggml_graph_export_leaf(cgraph->leafs[i], fout);
GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE);
GGML_ASSERT(cgraph->leafs[i]->src[0] == NULL);
GGML_ASSERT(cgraph->leafs[i]->src[1] == NULL);
}
// header
fprintf(fout, "\n");
fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n",
"ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME");
for (int i = 0; i < cgraph->n_nodes; ++i) {
ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
for (int j = 0; j < GGML_MAX_SRC; ++j) {
if (cgraph->nodes[i]->src[j]) {
ggml_graph_export_node(cgraph->nodes[i]->src[j], "SRC", fout);
}
}
fprintf(fout, "\n");
}
fprintf(fout, "\n");
}
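// Layout of the binary file written below (fields are emitted with fwrite in host
// byte order):
//
//   header : magic (u32), version (u32), n_leafs (u32), n_nodes (u32), size_eval (u64)
//   leafs  : per tensor: type (u32), op (u32), { ne (u64), nb (u64) } x GGML_MAX_DIMS,
//            name (GGML_MAX_NAME bytes), op_params (GGML_MAX_OP_PARAMS bytes), raw data
//   nodes  : per tensor: the same fixed fields as a leaf (without raw data), followed by
//            GGML_MAX_SRC int32 source indices: leaf k -> k, node k -> n_leafs + k,
//            -1 for an unused slot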
// write binary data
{
FILE * fout = ggml_fopen(fname, "wb");
if (!fout) {
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
return;
}
// header
{
const uint32_t magic = GGML_FILE_MAGIC;
const uint32_t version = GGML_FILE_VERSION;
const uint32_t n_leafs = cgraph->n_leafs;
const uint32_t n_nodes = cgraph->n_nodes;
fwrite(&magic, sizeof(uint32_t), 1, fout);
fwrite(&version, sizeof(uint32_t), 1, fout);
fwrite(&n_leafs, sizeof(uint32_t), 1, fout);
fwrite(&n_nodes, sizeof(uint32_t), 1, fout);
fwrite(&size_eval, sizeof(uint64_t), 1, fout);
}
// leafs
{
for (int i = 0; i < cgraph->n_leafs; ++i) {
const struct ggml_tensor * tensor = cgraph->leafs[i];
const uint32_t type = tensor->type;
const uint32_t op = tensor->op;
fwrite(&type, sizeof(uint32_t), 1, fout);
fwrite(&op, sizeof(uint32_t), 1, fout);
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
const uint64_t ne = tensor->ne[j];
const uint64_t nb = tensor->nb[j];
fwrite(&ne, sizeof(uint64_t), 1, fout);
fwrite(&nb, sizeof(uint64_t), 1, fout);
}
fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
// dump the data
// TODO: pad this to 32 byte boundary
{
const size_t size = ggml_nbytes(tensor);
fwrite(tensor->data, sizeof(char), size, fout);
}
}
}
// nodes
{
for (int i = 0; i < cgraph->n_nodes; ++i) {
const struct ggml_tensor * tensor = cgraph->nodes[i];
const uint32_t type = tensor->type;
const uint32_t op = tensor->op;
fwrite(&type, sizeof(uint32_t), 1, fout);
fwrite(&op, sizeof(uint32_t), 1, fout);
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
const uint64_t ne = tensor->ne[j];
const uint64_t nb = tensor->nb[j];
fwrite(&ne, sizeof(uint64_t), 1, fout);
fwrite(&nb, sizeof(uint64_t), 1, fout);
}
fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
// output the op arguments
{
struct ggml_tensor * args[GGML_MAX_SRC] = { NULL };
for (int j = 0; j < GGML_MAX_SRC; ++j) {
args[j] = tensor->src[j];
}
for (int j = 0; j < GGML_MAX_SRC; ++j) {
if (args[j]) {
int32_t idx = -1;
// check if leaf
{
for (int k = 0; k < cgraph->n_leafs; ++k) {
if (args[j] == cgraph->leafs[k]) {
idx = k;
break;
}
}
}
// check if node
if (idx == -1) {
for (int k = 0; k < cgraph->n_nodes; ++k) {
if (args[j] == cgraph->nodes[k]) {
idx = cgraph->n_leafs + k;
break;
}
}
}
if (idx == -1) {
fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
fclose(fout);
return;
}
fwrite(&idx, sizeof(int32_t), 1, fout);
} else {
const int32_t nul = -1;
fwrite(&nul, sizeof(int32_t), 1, fout);
}
}
}
}
}
fclose(fout);
}
}
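// Illustrative call (the output file name is a hypothetical example); the text dump
// goes to stdout and the binary image to fname:
//
//   ggml_graph_export(gf, "graph.ggml");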
struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
assert(*ctx_data == NULL);
assert(*ctx_eval == NULL);
struct ggml_cgraph * result = NULL;
struct ggml_tensor * data = NULL;
// read file into data
{
FILE * fin = ggml_fopen(fname, "rb");
if (!fin) {
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
return result;
}
size_t fsize = 0;
fseek(fin, 0, SEEK_END);
fsize = ftell(fin);
fseek(fin, 0, SEEK_SET);
// create the data context
{
const size_t overhead = 1*ggml_tensor_overhead();
struct ggml_init_params params = {
.mem_size = fsize + overhead,
.mem_buffer = NULL,
.no_alloc = false,
};
*ctx_data = ggml_init(params);
if (!*ctx_data) {
fprintf(stderr, "%s: failed to create ggml context\n", __func__);
fclose(fin);
return result;
}
}
data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
{
const size_t ret = fread(data->data, sizeof(char), fsize, fin);
if (ret != fsize) {
fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
fclose(fin);
return result;
}
}
fclose(fin);
}
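// At this point the whole file image lives in a single GGML_TYPE_I8 tensor inside
// *ctx_data; the leaf tensors reconstructed below point straight into that buffer
// rather than copying their data.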
// populate result
{
char * ptr = (char *) data->data;
const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic);
if (magic != GGML_FILE_MAGIC) {
fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic);
return result;
}
const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version);
if (version != GGML_FILE_VERSION) {
fprintf(stderr, "%s: invalid version number\n", __func__);
return result;
}
const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
const int graph_size = MAX(n_leafs, n_nodes);
// create the data context
{
const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph_size, false);
struct ggml_init_params params = {
.mem_size = size_eval + overhead,
.mem_buffer = NULL,
.no_alloc = true,
};
*ctx_eval = ggml_init(params);
if (!*ctx_eval) {
fprintf(stderr, "%s: failed to create ggml context\n", __func__);
return result;
}
}
result = ggml_new_graph_custom(*ctx_eval, graph_size, false);
result->n_leafs = n_leafs;
result->n_nodes = n_nodes;
// leafs
{
uint32_t type;
uint32_t op;
for (uint32_t i = 0; i < n_leafs; ++i) {
type = *(const uint32_t *) ptr; ptr += sizeof(type);
op = *(const uint32_t *) ptr; ptr += sizeof(op);
int64_t ne[GGML_MAX_DIMS];
size_t nb[GGML_MAX_DIMS];
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
uint64_t ne_cur;
uint64_t nb_cur;
ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
ne[j] = ne_cur;
nb[j] = nb_cur;
}
struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
tensor->op = (enum ggml_op) op;
memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
memcpy(tensor->op_params, ptr, GGML_MAX_OP_PARAMS); ptr += GGML_MAX_OP_PARAMS;
tensor->data = (void *) ptr;
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
tensor->nb[j] = nb[j];
}
result->leafs[i] = tensor;
ptr += ggml_nbytes(tensor);
fprintf(stderr, "%s: loaded leaf %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
}
}
ggml_set_no_alloc(*ctx_eval, false);
// nodes
{
uint32_t type;
uint32_t op;
for (uint32_t i = 0; i < n_nodes; ++i) {
type = *(const uint32_t *) ptr; ptr += sizeof(type);
op = *(const uint32_t *) ptr; ptr += sizeof(op);
enum ggml_op eop = (enum ggml_op) op;
int64_t ne[GGML_MAX_DIMS];
size_t nb[GGML_MAX_DIMS];
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
uint64_t ne_cur;
uint64_t nb_cur;
ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
ne[j] = ne_cur;
nb[j] = nb_cur;
}
const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
const char * ptr_op_params = ptr; ptr += GGML_MAX_OP_PARAMS;
const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t);
struct ggml_tensor * args[GGML_MAX_SRC] = { NULL };
// parse args
for (int j = 0; j < GGML_MAX_SRC; ++j) {
const int32_t arg_idx = ptr_arg_idx[j];
if (arg_idx == -1) {
continue;
}
if (arg_idx < result->n_leafs) {
args[j] = result->leafs[arg_idx];
} else {
args[j] = result->nodes[arg_idx - result->n_leafs];
}
}
// create the tensor
// "view" operations are handled differently
// TODO: handle inplace ops - currently a copy is always made
struct ggml_tensor * tensor = NULL;
switch (eop) {
// TODO: implement other view ops
case GGML_OP_RESHAPE:
{
tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]);
} break;
case GGML_OP_VIEW:
{
tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
size_t offs;
memcpy(&offs, ptr_op_params, sizeof(offs));
tensor->data = ((char *) tensor->data) + offs;
} break;
case GGML_OP_TRANSPOSE:
{
tensor = ggml_transpose(*ctx_eval, args[0]);
} break;
case GGML_OP_PERMUTE:
{
tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
} break;
default:
{
tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
tensor->op = eop;
} break;
}
memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
memcpy(tensor->op_params, ptr_op_params, GGML_MAX_OP_PARAMS);
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
tensor->nb[j] = nb[j];
}
for (int j = 0; j < GGML_MAX_SRC; ++j) {
tensor->src[j] = args[j];
}
result->nodes[i] = tensor;
fprintf(stderr, "%s: loaded node %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
}
}
}
return result;
}
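// A minimal import sketch (illustrative; the file name is a hypothetical example).
// The returned graph is typically evaluated with ggml_graph_compute_with_ctx using
// a separate work context:
//
//   struct ggml_context * ctx_data = NULL;
//   struct ggml_context * ctx_eval = NULL;
//
//   struct ggml_cgraph * gf = ggml_graph_import("graph.ggml", &ctx_data, &ctx_eval);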
void ggml_graph_print(const struct ggml_cgraph * cgraph) {
int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
GGML_PRINT("=== GRAPH ===\n");
GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
perf_total_per_op_us[node->op] += MAX(1, node->perf_time_us);
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
i,
node->ne[0], node->ne[1], node->ne[2],
ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs,
(double) node->perf_cycles / (double) ggml_cycles_per_ms(),
(double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
(double) node->perf_time_us / 1000.0,
(double) node->perf_time_us / 1000.0 / node->perf_runs);
}
GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs);
for (int i = 0; i < cgraph->n_leafs; i++) {
struct ggml_tensor * node = cgraph->leafs[i];
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
i,
node->ne[0], node->ne[1],
ggml_op_name(node->op),
ggml_get_name(node));
}
for (int i = 0; i < GGML_OP_COUNT; i++) {
if (perf_total_per_op_us[i] == 0) {
continue;
}
GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0);
}
GGML_PRINT("========================================\n");
}
// check if node is part of the graph
static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
if (cgraph == NULL) {
return true;
}
for (int i = 0; i < cgraph->n_nodes; i++) {
if (cgraph->nodes[i] == node) {
return true;
}
}
return false;
}
static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * parent = cgraph->nodes[i];
if (parent->grad == node) {
return parent;
}
}
return NULL;
}
static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
gparent0 ? (void *) gparent0 : (void *) parent,
gparent0 ? "g" : "x",
gparent ? (void *) gparent : (void *) node,
gparent ? "g" : "x",
gparent ? "empty" : "vee",
gparent ? "dashed" : "solid",
label);
}
static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
(void *) parent, "x",
(void *) node, "x",
label);
}
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
char color[16];
FILE * fp = ggml_fopen(filename, "w");
GGML_ASSERT(fp);
fprintf(fp, "digraph G {\n");
fprintf(fp, " newrank = true;\n");
fprintf(fp, " rankdir = LR;\n");
for (int i = 0; i < gb->n_nodes; i++) {
struct ggml_tensor * node = gb->nodes[i];
if (ggml_graph_get_parent(gb, node) != NULL) {
continue;
}
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
snprintf(color, sizeof(color), "yellow");
} else if (node->grad) {
if (ggml_graph_find(gf, node)) {
snprintf(color, sizeof(color), "green");
} else {
snprintf(color, sizeof(color), "lightblue");
}
} else {
snprintf(color, sizeof(color), "white");
}
fprintf(fp, " \"%p\" [ "
"style = filled; fillcolor = %s; shape = record; "
"label=\"",
(void *) node, color);
if (strlen(node->name) > 0) {
fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
} else {
fprintf(fp, "(%s)|", ggml_type_name(node->type));
}
if (ggml_is_matrix(node)) {
fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
} else {
fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
}
if (node->grad) {
fprintf(fp, " | <g>%s\"; ]\n", ggml_op_symbol(node->grad->op));
} else {
fprintf(fp, "\"; ]\n");
}
}
for (int i = 0; i < gb->n_leafs; i++) {
struct ggml_tensor * node = gb->leafs[i];
snprintf(color, sizeof(color), "pink");
fprintf(fp, " \"%p\" [ "
"style = filled; fillcolor = %s; shape = record; "
"label=\"<x>",
(void *) node, color);
if (strlen(node->name) > 0) {
fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
} else {
fprintf(fp, "(%s)|", ggml_type_name(node->type));
}
fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
if (ggml_nelements(node) < 5) {
fprintf(fp, " | (");
for (int j = 0; j < ggml_nelements(node); j++) {
if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
fprintf(fp, "%d", ggml_get_i32_1d(node, j));
}
else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
}
else {
fprintf(fp, "#");
}
if (j < ggml_nelements(node) - 1) {
fprintf(fp, ", ");
}
}
fprintf(fp, ")");
}
fprintf(fp, "\"; ]\n");
}
for (int i = 0; i < gb->n_nodes; i++) {
struct ggml_tensor * node = gb->nodes[i];
for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j]) {
char label[16];
snprintf(label, sizeof(label), "src %d", j);
ggml_graph_dump_dot_node_edge(fp, gb, node, node->src[j], label);
}
}
}
for (int i = 0; i < gb->n_leafs; i++) {
struct ggml_tensor * node = gb->leafs[i];
for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j]) {
char label[16];
snprintf(label, sizeof(label), "src %d", j);
ggml_graph_dump_dot_leaf_edge(fp, node, node->src[j], label);
}
}
}
fprintf(fp, "}\n");
fclose(fp);
GGML_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename);
}
////////////////////////////////////////////////////////////////////////////////
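// helpers that copy the optimized parameters (and their gradients) between the ggml tensors
// marked with GGML_TENSOR_FLAG_PARAM and the flat float arrays used by the optimizers below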
static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) {
int i = 0;
for (int p = 0; p < np; ++p) {
const int64_t ne = ggml_nelements(ps[p]) ;
// TODO: add function to set tensor from array
for (int64_t j = 0; j < ne; ++j) {
ggml_set_f32_1d(ps[p], j, x[i++]);
}
}
}
static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) {
int i = 0;
for (int p = 0; p < np; ++p) {
const int64_t ne = ggml_nelements(ps[p]) ;
// TODO: add function to get all elements at once
for (int64_t j = 0; j < ne; ++j) {
x[i++] = ggml_get_f32_1d(ps[p], j);
}
}
}
static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) {
int64_t i = 0;
for (int p = 0; p < np; ++p) {
const int64_t ne = ggml_nelements(ps[p]) ;
// TODO: add function to get all elements at once
for (int64_t j = 0; j < ne; ++j) {
g[i++] = ggml_get_f32_1d(ps[p]->grad, j);
}
}
}
static void ggml_opt_acc_grad(int np, struct ggml_tensor * const ps[], float * g, float scale) {
int64_t i = 0;
for (int p = 0; p < np; ++p) {
const int64_t ne = ggml_nelements(ps[p]) ;
// TODO: add function to get all elements at once
for (int64_t j = 0; j < ne; ++j) {
g[i++] += ggml_get_f32_1d(ps[p]->grad, j) * scale;
}
}
}
//
// Using AdamW - ref: https://arxiv.org/pdf/1711.05101v3.pdf
//
// (Original Adam - ref: https://arxiv.org/pdf/1412.6980.pdf)
//
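// note on the implementation below: gradients are accumulated over n_gradient_accumulation
// evaluations of the backward graph, optionally clipped by global norm (gclip), and weight decay
// is applied in decoupled (AdamW) form directly to the parameters; the optional callback may
// adjust the learning-rate schedule `sched` and request cancellation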
static enum ggml_opt_result ggml_opt_adam(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_opt_params params,
struct ggml_tensor * f,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
ggml_opt_callback callback,
void * callback_data) {
GGML_ASSERT(ggml_is_scalar(f));
// these will store the parameters we want to optimize
struct ggml_tensor * ps[GGML_MAX_PARAMS];
int np = 0;
int64_t nx = 0;
for (int i = 0; i < gf->n_nodes; ++i) {
if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
GGML_ASSERT(np < GGML_MAX_PARAMS);
ps[np++] = gf->nodes[i];
nx += ggml_nelements(gf->nodes[i]);
}
}
if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past)) {
int iter = opt->iter;
ggml_opt_init(opt->ctx, opt, params, nx);
opt->iter = iter;
}
// constants
float sched = params.adam.sched;
const float alpha = params.adam.alpha;
const float decay = params.adam.decay * alpha;
const float beta1 = params.adam.beta1;
const float beta2 = params.adam.beta2;
const float eps = params.adam.eps;
const float gclip = params.adam.gclip;
const int decay_min_ndim = params.adam.decay_min_ndim;
const int n_accum = MAX(1, params.n_gradient_accumulation);
const float accum_norm = 1.0f / (float) n_accum;
float * g = opt->adam.g->data; // gradients
float * m = opt->adam.m->data; // first moment
float * v = opt->adam.v->data; // second moment
float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
bool cancel = false;
// compute the function value
float fx = 0;
ggml_set_zero(opt->adam.g);
for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
if (callback) {
callback(callback_data, accum_step, &sched, &cancel);
if (cancel) {
return GGML_OPT_RESULT_CANCEL;
}
}
// ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute(gb, &cplan);
ggml_opt_acc_grad(np, ps, g, accum_norm);
fx += ggml_get_f32_1d(f, 0);
}
fx *= accum_norm;
opt->adam.fx_prev = fx;
opt->adam.fx_best = opt->adam.fx_prev;
if (pf) {
pf[opt->iter % params.past] = opt->adam.fx_prev;
}
opt->loss_before = opt->adam.fx_prev;
opt->loss_after = opt->adam.fx_prev;
// initialize
if (opt->just_initialized) {
opt->adam.n_no_improvement = 0;
opt->just_initialized = false;
}
float * fx_best = &opt->adam.fx_best;
float * fx_prev = &opt->adam.fx_prev;
int * n_no_improvement = &opt->adam.n_no_improvement;
int iter0 = opt->iter;
// run the optimizer
for (int t = 0; t < params.adam.n_iter; ++t) {
opt->iter = iter0 + t + 1;
GGML_PRINT_DEBUG ("=== iter %d ===\n", t);
GGML_PRINT_DEBUG ("f = %10.6f\n", ggml_get_f32_1d(f, 0));
GGML_PRINT_DEBUG_5("df/dx0 = %10.6f\n", ggml_get_f32_1d(ps[0]->grad, 0));
GGML_PRINT_DEBUG_5("df/dx1 = %10.6f\n", ggml_get_f32_1d(ps[1]->grad, 0));
for (int i = 0; i < np; ++i) {
GGML_PRINT_DEBUG("param %d: %10.6f, g = %10.6f\n", i,
ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0));
}
const int64_t t_start_wall = ggml_time_us();
const int64_t t_start_cpu = ggml_cycles();
UNUSED(t_start_wall);
UNUSED(t_start_cpu);
{
float gnorm = 1.0f;
if (gclip > 0.0f) {
// gradient clipping
ggml_float sum = 0.0;
for (int64_t i = 0; i < nx; ++i) {
sum += (ggml_float)(g[i]*g[i]);
}
ggml_float norm = sqrt(sum);
if (norm > (ggml_float) gclip) {
gnorm = (float) ((ggml_float) gclip / norm);
}
}
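// bias-corrected step sizes: beta1h folds the scheduled learning rate and the beta1 bias
// correction into a single factor, beta2h is the beta2 bias correction; the loop below then
// performs the usual AdamW update:
//   m <- beta1*m + (1 - beta1)*g
//   v <- beta2*v + (1 - beta2)*g^2
//   x <- x*(1 - p_decay) - alpha*sched*m_hat/(sqrt(v_hat) + eps)
// with decay applied only to tensors that have at least decay_min_ndim dimensions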
const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter));
int64_t i = 0;
for (int p = 0; p < np; ++p) {
const int64_t ne = ggml_nelements(ps[p]);
const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched;
for (int64_t j = 0; j < ne; ++j) {
float x = ggml_get_f32_1d(ps[p], j);
float g_ = g[i]*gnorm;
m[i] = m[i]*beta1 + g_*(1.0f - beta1);
v[i] = v[i]*beta2 + g_*g_*(1.0f - beta2);
float mh = m[i]*beta1h;
float vh = v[i]*beta2h;
vh = sqrtf(vh) + eps;
x = x*(1.0f - p_decay) - mh/vh;
ggml_set_f32_1d(ps[p], j, x);
++i;
}
}
}
fx = 0;
ggml_set_zero(opt->adam.g);
for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
if (callback) {
callback(callback_data, accum_step, &sched, &cancel);
if (cancel) {
return GGML_OPT_RESULT_CANCEL;
}
}
// ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute(gb, &cplan);
ggml_opt_acc_grad(np, ps, g, accum_norm);
fx += ggml_get_f32_1d(f, 0);
}
fx *= accum_norm;
opt->loss_after = fx;
// check convergence
if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
GGML_PRINT_DEBUG("converged\n");
return GGML_OPT_RESULT_OK;
}
// delta-based convergence test
if (pf != NULL) {
// need at least params.past iterations to start checking for convergence
if (params.past <= iter0 + t) {
const float rate = (pf[(iter0 + t)%params.past] - fx)/fx;
if (fabsf(rate) < params.delta) {
return GGML_OPT_RESULT_OK;
}
}
pf[(iter0 + t)%params.past] = fx;
}
// check for improvement
if (params.max_no_improvement > 0) {
if (fx_best[0] > fx) {
fx_best[0] = fx;
n_no_improvement[0] = 0;
} else {
++n_no_improvement[0];
if (n_no_improvement[0] >= params.max_no_improvement) {
return GGML_OPT_RESULT_OK;
}
}
}
fx_prev[0] = fx;
{
const int64_t t_end_cpu = ggml_cycles();
GGML_PRINT_DEBUG("time iter: %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC);
UNUSED(t_end_cpu);
const int64_t t_end_wall = ggml_time_us();
GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6);
UNUSED(t_end_wall);
}
}
return GGML_OPT_RESULT_DID_NOT_CONVERGE;
}
//
// L-BFGS
//
// the L-BFGS implementation below is based on the following implementation:
//
// https://github.com/chokkan/liblbfgs
//
struct ggml_lbfgs_iteration_data {
float alpha;
float ys;
float * s;
float * y;
};
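// backtracking line search used by ggml_opt_lbfgs below: starting from *step, the step size is
// shrunk (dec = 0.5) or grown (inc = 2.1) until the Armijo condition - and, depending on
// params->lbfgs.linesearch, also the (strong) Wolfe curvature condition - is satisfied;
// returns the number of function evaluations on success or a negative GGML_LINESEARCH_* code on failure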
static enum ggml_opt_result linesearch_backtracking(
const struct ggml_opt_params * params,
int nx,
float * x,
float * fx,
float * g,
float * d,
float * step,
const float * xp,
struct ggml_tensor * f,
struct ggml_cgraph * gb,
struct ggml_cplan * cplan,
const int np,
struct ggml_tensor * ps[],
bool * cancel,
ggml_opt_callback callback,
void * callback_data) {
int count = 0;
float width = 0.0f;
float dg = 0.0f;
float finit = 0.0f;
float dginit = 0.0f;
float dgtest = 0.0f;
const float dec = 0.5f;
const float inc = 2.1f;
const int n_accum = MAX(1, params->n_gradient_accumulation);
const float accum_norm = 1.0f / (float) n_accum;
if (*step <= 0.f) {
return GGML_LINESEARCH_INVALID_PARAMETERS;
}
// compute the initial gradient in the search direction
ggml_vec_dot_f32(nx, &dginit, 0, g, 0, d, 0, 1);
// make sure that d points to a descent direction
if (0 < dginit) {
return GGML_LINESEARCH_FAIL;
}
// initialize local variables
finit = *fx;
dgtest = params->lbfgs.ftol*dginit;
while (true) {
ggml_vec_cpy_f32(nx, x, xp);
ggml_vec_mad_f32(nx, x, d, *step);
// evaluate the function and gradient values
{
ggml_opt_set_params(np, ps, x);
*fx = 0;
memset(g, 0, sizeof(float)*nx);
for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
if (callback) {
// L-BFGS does not support a learning rate -> ignore the learning schedule
float sched = 0;
callback(callback_data, accum_step, &sched, cancel);
if (*cancel) {
return GGML_OPT_RESULT_CANCEL;
}
}
// ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute(gb, cplan);
ggml_opt_acc_grad(np, ps, g, accum_norm);
*fx += ggml_get_f32_1d(f, 0);
}
*fx *= accum_norm;
}
++count;
if (*fx > finit + (*step)*dgtest) {
width = dec;
} else {
// Armijo condition is satisfied
if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) {
return count;
}
ggml_vec_dot_f32(nx, &dg, 0, g, 0, d, 0, 1);
// check the Wolfe condition
if (dg < params->lbfgs.wolfe * dginit) {
width = inc;
} else {
if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) {
// regular Wolfe conditions
return count;
}
if(dg > -params->lbfgs.wolfe*dginit) {
width = dec;
} else {
// strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
return count;
}
}
}
if (*step < params->lbfgs.min_step) {
return GGML_LINESEARCH_MINIMUM_STEP;
}
if (*step > params->lbfgs.max_step) {
return GGML_LINESEARCH_MAXIMUM_STEP;
}
if (params->lbfgs.max_linesearch <= count) {
return GGML_LINESEARCH_MAXIMUM_ITERATIONS;
}
(*step) *= width;
}
GGML_ASSERT(false && "line search failed");
return GGML_LINESEARCH_FAIL;
}
static enum ggml_opt_result ggml_opt_lbfgs(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_opt_params params,
struct ggml_tensor * f,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
ggml_opt_callback callback,
void * callback_data) {
if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
return GGML_OPT_RESULT_INVALID_WOLFE;
}
}
const int m = params.lbfgs.m;
// these will store the parameters we want to optimize
struct ggml_tensor * ps[GGML_MAX_PARAMS];
int np = 0;
int nx = 0;
for (int i = 0; i < gf->n_nodes; ++i) {
if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
GGML_ASSERT(np < GGML_MAX_PARAMS);
ps[np++] = gf->nodes[i];
nx += ggml_nelements(gf->nodes[i]);
}
}
if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past) || (opt->params.lbfgs.m != params.lbfgs.m)) {
int iter = opt->iter;
ggml_opt_init(ctx, opt, params, nx);
opt->iter = iter;
}
struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
float * x = opt->lbfgs.x->data; // current parameters
float * xp = opt->lbfgs.xp->data; // previous parameters
float * g = opt->lbfgs.g->data; // current gradient
float * gp = opt->lbfgs.gp->data; // previous gradient
float * d = opt->lbfgs.d->data; // search direction
float * pf = params.past > 0 ? opt->lbfgs.pf->data : NULL; // past function values
const int n_accum = MAX(1, params.n_gradient_accumulation);
const float accum_norm = 1.0f / (float) n_accum;
float fx = 0.0f; // cost function value
float xnorm = 0.0f; // ||x||
float gnorm = 0.0f; // ||g||
// initialize x from the graph nodes
ggml_opt_get_params(np, ps, x);
// the L-BFGS memory
float * lm_alpha = opt->lbfgs.lmal->data;
float * lm_ys = opt->lbfgs.lmys->data;
float * lm_s = opt->lbfgs.lms->data;
float * lm_y = opt->lbfgs.lmy->data;
bool cancel = false;
// evaluate the function value and its gradient
{
ggml_opt_set_params(np, ps, x);
fx = 0;
memset(g, 0, sizeof(float)*nx);
for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
if (callback) {
// L-BFGS does not support a learning rate -> ignore the learning schedule
float sched = 0;
callback(callback_data, accum_step, &sched, &cancel);
if (cancel) {
return GGML_OPT_RESULT_CANCEL;
}
}
// ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute(gb, &cplan);
ggml_opt_acc_grad(np, ps, g, accum_norm);
fx += ggml_get_f32_1d(f, 0);
}
fx *= accum_norm;
opt->loss_before = fx;
opt->loss_after = fx;
}
// search direction = -gradient
ggml_vec_neg_f32(nx, d, g);
// ||x||, ||g||
ggml_vec_norm_f32(nx, &xnorm, x);
ggml_vec_norm_f32(nx, &gnorm, g);
if (xnorm < 1.0f) {
xnorm = 1.0f;
}
// already optimized
if (gnorm/xnorm <= params.lbfgs.eps) {
return GGML_OPT_RESULT_OK;
}
if (opt->just_initialized) {
if (pf) {
pf[0] = fx;
}
opt->lbfgs.fx_best = fx;
// initial step
ggml_vec_norm_inv_f32(nx, &opt->lbfgs.step, d);
opt->lbfgs.j = 0;
opt->lbfgs.k = 1;
opt->lbfgs.end = 0;
opt->lbfgs.n_no_improvement = 0;
opt->just_initialized = false;
}
float * fx_best = &opt->lbfgs.fx_best;
float * step = &opt->lbfgs.step;
int * j = &opt->lbfgs.j;
int * k = &opt->lbfgs.k;
int * end = &opt->lbfgs.end;
int * n_no_improvement = &opt->lbfgs.n_no_improvement;
int ls = 0;
int bound = 0;
float ys = 0.0f;
float yy = 0.0f;
float beta = 0.0f;
int it = 0;
while (true) {
// store the current position and gradient vectors
ggml_vec_cpy_f32(nx, xp, x);
ggml_vec_cpy_f32(nx, gp, g);
// TODO: instead of passing &cancel here, use the return code of the linesearch
// to determine if the optimization should be cancelled
// this is a simple change, but not doing this atm, since I don't have a nice
// way to test and don't want to break something with so many changes lined up
ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
if (cancel) {
return GGML_OPT_RESULT_CANCEL;
}
if (ls < 0) {
// linesearch failed - go back to the previous point and return
ggml_vec_cpy_f32(nx, x, xp);
ggml_vec_cpy_f32(nx, g, gp);
return ls;
}
opt->loss_after = fx;
ggml_vec_norm_f32(nx, &xnorm, x);
ggml_vec_norm_f32(nx, &gnorm, g);
GGML_PRINT_DEBUG("f = %10.6f\n", ggml_get_f32_1d(f, 0));
if (xnorm < 1.0f) {
xnorm = 1.0f;
}
if (gnorm/xnorm <= params.lbfgs.eps) {
// converged
return GGML_OPT_RESULT_OK;
}
// delta-based convergence test
if (pf != NULL) {
// need at least params.past iterations to start checking for convergence
if (params.past <= k[0]) {
const float rate = (pf[k[0]%params.past] - fx)/fx;
if (fabsf(rate) < params.delta) {
return GGML_OPT_RESULT_OK;
}
}
pf[k[0]%params.past] = fx;
}
// check for improvement
if (params.max_no_improvement > 0) {
if (fx < fx_best[0]) {
fx_best[0] = fx;
n_no_improvement[0] = 0;
} else {
n_no_improvement[0]++;
if (n_no_improvement[0] >= params.max_no_improvement) {
return GGML_OPT_RESULT_OK;
}
}
}
if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) {
// reached the maximum number of iterations
return GGML_OPT_RESULT_DID_NOT_CONVERGE;
}
// update vectors s and y:
// s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}.
// y_{k+1} = g_{k+1} - g_{k}.
//
ggml_vec_sub_f32(nx, &lm_s[end[0]*nx], x, xp);
ggml_vec_sub_f32(nx, &lm_y[end[0]*nx], g, gp);
// compute scalars ys and yy:
// ys = y^t \cdot s -> 1 / \rho.
// yy = y^t \cdot y.
//
ggml_vec_dot_f32(nx, &ys, 0, &lm_y[end[0]*nx], 0, &lm_s[end[0]*nx], 0, 1);
ggml_vec_dot_f32(nx, &yy, 0, &lm_y[end[0]*nx], 0, &lm_y[end[0]*nx], 0, 1);
lm_ys[end[0]] = ys;
// find new search direction
// ref: https://en.wikipedia.org/wiki/Limited-memory_BFGS
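// two-loop recursion: the backward loop computes alpha_j = (s_j^T q)/(y_j^T s_j) and updates
// q <- q - alpha_j*y_j over the stored history, the result is scaled by ys/yy (a simple initial
// inverse-Hessian approximation), and the forward loop adds (alpha_j - beta_j)*s_j back in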
bound = (m <= k[0]) ? m : k[0];
k[0]++;
it++;
end[0] = (end[0] + 1)%m;
// initialize search direction with -g
ggml_vec_neg_f32(nx, d, g);
j[0] = end[0];
for (int i = 0; i < bound; ++i) {
j[0] = (j[0] + m - 1) % m;
// \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
ggml_vec_dot_f32(nx, &lm_alpha[j[0]], 0, &lm_s[j[0]*nx], 0, d, 0, 1);
lm_alpha[j[0]] /= lm_ys[j[0]];
// q_{i} = q_{i+1} - \alpha_{i} y_{i}
ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]);
}
ggml_vec_scale_f32(nx, d, ys/yy);
for (int i = 0; i < bound; ++i) {
// \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
ggml_vec_dot_f32(nx, &beta, 0, &lm_y[j[0]*nx], 0, d, 0, 1);
beta /= lm_ys[j[0]];
// \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta);
j[0] = (j[0] + 1)%m;
}
step[0] = 1.0;
}
GGML_ASSERT(false && "lbfgs failed");
return GGML_OPT_RESULT_DID_NOT_CONVERGE;
}
struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
struct ggml_opt_params result;
switch (type) {
case GGML_OPT_TYPE_ADAM:
{
result = (struct ggml_opt_params) {
.type = GGML_OPT_TYPE_ADAM,
.graph_size = GGML_DEFAULT_GRAPH_SIZE,
.n_threads = 1, // FIXME: GGML_DEFAULT_N_THREADS ?
.past = 0,
.delta = 1e-5f,
.max_no_improvement = 100,
.print_forward_graph = true,
.print_backward_graph = true,
.n_gradient_accumulation = 1,
.adam = {
.n_iter = 10000,
.sched = 1.000f,
.decay = 0.0f,
.decay_min_ndim = 2,
.alpha = 0.001f,
.beta1 = 0.9f,
.beta2 = 0.999f,
.eps = 1e-8f,
.eps_f = 1e-5f,
.eps_g = 1e-3f,
.gclip = 0.0f,
},
};
} break;
case GGML_OPT_TYPE_LBFGS:
{
result = (struct ggml_opt_params) {
.type = GGML_OPT_TYPE_LBFGS,
.graph_size = GGML_DEFAULT_GRAPH_SIZE,
.n_threads = 1,
.past = 0,
.delta = 1e-5f,
.max_no_improvement = 0,
.print_forward_graph = true,
.print_backward_graph = true,
.n_gradient_accumulation = 1,
.lbfgs = {
.m = 6,
.n_iter = 100,
.max_linesearch = 20,
.eps = 1e-5f,
.ftol = 1e-4f,
.wolfe = 0.9f,
.min_step = 1e-20f,
.max_step = 1e+20f,
.linesearch = GGML_LINESEARCH_DEFAULT,
},
};
} break;
}
return result;
}
GGML_API void ggml_opt_init(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_opt_params params,
int64_t nx) {
opt->ctx = ctx;
opt->params = params;
opt->iter = 0;
opt->nx = nx;
opt->just_initialized = true;
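// if no context was provided, create one just large enough for the optimizer state tensors
// allocated in the switch below (g/m/v for Adam; x/xp/g/gp/d plus the L-BFGS memory for L-BFGS;
// plus the optional buffer of past function values)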
if (opt->ctx == NULL) {
struct ggml_init_params ctx_opt_params;
if (opt->params.type == GGML_OPT_TYPE_ADAM) {
ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3;
if (opt->params.past > 0) {
ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past;
}
} else if (opt->params.type == GGML_OPT_TYPE_LBFGS) {
ctx_opt_params.mem_size = GGML_MEM_ALIGN*9 + ggml_tensor_overhead()*9 + ggml_type_size(GGML_TYPE_F32)*(nx*5 + opt->params.lbfgs.m*2 + nx*opt->params.lbfgs.m*2);
if (opt->params.past > 0) {
ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past;
}
}
ctx_opt_params.mem_buffer = NULL;
ctx_opt_params.no_alloc = false;
opt->ctx = ggml_init(ctx_opt_params);
}
switch (opt->params.type) {
case GGML_OPT_TYPE_ADAM:
{
opt->adam.g = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
opt->adam.m = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
opt->adam.v = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
opt->adam.pf = params.past > 0
? ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.past)
: NULL;
ggml_set_zero(opt->adam.m);
ggml_set_zero(opt->adam.v);
if (opt->adam.pf) {
ggml_set_zero(opt->adam.pf);
}
} break;
case GGML_OPT_TYPE_LBFGS:
{
opt->lbfgs.x = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
opt->lbfgs.xp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
opt->lbfgs.g = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
opt->lbfgs.gp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
opt->lbfgs.d = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
opt->lbfgs.pf = params.past > 0
? ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.past)
: NULL;
opt->lbfgs.lmal = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.lbfgs.m);
opt->lbfgs.lmys = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.lbfgs.m);
opt->lbfgs.lms = ggml_new_tensor_2d(opt->ctx, GGML_TYPE_F32, nx, params.lbfgs.m);
opt->lbfgs.lmy = ggml_new_tensor_2d(opt->ctx, GGML_TYPE_F32, nx, params.lbfgs.m);
ggml_set_zero(opt->lbfgs.x);
ggml_set_zero(opt->lbfgs.xp);
ggml_set_zero(opt->lbfgs.g);
ggml_set_zero(opt->lbfgs.gp);
ggml_set_zero(opt->lbfgs.d);
if (opt->lbfgs.pf) {
ggml_set_zero(opt->lbfgs.pf);
}
ggml_set_zero(opt->lbfgs.lmal);
ggml_set_zero(opt->lbfgs.lmys);
ggml_set_zero(opt->lbfgs.lms);
ggml_set_zero(opt->lbfgs.lmy);
} break;
}
}
enum ggml_opt_result ggml_opt(
struct ggml_context * ctx,
struct ggml_opt_params params,
struct ggml_tensor * f) {
bool free_ctx = false;
if (ctx == NULL) {
struct ggml_init_params params_ctx = {
.mem_size = 16*1024*1024,
.mem_buffer = NULL,
.no_alloc = false,
};
ctx = ggml_init(params_ctx);
if (ctx == NULL) {
return GGML_OPT_RESULT_NO_CONTEXT;
}
free_ctx = true;
}
enum ggml_opt_result result = GGML_OPT_RESULT_OK;
struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context));
ggml_opt_init(ctx, opt, params, 0);
result = ggml_opt_resume(ctx, opt, f);
if (free_ctx) {
ggml_free(ctx);
}
return result;
}
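// minimal usage sketch (illustrative only - assumes the public ggml.h API: ggml_init,
// ggml_new_tensor_1d, ggml_set_f32, ggml_set_param, ggml_mul, ggml_sum, ggml_free):
//
//   struct ggml_init_params ip = { .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false };
//   struct ggml_context * ctx0 = ggml_init(ip);
//
//   struct ggml_tensor * x = ggml_set_f32(ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 8), 2.0f);
//   ggml_set_param(ctx0, x);                                        // optimize over x
//   struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x, x));  // scalar loss f = sum(x*x)
//
//   struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
//   enum ggml_opt_result res = ggml_opt(ctx0, opt_params, f);       // minimize f with AdamW
//   ggml_free(ctx0);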
enum ggml_opt_result ggml_opt_resume(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_tensor * f) {
// build forward + backward compute graphs
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, opt->params.graph_size, true);
ggml_build_forward_expand(gf, f);
struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
ggml_build_backward_expand(ctx, gf, gb, true);
return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
}
enum ggml_opt_result ggml_opt_resume_g(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_tensor * f,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
ggml_opt_callback callback,
void * callback_data) {
// build forward + backward compute graphs
enum ggml_opt_result result = GGML_OPT_RESULT_OK;
switch (opt->params.type) {
case GGML_OPT_TYPE_ADAM:
{
result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
} break;
case GGML_OPT_TYPE_LBFGS:
{
result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
} break;
}
if (opt->params.print_forward_graph) {
ggml_graph_print (gf);
ggml_graph_dump_dot(gf, NULL, "opt-forward.dot");
}
if (opt->params.print_backward_graph) {
ggml_graph_print (gb);
ggml_graph_dump_dot(gb, gf, "opt-backward.dot");
}
return result;
}
////////////////////////////////////////////////////////////////////////////////
void ggml_set_input(struct ggml_tensor * tensor) {
tensor->flags |= GGML_TENSOR_FLAG_INPUT;
}
void ggml_set_output(struct ggml_tensor * tensor) {
tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
}
////////////////////////////////////////////////////////////////////////////////
void ggml_quantize_init(enum ggml_type type) {
ggml_critical_section_start();
switch (type) {
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break;
case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
default: // nothing
break;
}
ggml_critical_section_end();
}
void ggml_quantize_free(void) {
ggml_critical_section_start();
iq2xs_free_impl(GGML_TYPE_IQ2_XXS);
iq2xs_free_impl(GGML_TYPE_IQ2_XS);
iq2xs_free_impl(GGML_TYPE_IQ1_S);
iq3xs_free_impl(256);
ggml_critical_section_end();
}
bool ggml_quantize_requires_imatrix(enum ggml_type type) {
return
type == GGML_TYPE_IQ2_XXS ||
type == GGML_TYPE_IQ2_XS ||
type == GGML_TYPE_IQ1_S;// ||
//type == GGML_TYPE_IQ1_M;
}
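// quantize a chunk of rows from src into dst: start is an element offset into src and must be a
// multiple of both the block size of `type` and n_per_row; nrows rows of n_per_row elements are
// quantized and written at the corresponding row offset in dst; returns the number of bytes written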
size_t ggml_quantize_chunk(
enum ggml_type type,
const float * src,
void * dst,
int start,
int nrows,
int n_per_row,
const float * imatrix) {
const int n = nrows * n_per_row;
if (ggml_quantize_requires_imatrix(type)) {
GGML_ASSERT(imatrix != NULL);
}
GGML_ASSERT(start % type_traits[type].blck_size == 0);
GGML_ASSERT(start % n_per_row == 0);
ggml_quantize_init(type); // this is noop if already initialized
const size_t start_row = start / n_per_row;
const size_t row_size = ggml_row_size(type, n_per_row);
size_t result = 0;
switch (type) {
case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
#if QK_K == 64
case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
#else
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
#endif
case GGML_TYPE_F16:
{
size_t elemsize = sizeof(ggml_fp16_t);
ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
result = n * elemsize;
} break;
case GGML_TYPE_F32:
{
size_t elemsize = sizeof(float);
result = n * elemsize;
memcpy((uint8_t *)dst + start * elemsize, src + start, result);
} break;
default:
assert(false);
}
GGML_ASSERT(result == nrows * row_size);
return result;
}
////////////////////////////////////////////////////////////////////////////////
struct gguf_str {
uint64_t n; // GGUFv2
char * data;
};
static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
[GGUF_TYPE_UINT8] = sizeof(uint8_t),
[GGUF_TYPE_INT8] = sizeof(int8_t),
[GGUF_TYPE_UINT16] = sizeof(uint16_t),
[GGUF_TYPE_INT16] = sizeof(int16_t),
[GGUF_TYPE_UINT32] = sizeof(uint32_t),
[GGUF_TYPE_INT32] = sizeof(int32_t),
[GGUF_TYPE_FLOAT32] = sizeof(float),
[GGUF_TYPE_BOOL] = sizeof(bool),
[GGUF_TYPE_STRING] = sizeof(struct gguf_str),
[GGUF_TYPE_UINT64] = sizeof(uint64_t),
[GGUF_TYPE_INT64] = sizeof(int64_t),
[GGUF_TYPE_FLOAT64] = sizeof(double),
[GGUF_TYPE_ARRAY] = 0, // undefined
};
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
[GGUF_TYPE_UINT8] = "u8",
[GGUF_TYPE_INT8] = "i8",
[GGUF_TYPE_UINT16] = "u16",
[GGUF_TYPE_INT16] = "i16",
[GGUF_TYPE_UINT32] = "u32",
[GGUF_TYPE_INT32] = "i32",
[GGUF_TYPE_FLOAT32] = "f32",
[GGUF_TYPE_BOOL] = "bool",
[GGUF_TYPE_STRING] = "str",
[GGUF_TYPE_ARRAY] = "arr",
[GGUF_TYPE_UINT64] = "u64",
[GGUF_TYPE_INT64] = "i64",
[GGUF_TYPE_FLOAT64] = "f64",
};
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
union gguf_value {
uint8_t uint8;
int8_t int8;
uint16_t uint16;
int16_t int16;
uint32_t uint32;
int32_t int32;
float float32;
uint64_t uint64;
int64_t int64;
double float64;
bool bool_;
struct gguf_str str;
struct {
enum gguf_type type;
uint64_t n; // GGUFv2
void * data;
} arr;
};
struct gguf_kv {
struct gguf_str key;
enum gguf_type type;
union gguf_value value;
};
struct gguf_header {
char magic[4];
uint32_t version;
uint64_t n_tensors; // GGUFv2
uint64_t n_kv; // GGUFv2
};
struct gguf_tensor_info {
struct gguf_str name;
uint32_t n_dims;
uint64_t ne[GGML_MAX_DIMS];
enum ggml_type type;
uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
// for writing API
const void * data;
size_t size;
};
struct gguf_context {
struct gguf_header header;
struct gguf_kv * kv;
struct gguf_tensor_info * infos;
size_t alignment;
size_t offset; // offset of `data` from beginning of file
size_t size; // size of `data` in bytes
//uint8_t * padding;
void * data;
};
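// on-disk layout read by gguf_init_from_file below:
//
//   magic "GGUF" | u32 version | u64 n_tensors | u64 n_kv |
//   n_kv key/value pairs (key = u64-length string, followed by a typed value) |
//   n_tensors tensor infos (name, n_dims, ne[], type, offset into the data section) |
//   padding up to `general.alignment` (GGUF_DEFAULT_ALIGNMENT if the key is absent) |
//   tensor data blob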
static size_t gguf_type_size(enum gguf_type type) {
GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
return GGUF_TYPE_SIZE[type];
}
static void gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
GGML_ASSERT(info->n_dims <= GGML_MAX_DIMS);
GGML_ASSERT(0 <= info->type && info->type < GGML_TYPE_COUNT);
for (uint32_t i = 0; i < info->n_dims; ++i) {
GGML_ASSERT(info->ne[i] > 0);
}
// prevent overflow for total number of elements
GGML_ASSERT(INT64_MAX/info->ne[1] > info->ne[0]);
GGML_ASSERT(INT64_MAX/info->ne[2] > info->ne[0]*info->ne[1]);
GGML_ASSERT(INT64_MAX/info->ne[3] > info->ne[0]*info->ne[1]*info->ne[2]);
}
static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
const size_t n = fread(dst, 1, size, file);
*offset += n;
return n == size;
}
static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
p->n = 0;
p->data = NULL;
bool ok = true;
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset);
// early exit if the string length is invalid - prevents integer overflow
if (p->n == SIZE_MAX) {
fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n);
return false;
}
p->data = GGML_CALLOC(p->n + 1, 1);
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
return ok;
}
struct gguf_context * gguf_init_empty(void) {
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
ctx->header.version = GGUF_VERSION;
ctx->header.n_tensors = 0;
ctx->header.n_kv = 0;
ctx->kv = NULL;
ctx->infos = NULL;
ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
ctx->offset = 0;
ctx->size = 0;
ctx->data = NULL;
return ctx;
}
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
FILE * file = ggml_fopen(fname, "rb");
if (!file) {
return NULL;
}
// offset from start of file
size_t offset = 0;
char magic[4];
// check the magic before making allocations
{
gguf_fread_el(file, &magic, sizeof(magic), &offset);
for (uint32_t i = 0; i < sizeof(magic); i++) {
if (magic[i] != GGUF_MAGIC[i]) {
fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
fclose(file);
return NULL;
}
}
}
bool ok = true;
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
// read the header
{
strncpy(ctx->header.magic, magic, 4);
ctx->kv = NULL;
ctx->infos = NULL;
ctx->data = NULL;
ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
if (ctx->header.version == 1) {
fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
fclose(file);
gguf_free(ctx);
return NULL;
}
// sanity checks to prevent integer/buffer overflows
ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct gguf_tensor_info));
ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/ggml_tensor_overhead());
ok = ok && (ctx->header.n_kv < (SIZE_MAX/2)/sizeof(struct gguf_kv));
if (!ok) {
fprintf(stderr, "%s: failed to read header\n", __func__);
fclose(file);
gguf_free(ctx);
return NULL;
}
}
// read the kv pairs
{
ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
struct gguf_kv * kv = &ctx->kv[i];
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
ok = ok && gguf_fread_str(file, &kv->key, &offset);
ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
//fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
switch (kv->type) {
case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break;
case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break;
case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break;
case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break;
case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break;
case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break;
case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
case GGUF_TYPE_ARRAY:
{
ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
switch (kv->value.arr.type) {
case GGUF_TYPE_UINT8:
case GGUF_TYPE_INT8:
case GGUF_TYPE_UINT16:
case GGUF_TYPE_INT16:
case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32:
case GGUF_TYPE_FLOAT32:
case GGUF_TYPE_UINT64:
case GGUF_TYPE_INT64:
case GGUF_TYPE_FLOAT64:
case GGUF_TYPE_BOOL:
{
// prevent integer overflow in the malloc below
if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
fclose(file);
gguf_free(ctx);
return NULL;
}
kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
} break;
case GGUF_TYPE_STRING:
{
// prevent integer overflow in the malloc below
if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
fclose(file);
gguf_free(ctx);
return NULL;
}
kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
}
} break;
case GGUF_TYPE_ARRAY:
default: GGML_ASSERT(false && "invalid type"); break;
}
} break;
default: GGML_ASSERT(false && "invalid type");
}
if (!ok) {
break;
}
}
if (!ok) {
fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
fclose(file);
gguf_free(ctx);
return NULL;
}
}
// read the tensor infos
{
ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
info->ne[j] = 1;
}
ok = ok && gguf_fread_str(file, &info->name, &offset);
ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
ok = ok && (info->n_dims <= GGML_MAX_DIMS);
for (uint32_t j = 0; j < info->n_dims; ++j) {
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
}
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
gguf_tensor_info_sanitize(info);
if (!ok) {
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
fclose(file);
gguf_free(ctx);
return NULL;
}
}
}
ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
int alignment_idx = gguf_find_key(ctx, "general.alignment");
if (alignment_idx != -1) {
ctx->alignment = gguf_get_val_u32(ctx, alignment_idx);
}
// we require the data section to be aligned, so take into account any padding
{
const size_t offset_pad = offset % ctx->alignment;
if (offset_pad != 0) {
offset += ctx->alignment - offset_pad;
fseek(file, offset, SEEK_SET);
}
}
// store the current file offset - this is where the data section starts
ctx->offset = offset;
// compute the total size of the data section, taking into account the alignment
{
ctx->size = 0;
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
const int64_t ne =
(int64_t) info->ne[0] *
(int64_t) info->ne[1] *
(int64_t) info->ne[2] *
(int64_t) info->ne[3];
if (ne % ggml_blck_size(info->type) != 0) {
fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
__func__, info->name.data, (int)info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
fclose(file);
gguf_free(ctx);
return NULL;
}
const size_t size_cur = ggml_row_size(info->type, ne);
ctx->size += GGML_PAD(size_cur, ctx->alignment);
}
}
// load the tensor data only if requested
if (params.ctx != NULL) {
// if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
// otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
// the ggml_tensor structs to the appropriate locations in the binary blob
// compute the exact size needed for the new ggml_context
const size_t mem_size =
params.no_alloc ?
(ctx->header.n_tensors )*ggml_tensor_overhead() :
(ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
struct ggml_init_params pdata = {
.mem_size = mem_size,
.mem_buffer = NULL,
.no_alloc = params.no_alloc,
};
*params.ctx = ggml_init(pdata);
struct ggml_context * ctx_data = *params.ctx;
struct ggml_tensor * data = NULL;
if (!params.no_alloc) {
data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
ok = ok && data != NULL;
// read the binary blob with the tensor data
ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
if (!ok) {
fprintf(stderr, "%s: failed to read tensor data\n", __func__);
fclose(file);
ggml_free(ctx_data);
gguf_free(ctx);
return NULL;
}
ctx->data = data->data;
}
ggml_set_no_alloc(ctx_data, true);
// create the tensors
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
const int64_t ne[GGML_MAX_DIMS] = {
ctx->infos[i].ne[0],
ctx->infos[i].ne[1],
ctx->infos[i].ne[2],
ctx->infos[i].ne[3],
};
struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
ok = ok && cur != NULL;
ggml_set_name(cur, ctx->infos[i].name.data);
if (!ok) {
break;
}
// point the data member to the appropriate location in the binary blob using the tensor infos
if (!params.no_alloc) {
//cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
}
}
if (!ok) {
fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
fclose(file);
ggml_free(ctx_data);
gguf_free(ctx);
return NULL;
}
ggml_set_no_alloc(ctx_data, params.no_alloc);
}
fclose(file);
return ctx;
}
void gguf_free(struct gguf_context * ctx) {
if (ctx == NULL) {
return;
}
if (ctx->kv) {
// free string memory - not great..
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
struct gguf_kv * kv = &ctx->kv[i];
if (kv->key.data) {
GGML_FREE(kv->key.data);
}
if (kv->type == GGUF_TYPE_STRING) {
if (kv->value.str.data) {
GGML_FREE(kv->value.str.data);
}
}
if (kv->type == GGUF_TYPE_ARRAY) {
if (kv->value.arr.data) {
if (kv->value.arr.type == GGUF_TYPE_STRING) {
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
if (str->data) {
GGML_FREE(str->data);
}
}
}
GGML_FREE(kv->value.arr.data);
}
}
}
GGML_FREE(ctx->kv);
}
if (ctx->infos) {
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
if (info->name.data) {
GGML_FREE(info->name.data);
}
}
GGML_FREE(ctx->infos);
}
GGML_ALIGNED_FREE(ctx);
}
const char * gguf_type_name(enum gguf_type type) {
return GGUF_TYPE_NAME[type];
}
int gguf_get_version(const struct gguf_context * ctx) {
return ctx->header.version;
}
size_t gguf_get_alignment(const struct gguf_context * ctx) {
return ctx->alignment;
}
size_t gguf_get_data_offset(const struct gguf_context * ctx) {
return ctx->offset;
}
void * gguf_get_data(const struct gguf_context * ctx) {
return ctx->data;
}
int gguf_get_n_kv(const struct gguf_context * ctx) {
return ctx->header.n_kv;
}
int gguf_find_key(const struct gguf_context * ctx, const char * key) {
// return -1 if key not found
int keyfound = -1;
const int n_kv = gguf_get_n_kv(ctx);
for (int i = 0; i < n_kv; ++i) {
if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
keyfound = i;
break;
}
}
return keyfound;
}
const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
return ctx->kv[key_id].key.data;
}
enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
return ctx->kv[key_id].type;
}
enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
return ctx->kv[key_id].value.arr.type;
}
const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
return ctx->kv[key_id].value.arr.data;
}
const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
struct gguf_kv * kv = &ctx->kv[key_id];
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
return str->data;
}
int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
return ctx->kv[key_id].value.arr.n;
}
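// Illustrative sketch (not part of the original source): reading an array-valued KV pair.
// String arrays are read element by element, other arrays as one contiguous typed buffer:
//
//     const int kid = gguf_find_key(ctx, "tokenizer.ggml.tokens"); // hypothetical key
//     if (kid >= 0 && gguf_get_kv_type(ctx, kid) == GGUF_TYPE_ARRAY) {
//         const int n = gguf_get_arr_n(ctx, kid);
//         if (gguf_get_arr_type(ctx, kid) == GGUF_TYPE_STRING) {
//             for (int i = 0; i < n; ++i) {
//                 const char * s = gguf_get_arr_str(ctx, kid, i);
//                 // ...
//             }
//         } else if (gguf_get_arr_type(ctx, kid) == GGUF_TYPE_FLOAT32) {
//             const float * v = (const float *) gguf_get_arr_data(ctx, kid); // v[0..n-1]
//             // ...
//         }
//     }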
uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
return ctx->kv[key_id].value.uint8;
}
int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
return ctx->kv[key_id].value.int8;
}
uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
return ctx->kv[key_id].value.uint16;
}
int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
return ctx->kv[key_id].value.int16;
}
uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
return ctx->kv[key_id].value.uint32;
}
int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
return ctx->kv[key_id].value.int32;
}
float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
return ctx->kv[key_id].value.float32;
}
uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
return ctx->kv[key_id].value.uint64;
}
int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
return ctx->kv[key_id].value.int64;
}
double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
return ctx->kv[key_id].value.float64;
}
bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
return ctx->kv[key_id].value.bool_;
}
const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
return ctx->kv[key_id].value.str.data;
}
const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
return &ctx->kv[key_id].value;
}
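// Illustrative sketch (not part of the original source): the scalar getters assert on the
// stored type, so when the type is not known in advance a caller checks it first:
//
//     const int kid = gguf_find_key(ctx, "general.alignment"); // hypothetical key
//     if (kid >= 0 && gguf_get_kv_type(ctx, kid) == GGUF_TYPE_UINT32) {
//         const uint32_t alignment = gguf_get_val_u32(ctx, kid);
//         // ...
//     }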
int gguf_get_n_tensors(const struct gguf_context * ctx) {
return ctx->header.n_tensors;
}
int gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
// return -1 if tensor not found
int tensorfound = -1;
const int n_tensors = gguf_get_n_tensors(ctx);
for (int i = 0; i < n_tensors; ++i) {
if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
tensorfound = i;
break;
}
}
return tensorfound;
}
size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
return ctx->infos[i].offset;
}
char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
return ctx->infos[i].name.data;
}
enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
return ctx->infos[i].type;
}
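// Illustrative sketch (not part of the original source): locating a tensor's payload in
// the file from its name. Tensor offsets are relative to the start of the data section:
//
//     const int tid = gguf_find_tensor(ctx, "blk.0.attn_q.weight"); // hypothetical name
//     if (tid >= 0) {
//         const size_t   off = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, tid);
//         enum ggml_type typ = gguf_get_tensor_type(ctx, tid);
//         // 'off' is the absolute byte offset of the tensor data within the file
//     }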
// returns the index
static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
const int idx = gguf_find_key(ctx, key);
if (idx >= 0) {
return idx;
}
const int n_kv = gguf_get_n_kv(ctx);
ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
ctx->kv[n_kv].key.n = strlen(key);
ctx->kv[n_kv].key.data = strdup(key);
ctx->header.n_kv++;
return n_kv;
}
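// The setters below all go through gguf_get_or_add_key(): a new key grows ctx->kv by one
// entry and duplicates the key string, while an existing key has its slot reused and its
// previous value simply overwritten (a previously stored string or array is not freed).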
void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_UINT8;
ctx->kv[idx].value.uint8 = val;
}
void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_INT8;
ctx->kv[idx].value.int8 = val;
}
void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_UINT16;
ctx->kv[idx].value.uint16 = val;
}
void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_INT16;
ctx->kv[idx].value.int16 = val;
}
void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_UINT32;
ctx->kv[idx].value.uint32 = val;
}
void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_INT32;
ctx->kv[idx].value.int32 = val;
}
void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_FLOAT32;
ctx->kv[idx].value.float32 = val;
}
void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_UINT64;
ctx->kv[idx].value.uint64 = val;
}
void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_INT64;
ctx->kv[idx].value.int64 = val;
}
void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_FLOAT64;
ctx->kv[idx].value.float64 = val;
}
void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_BOOL;
ctx->kv[idx].value.bool_ = val;
}
void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_STRING;
ctx->kv[idx].value.str.n = strlen(val);
ctx->kv[idx].value.str.data = strdup(val);
}
void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
ctx->kv[idx].value.arr.type = type;
ctx->kv[idx].value.arr.n = n;
ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
}
void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
ctx->kv[idx].value.arr.n = n;
ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
for (int i = 0; i < n; i++) {
struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
str->n = strlen(data[i]);
str->data = strdup(data[i]);
}
}
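// Illustrative sketch (not part of the original source): building metadata for a new
// file, assuming an empty context created with gguf_init_empty() from ggml.h; the key
// names are hypothetical:
//
//     struct gguf_context * ctx = gguf_init_empty();
//
//     gguf_set_val_str(ctx, "general.architecture", "example");
//     gguf_set_val_u32(ctx, "example.block_count", 12);
//
//     const float scales[3] = { 0.5f, 1.0f, 2.0f };
//     gguf_set_arr_data(ctx, "example.scales", GGUF_TYPE_FLOAT32, scales, 3);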
// set or add KV pairs from another context
void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
for (uint32_t i = 0; i < src->header.n_kv; i++) {
switch (src->kv[i].type) {
case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break;
case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break;
case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break;
case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break;
case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break;
case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break;
case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break;
case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
case GGUF_TYPE_ARRAY:
{
if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
}
gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
GGML_FREE((void *)data);
} else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
GGML_ASSERT(false && "nested arrays not supported");
} else {
gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
}
} break;
default: GGML_ASSERT(false && "invalid type"); break;
}
}
}
void gguf_add_tensor(
struct gguf_context * ctx,
const struct ggml_tensor * tensor) {
const int idx = ctx->header.n_tensors;
ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
ctx->infos[idx].name.n = strlen(tensor->name);
ctx->infos[idx].name.data = strdup(tensor->name);
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
ctx->infos[idx].ne[i] = 1;
}
ctx->infos[idx].n_dims = ggml_n_dims(tensor);
for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
ctx->infos[idx].ne[i] = tensor->ne[i];
}
ctx->infos[idx].type = tensor->type;
ctx->infos[idx].offset = 0;
ctx->infos[idx].data = tensor->data;
ctx->infos[idx].size = ggml_nbytes(tensor);
if (ctx->header.n_tensors > 0) {
ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
}
ctx->header.n_tensors++;
}
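// Tensor offsets are assigned sequentially with alignment padding: with a 32-byte
// alignment, if tensor 0 occupies 100 bytes then tensor 1 is placed at offset
// GGML_PAD(100, 32) = 128 relative to the start of the data section.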
void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
const int idx = gguf_find_tensor(ctx, name);
if (idx < 0) {
GGML_ASSERT(false && "tensor not found");
}
ctx->infos[idx].type = type;
}
void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
const int idx = gguf_find_tensor(ctx, name);
if (idx < 0) {
GGML_ASSERT(false && "tensor not found");
}
ctx->infos[idx].data = data;
ctx->infos[idx].size = size;
// update offsets
for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
}
}
//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
// fwrite(&val->n, sizeof(val->n), 1, file);
// fwrite(val->data, sizeof(char), val->n, file);
//}
//
//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
// fwrite(val, sizeof(char), size, file);
//}
struct gguf_buf {
void * data;
size_t size;
size_t offset;
};
static struct gguf_buf gguf_buf_init(size_t size) {
struct gguf_buf buf = {
/*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size),
/*buf.size =*/ size,
/*buf.offset =*/ 0,
};
return buf;
}
static void gguf_buf_free(struct gguf_buf buf) {
if (buf.data) {
GGML_FREE(buf.data);
}
}
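// grow the buffer by ~1.5x of the required size whenever a write would overflow it;
// when buf->data is NULL (size-only pass) just the bookkeeping size is updated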
static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
if (buf->offset + size > buf->size) {
buf->size = 1.5*(buf->offset + size);
if (buf->data) {
buf->data = realloc(buf->data, buf->size);
}
}
}
static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
gguf_buf_grow(buf, sizeof(val->n) + val->n);
if (buf->data) {
memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
}
buf->offset += sizeof(val->n);
if (buf->data) {
memcpy((char *) buf->data + buf->offset, val->data, val->n);
}
buf->offset += val->n;
}
static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
gguf_buf_grow(buf, el_size);
if (buf->data) {
memcpy((char *) buf->data + buf->offset, val, el_size);
}
buf->offset += el_size;
}
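// note: the gguf_bwrite_* helpers advance buf->offset even when buf->data is NULL, so the
// same serialization path can be used either to write data or only to measure the output
// size (see gguf_get_meta_size() below, which starts from gguf_buf_init(0))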
static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
// write header
gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
gguf_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv));
// write key-value pairs
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
struct gguf_kv * kv = &ctx->kv[i];
gguf_bwrite_str(buf, &kv->key);
gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
switch (kv->type) {
case GGUF_TYPE_UINT8:   gguf_bwrite_el (buf, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
case GGUF_TYPE_INT8: gguf_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break;
case GGUF_TYPE_UINT16: gguf_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break;
case GGUF_TYPE_INT16: gguf_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break;
case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break;
case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break;
case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
case GGUF_TYPE_ARRAY:
{
gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) );
switch (kv->value.arr.type) {
case GGUF_TYPE_UINT8:
case GGUF_TYPE_INT8:
case GGUF_TYPE_UINT16:
case GGUF_TYPE_INT16:
case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32:
case GGUF_TYPE_FLOAT32:
case GGUF_TYPE_UINT64:
case GGUF_TYPE_INT64:
case GGUF_TYPE_FLOAT64:
case GGUF_TYPE_BOOL:
{
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type));
} break;
case GGUF_TYPE_STRING:
{
for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
}
} break;
case GGUF_TYPE_ARRAY:
default: GGML_ASSERT(false && "invalid type"); break;
}
} break;
default: GGML_ASSERT(false && "invalid type");
}
}
// write tensor infos
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
gguf_bwrite_str(buf, &info->name);
gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
for (uint32_t j = 0; j < info->n_dims; ++j) {
gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
}
gguf_bwrite_el(buf, &info->type, sizeof(info->type));
gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
}
// we require the data section to be aligned, so take into account any padding
{
const size_t offset = buf->offset;
const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
if (offset_pad != offset) {
uint8_t pad = 0;
for (size_t i = 0; i < offset_pad - offset; ++i) {
gguf_bwrite_el(buf, &pad, sizeof(pad));
}
}
}
if (only_meta) {
return;
}
size_t offset = 0;
// write tensor data
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
const size_t size = info->size;
const size_t size_pad = GGML_PAD(size, ctx->alignment);
gguf_bwrite_el(buf, info->data, size);
if (size_pad != size) {
uint8_t pad = 0;
for (size_t j = 0; j < size_pad - size; ++j) {
gguf_bwrite_el(buf, &pad, sizeof(pad));
}
}
GGML_ASSERT(offset == info->offset);
offset += size_pad;
}
}
void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
FILE * file = ggml_fopen(fname, "wb");
if (!file) {
GGML_ASSERT(false && "failed to open file for writing");
}
struct gguf_buf buf = gguf_buf_init(16*1024);
gguf_write_to_buf(ctx, &buf, only_meta);
fwrite(buf.data, 1, buf.offset, file);
gguf_buf_free(buf);
fclose(file);
}
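// Illustrative sketch (not part of the original source): writing a minimal file with one
// tensor, assuming 't' is an existing struct ggml_tensor * and 'ctx' was created with
// gguf_init_empty():
//
//     gguf_add_tensor(ctx, t);                    // records name, shape, type and data pointer
//     gguf_write_to_file(ctx, "out.gguf", false); // only_meta == false: also write tensor data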
size_t gguf_get_meta_size(const struct gguf_context * ctx) {
// no allocs - only compute size
struct gguf_buf buf = gguf_buf_init(0);
gguf_write_to_buf(ctx, &buf, true);
return buf.offset;
}
void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
struct gguf_buf buf = gguf_buf_init(16*1024);
gguf_write_to_buf(ctx, &buf, true);
memcpy(data, buf.data, buf.offset);
gguf_buf_free(buf);
}
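// Illustrative sketch (not part of the original source): callers that want the serialized
// metadata in memory query the size first and then fill a buffer:
//
//     const size_t n = gguf_get_meta_size(ctx);
//     void * meta = malloc(n);
//     gguf_get_meta_data(ctx, meta);
//     // ... use 'meta', then free(meta)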
////////////////////////////////////////////////////////////////////////////////
int ggml_cpu_has_avx(void) {
#if defined(__AVX__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_avx_vnni(void) {
#if defined(__AVXVNNI__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_avx2(void) {
#if defined(__AVX2__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_avx512(void) {
#if defined(__AVX512F__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_avx512_vbmi(void) {
#if defined(__AVX512VBMI__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_avx512_vnni(void) {
#if defined(__AVX512VNNI__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_fma(void) {
#if defined(__FMA__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_neon(void) {
#if defined(__ARM_NEON)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_arm_fma(void) {
#if defined(__ARM_FEATURE_FMA)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_metal(void) {
#if defined(GGML_USE_METAL)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_f16c(void) {
#if defined(__F16C__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_fp16_va(void) {
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_wasm_simd(void) {
#if defined(__wasm_simd128__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_blas(void) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_cuda(void) {
#if defined(GGML_USE_CUDA)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_clblast(void) {
#if defined(GGML_USE_CLBLAST)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_vulkan(void) {
#if defined(GGML_USE_VULKAN)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_kompute(void) {
#if defined(GGML_USE_KOMPUTE)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_sycl(void) {
#if defined(GGML_USE_SYCL)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_gpublas(void) {
return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
ggml_cpu_has_sycl();
}
int ggml_cpu_has_sse3(void) {
#if defined(__SSE3__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_ssse3(void) {
#if defined(__SSSE3__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_vsx(void) {
#if defined(__POWER9_VECTOR__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_matmul_int8(void) {
#if defined(__ARM_FEATURE_MATMUL_INT8)
return 1;
#else
return 0;
#endif
}
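// The ggml_cpu_has_* functions above report which features the library was compiled with
// (they test preprocessor macros set at build time, not runtime CPU state). An
// illustrative way an application might surface them (not part of the original source):
//
//     printf("AVX2 = %d | NEON = %d | FMA = %d | BLAS = %d\n",
//            ggml_cpu_has_avx2(), ggml_cpu_has_neon(), ggml_cpu_has_fma(), ggml_cpu_has_blas());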
////////////////////////////////////////////////////////////////////////////////