mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2024-12-24 06:46:37 +00:00
ruby : fix bindings (#2484)
* Improve Rakefile * Remove intermediate files * Remove unnecessary manipulations from extconf.rb * Add README and LINCENSE to source files * Manage ext source files using YAML file * Use extsources.yaml to include files into gem package file * Add git-managed source files to build dependency * Add test task * Download model for test if not exists * Add test for build * Ignore gem package directory * Enable GitHub action for Ruby binding * Fix model name * Build lib file for test * Use extension for each platform * Use extension for each platform on testing * Move built lib file rather than copy * Add intermediate files to clean targets
This commit is contained in:
parent
f7c99e49b3
commit
d3f7137cc9
65
.github/workflows/bindings-ruby.yml
vendored
Normal file
65
.github/workflows/bindings-ruby.yml
vendored
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
name: Bindings Tests (Ruby)
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
paths:
|
||||||
|
- bindings/ruby/**
|
||||||
|
- src/whisper.cpp
|
||||||
|
- include/whisper.h
|
||||||
|
- ggml/src/ggml.c
|
||||||
|
- ggml/src/ggml-impl.h
|
||||||
|
- ggml/src/ggml-aarch64.h
|
||||||
|
- ggml/src/ggml-aarch64.c
|
||||||
|
- ggml/src/ggml-alloc.c
|
||||||
|
- ggml/src/ggml-backend-impl.h
|
||||||
|
- ggml/src/ggml-backend.cpp
|
||||||
|
- ggml/src/ggml-common.h
|
||||||
|
- ggml/src/ggml-quants.h
|
||||||
|
- ggml/src/ggml-quants.c
|
||||||
|
- ggml/src/ggml-cpu-impl.h
|
||||||
|
- ggml/include/ggml.h
|
||||||
|
- ggml/include/ggml-alloc.h
|
||||||
|
- ggml/include/ggml-backend.h
|
||||||
|
- ggml/include/ggml-cuda.h
|
||||||
|
- ggml/include/ggml-kompute.h
|
||||||
|
- ggml/include/ggml-metal.h
|
||||||
|
- ggml/include/ggml-sycl.h
|
||||||
|
- ggml/include/ggml-vulkan.h
|
||||||
|
- examples/dr_wav.h
|
||||||
|
pull_request:
|
||||||
|
paths:
|
||||||
|
- bindings/ruby/**
|
||||||
|
- src/whisper.cpp
|
||||||
|
- include/whisper.h
|
||||||
|
- ggml/src/ggml.c
|
||||||
|
- ggml/src/ggml-impl.h
|
||||||
|
- ggml/src/ggml-aarch64.h
|
||||||
|
- ggml/src/ggml-aarch64.c
|
||||||
|
- ggml/src/ggml-alloc.c
|
||||||
|
- ggml/src/ggml-backend-impl.h
|
||||||
|
- ggml/src/ggml-backend.cpp
|
||||||
|
- ggml/src/ggml-common.h
|
||||||
|
- ggml/src/ggml-quants.h
|
||||||
|
- ggml/src/ggml-quants.c
|
||||||
|
- ggml/src/ggml-cpu-impl.h
|
||||||
|
- ggml/include/ggml.h
|
||||||
|
- ggml/include/ggml-alloc.h
|
||||||
|
- ggml/include/ggml-backend.h
|
||||||
|
- ggml/include/ggml-cuda.h
|
||||||
|
- ggml/include/ggml-kompute.h
|
||||||
|
- ggml/include/ggml-metal.h
|
||||||
|
- ggml/include/ggml-sycl.h
|
||||||
|
- ggml/include/ggml-vulkan.h
|
||||||
|
- examples/dr_wav.h
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
ubuntu-latest:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
working-directory: bindings/ruby
|
||||||
|
steps:
|
||||||
|
- uses: ruby/setup-ruby@v1
|
||||||
|
with:
|
||||||
|
ruby-version: '3.0'
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- run: rake test
|
23
.github/workflows/bindings-ruby.yml.disabled
vendored
23
.github/workflows/bindings-ruby.yml.disabled
vendored
@ -1,23 +0,0 @@
|
|||||||
# TODO: fix this workflow file, disabled for now
|
|
||||||
name: Bindings Tests (Ruby)
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
paths:
|
|
||||||
- bindings/ruby/**
|
|
||||||
- whisper.h
|
|
||||||
pull_request:
|
|
||||||
paths:
|
|
||||||
- bindings/ruby/**
|
|
||||||
- whisper.h
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
ubuntu-latest:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- uses: ruby/setup-ruby@v1
|
|
||||||
with:
|
|
||||||
ruby-version: '3.0'
|
|
||||||
- uses: actions/checkout@v1
|
|
||||||
- run: |
|
|
||||||
cd bindings/ruby/ext
|
|
||||||
ruby extconf.rb && make
|
|
4
bindings/ruby/.gitignore
vendored
Normal file
4
bindings/ruby/.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
README.md
|
||||||
|
LICENSE
|
||||||
|
pkg/
|
||||||
|
lib/whisper.*
|
@ -1,12 +1,55 @@
|
|||||||
require 'rake/clean'
|
require 'rake/clean'
|
||||||
require 'rubygems/package'
|
require "bundler/gem_tasks"
|
||||||
|
require "pathname"
|
||||||
|
require "yaml"
|
||||||
|
require "rake/testtask"
|
||||||
|
|
||||||
desc 'Build gem'
|
extsources = YAML.load_file("extsources.yaml")
|
||||||
task :package do
|
extsources.each_pair do |src_dir, dests|
|
||||||
spec_source = File.read File.join(File.dirname(__FILE__),'whispercpp.gemspec')
|
dests.each do |dest|
|
||||||
spec = nil
|
src = Pathname(src_dir)/File.basename(dest)
|
||||||
# see: http://gist.github.com/16215
|
|
||||||
Thread.new { spec = eval("#{spec_source}") }.join
|
file src
|
||||||
spec.validate
|
file dest => src do |t|
|
||||||
Gem::Package.build(spec)
|
cp t.source, t.name
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
SOURCES = extsources.values.flatten
|
||||||
|
CLEAN.include SOURCES
|
||||||
|
CLEAN.include FileList["ext/*.o", "ext/whisper.so", "ext/whisper.bundle", "ext/whisper.dll"]
|
||||||
|
|
||||||
|
task build: SOURCES + FileList[
|
||||||
|
"ext/extconf.rb",
|
||||||
|
"ext/ruby_whisper.h",
|
||||||
|
"ext/ruby_whisper.cpp",
|
||||||
|
"whispercpp.gemspec",
|
||||||
|
]
|
||||||
|
|
||||||
|
directory "pkg"
|
||||||
|
CLOBBER.include "pkg"
|
||||||
|
|
||||||
|
TEST_MODEL = "../../models/ggml-base.en.bin"
|
||||||
|
LIB_NAME = "whisper".ext(RbConfig::CONFIG["DLEXT"])
|
||||||
|
LIB_FILE = File.join("lib", LIB_NAME)
|
||||||
|
|
||||||
|
directory "lib"
|
||||||
|
task LIB_FILE => SOURCES + ["lib"] do |t|
|
||||||
|
Dir.chdir "ext" do
|
||||||
|
sh "ruby extconf.rb"
|
||||||
|
sh "make"
|
||||||
|
end
|
||||||
|
mv "ext/#{LIB_NAME}", t.name
|
||||||
|
end
|
||||||
|
CLEAN.include LIB_FILE
|
||||||
|
|
||||||
|
Rake::TestTask.new do |t|
|
||||||
|
t.test_files = FileList["tests/test_*.rb"]
|
||||||
|
end
|
||||||
|
task test: [TEST_MODEL, LIB_FILE]
|
||||||
|
|
||||||
|
file TEST_MODEL do
|
||||||
|
Dir.chdir "../.." do
|
||||||
|
sh "./models/download-ggml-model.sh base.en"
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
21
bindings/ruby/ext/.gitignore
vendored
21
bindings/ruby/ext/.gitignore
vendored
@ -3,7 +3,26 @@ ggml.c
|
|||||||
ggml.h
|
ggml.h
|
||||||
ggml-alloc.c
|
ggml-alloc.c
|
||||||
ggml-alloc.h
|
ggml-alloc.h
|
||||||
whisper.bundle
|
ggml-aarch64.c
|
||||||
|
ggml-aarch64.h
|
||||||
|
ggml-backend.cpp
|
||||||
|
ggml-backend-impl.h
|
||||||
|
ggml-backend.c
|
||||||
|
ggml-backend.h
|
||||||
|
ggml-common.h
|
||||||
|
ggml-cpu-impl.h
|
||||||
|
ggml-cuda.h
|
||||||
|
ggml-impl.h
|
||||||
|
ggml-kompute.h
|
||||||
|
ggml-metal.h
|
||||||
|
ggml-opencl.h
|
||||||
|
ggml-quants.c
|
||||||
|
ggml-quants.h
|
||||||
|
ggml-sycl.h
|
||||||
|
ggml-vulkan.h
|
||||||
whisper.cpp
|
whisper.cpp
|
||||||
whisper.h
|
whisper.h
|
||||||
dr_wav.h
|
dr_wav.h
|
||||||
|
whisper.bundle
|
||||||
|
whisper.so
|
||||||
|
whisper.dll
|
||||||
|
@ -1,21 +1,4 @@
|
|||||||
require 'mkmf'
|
require 'mkmf'
|
||||||
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.cpp')} .")
|
|
||||||
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.h')} .")
|
|
||||||
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.h')} .")
|
|
||||||
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.c')} .")
|
|
||||||
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-impl.h')} .")
|
|
||||||
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-aarch64.h')} .")
|
|
||||||
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-aarch64.c')} .")
|
|
||||||
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.h')} .")
|
|
||||||
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.c')} .")
|
|
||||||
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend-impl.h')} .")
|
|
||||||
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.h')} .")
|
|
||||||
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.cpp')} .")
|
|
||||||
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-common.h')} .")
|
|
||||||
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.h')} .")
|
|
||||||
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.c')} .")
|
|
||||||
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','examples','dr_wav.h')} .")
|
|
||||||
|
|
||||||
|
|
||||||
# need to use c++ compiler flags
|
# need to use c++ compiler flags
|
||||||
$CXXFLAGS << ' -std=c++11'
|
$CXXFLAGS << ' -std=c++11'
|
||||||
|
@ -1,141 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
// ggml-backend internal header
|
|
||||||
|
|
||||||
#include "ggml-backend.h"
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
//
|
|
||||||
// Backend buffer
|
|
||||||
//
|
|
||||||
|
|
||||||
// buffer type
|
|
||||||
typedef void * ggml_backend_buffer_type_context_t;
|
|
||||||
|
|
||||||
struct ggml_backend_buffer_type_i {
|
|
||||||
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
|
|
||||||
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
|
||||||
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
|
||||||
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
|
|
||||||
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
|
||||||
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
|
|
||||||
// check if tensor data is in host memory
|
|
||||||
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
|
|
||||||
bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ggml_backend_buffer_type {
|
|
||||||
struct ggml_backend_buffer_type_i iface;
|
|
||||||
ggml_backend_buffer_type_context_t context;
|
|
||||||
};
|
|
||||||
|
|
||||||
// buffer
|
|
||||||
typedef void * ggml_backend_buffer_context_t;
|
|
||||||
|
|
||||||
struct ggml_backend_buffer_i {
|
|
||||||
const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
|
|
||||||
void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
|
|
||||||
void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
|
|
||||||
void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
|
||||||
void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
|
||||||
void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
|
||||||
bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
|
|
||||||
void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
|
|
||||||
void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ggml_backend_buffer {
|
|
||||||
struct ggml_backend_buffer_i iface;
|
|
||||||
ggml_backend_buffer_type_t buft;
|
|
||||||
ggml_backend_buffer_context_t context;
|
|
||||||
size_t size;
|
|
||||||
enum ggml_backend_buffer_usage usage;
|
|
||||||
};
|
|
||||||
|
|
||||||
GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
|
|
||||||
ggml_backend_buffer_type_t buft,
|
|
||||||
struct ggml_backend_buffer_i iface,
|
|
||||||
ggml_backend_buffer_context_t context,
|
|
||||||
size_t size);
|
|
||||||
|
|
||||||
// do not use directly, use ggml_backend_tensor_copy instead
|
|
||||||
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
|
|
||||||
|
|
||||||
// buffer that contains a collection of buffers
|
|
||||||
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
|
|
||||||
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
|
|
||||||
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
|
||||||
|
|
||||||
//
|
|
||||||
// Backend
|
|
||||||
//
|
|
||||||
|
|
||||||
typedef void * ggml_backend_context_t;
|
|
||||||
|
|
||||||
struct ggml_backend_i {
|
|
||||||
const char * (*GGML_CALL get_name)(ggml_backend_t backend);
|
|
||||||
|
|
||||||
void (*GGML_CALL free)(ggml_backend_t backend);
|
|
||||||
|
|
||||||
// buffer allocation
|
|
||||||
ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
|
|
||||||
|
|
||||||
// (optional) asynchronous tensor data access
|
|
||||||
void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
|
||||||
void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
|
||||||
bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
|
||||||
|
|
||||||
// (optional) complete all pending operations
|
|
||||||
void (*GGML_CALL synchronize)(ggml_backend_t backend);
|
|
||||||
|
|
||||||
// compute graph with a plan (not used currently)
|
|
||||||
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
|
||||||
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
|
||||||
|
|
||||||
// compute graph with a plan
|
|
||||||
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
|
||||||
// compute graph without a plan (async)
|
|
||||||
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
|
||||||
|
|
||||||
// check if the backend supports an operation
|
|
||||||
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
|
||||||
|
|
||||||
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
|
|
||||||
// these should be expensive operations with large batch sizes that may benefit from running on this backend
|
|
||||||
// even if the weight has to be copied from the CPU temporarily
|
|
||||||
bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
|
||||||
|
|
||||||
// (optional) event synchronization
|
|
||||||
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
|
|
||||||
void (*GGML_CALL event_free) (ggml_backend_event_t event);
|
|
||||||
void (*GGML_CALL event_record) (ggml_backend_event_t event);
|
|
||||||
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
|
|
||||||
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ggml_backend {
|
|
||||||
ggml_guid_t guid;
|
|
||||||
|
|
||||||
struct ggml_backend_i iface;
|
|
||||||
ggml_backend_context_t context;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ggml_backend_event {
|
|
||||||
ggml_backend_t backend;
|
|
||||||
void * context;
|
|
||||||
};
|
|
||||||
|
|
||||||
//
|
|
||||||
// Backend registry
|
|
||||||
//
|
|
||||||
|
|
||||||
typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
|
|
||||||
|
|
||||||
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
File diff suppressed because it is too large
Load Diff
@ -1,233 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
#include "ggml-alloc.h"
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
|
|
||||||
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
|
|
||||||
typedef struct ggml_backend_event * ggml_backend_event_t;
|
|
||||||
typedef struct ggml_backend * ggml_backend_t;
|
|
||||||
typedef void * ggml_backend_graph_plan_t;
|
|
||||||
|
|
||||||
//
|
|
||||||
// Backend buffer
|
|
||||||
//
|
|
||||||
|
|
||||||
// buffer type
|
|
||||||
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
|
|
||||||
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
|
|
||||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
|
||||||
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
|
|
||||||
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
|
||||||
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
|
||||||
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
|
||||||
|
|
||||||
// buffer
|
|
||||||
enum ggml_backend_buffer_usage {
|
|
||||||
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
|
|
||||||
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
|
|
||||||
};
|
|
||||||
|
|
||||||
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
|
|
||||||
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
|
||||||
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
|
|
||||||
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
|
||||||
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
|
||||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
|
||||||
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
|
|
||||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
|
||||||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
|
||||||
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
|
||||||
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
|
|
||||||
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
|
|
||||||
|
|
||||||
//
|
|
||||||
// Backend
|
|
||||||
//
|
|
||||||
|
|
||||||
GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
|
|
||||||
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
|
|
||||||
GGML_API void ggml_backend_free(ggml_backend_t backend);
|
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
|
|
||||||
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
|
|
||||||
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
|
|
||||||
GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
|
|
||||||
|
|
||||||
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
|
||||||
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
|
||||||
|
|
||||||
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
|
||||||
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
|
||||||
|
|
||||||
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
|
|
||||||
|
|
||||||
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
|
||||||
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
|
||||||
|
|
||||||
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
|
||||||
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
|
||||||
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
|
||||||
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
|
||||||
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
|
||||||
|
|
||||||
// tensor copy between different backends
|
|
||||||
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
|
|
||||||
|
|
||||||
// asynchronous copy
|
|
||||||
// the copy is performed after all the currently queued operations in backend_src
|
|
||||||
// backend_dst will wait for the copy to complete before performing other operations
|
|
||||||
// automatic fallback to sync copy if async is not supported
|
|
||||||
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
|
|
||||||
|
|
||||||
// events
|
|
||||||
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
|
|
||||||
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
|
|
||||||
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
|
|
||||||
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
|
|
||||||
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
|
|
||||||
|
|
||||||
//
|
|
||||||
// CPU backend
|
|
||||||
//
|
|
||||||
|
|
||||||
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
|
||||||
|
|
||||||
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
|
|
||||||
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
|
||||||
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
|
||||||
|
|
||||||
// Create a backend buffer from an existing pointer
|
|
||||||
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
|
|
||||||
|
|
||||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
|
||||||
|
|
||||||
#ifdef GGML_USE_CPU_HBM
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
//
|
|
||||||
// Backend registry
|
|
||||||
//
|
|
||||||
|
|
||||||
// The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
|
|
||||||
|
|
||||||
GGML_API size_t ggml_backend_reg_get_count(void);
|
|
||||||
GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
|
|
||||||
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
|
|
||||||
GGML_API const char * ggml_backend_reg_get_name(size_t i);
|
|
||||||
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
|
|
||||||
GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
|
|
||||||
|
|
||||||
//
|
|
||||||
// Backend scheduler
|
|
||||||
//
|
|
||||||
|
|
||||||
// The backend scheduler allows for multiple backends to be used together
|
|
||||||
// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
|
|
||||||
// The backends are selected based on:
|
|
||||||
// - the backend that supports the operation
|
|
||||||
// - the location of the pre-allocated tensors (e.g. the weights)
|
|
||||||
/*
|
|
||||||
Example usage:
|
|
||||||
|
|
||||||
// operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
|
|
||||||
// preferrably to run on the same backend as the buffer
|
|
||||||
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
|
||||||
|
|
||||||
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
|
|
||||||
|
|
||||||
// initialize buffers from a max size graph (optional)
|
|
||||||
reserve_graph = build_graph(sched, max_batch_size);
|
|
||||||
|
|
||||||
// manually assign nodes to a backend (optional, should not be needed in most cases)
|
|
||||||
struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
|
|
||||||
ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
|
|
||||||
|
|
||||||
ggml_backend_sched_reserve(sched, reserve_graph);
|
|
||||||
|
|
||||||
// compute
|
|
||||||
graph = build_graph(sched);
|
|
||||||
ggml_backend_sched_graph_compute(sched, graph);
|
|
||||||
|
|
||||||
// if there are graph inputs:
|
|
||||||
ggml_backend_sched_reset(sched);
|
|
||||||
ggml_backend_sched_alloc_graph(sched, graph);
|
|
||||||
ggml_backend_tensor_set(input_tensor, ...);
|
|
||||||
ggml_backend_sched_graph_compute(sched, graph);
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
struct ggml_backend_sched;
|
|
||||||
typedef struct ggml_backend_sched * ggml_backend_sched_t;
|
|
||||||
|
|
||||||
// when ask == true, the scheduler wants to know if the user wants to observe this node
|
|
||||||
// this allows the scheduler to batch nodes together in order to evaluate them in a single call
|
|
||||||
//
|
|
||||||
// when ask == false, the scheduler is passing the node tensor to the user for observation
|
|
||||||
// if the user returns false, the scheduler will cancel the graph compute
|
|
||||||
//
|
|
||||||
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
|
|
||||||
|
|
||||||
// Initialize a backend scheduler
|
|
||||||
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
|
|
||||||
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
|
||||||
|
|
||||||
// Initialize backend buffers from a measure graph
|
|
||||||
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
|
|
||||||
|
|
||||||
// Get the number of splits of the last graph
|
|
||||||
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
|
|
||||||
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
|
|
||||||
|
|
||||||
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
|
||||||
|
|
||||||
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
|
||||||
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
|
||||||
|
|
||||||
// Allocate and compute graph on the backend scheduler
|
|
||||||
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
|
||||||
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
|
||||||
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
|
||||||
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
|
|
||||||
|
|
||||||
// Reset all assignments and allocators - must be called before changing the node backends
|
|
||||||
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
|
|
||||||
|
|
||||||
// Set a callback to be called for each resulting node during graph compute
|
|
||||||
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
|
|
||||||
|
|
||||||
//
|
|
||||||
// Utils
|
|
||||||
//
|
|
||||||
|
|
||||||
struct ggml_backend_graph_copy {
|
|
||||||
ggml_backend_buffer_t buffer;
|
|
||||||
struct ggml_context * ctx_allocated;
|
|
||||||
struct ggml_context * ctx_unallocated;
|
|
||||||
struct ggml_cgraph * graph;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Copy a graph to a different backend
|
|
||||||
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
|
|
||||||
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
|
|
||||||
|
|
||||||
typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
|
|
||||||
|
|
||||||
// Compare the output of two backends
|
|
||||||
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
|
|
||||||
|
|
||||||
// Tensor initialization
|
|
||||||
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
|
||||||
GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
File diff suppressed because it is too large
Load Diff
@ -1,43 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
#include "ggml-backend.h"
|
|
||||||
|
|
||||||
#ifdef GGML_USE_HIPBLAS
|
|
||||||
#define GGML_CUDA_NAME "ROCm"
|
|
||||||
#define GGML_CUBLAS_NAME "hipBLAS"
|
|
||||||
#else
|
|
||||||
#define GGML_CUDA_NAME "CUDA"
|
|
||||||
#define GGML_CUBLAS_NAME "cuBLAS"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define GGML_CUDA_MAX_DEVICES 16
|
|
||||||
|
|
||||||
// backend API
|
|
||||||
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
|
|
||||||
|
|
||||||
GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
|
|
||||||
|
|
||||||
// device buffer
|
|
||||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
|
||||||
|
|
||||||
// split tensor buffer that splits matrices by rows across multiple devices
|
|
||||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
|
|
||||||
|
|
||||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
|
||||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
|
||||||
|
|
||||||
GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
|
|
||||||
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
|
||||||
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
|
||||||
|
|
||||||
GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
|
|
||||||
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
@ -1,272 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
|
|
||||||
// GGML internal header
|
|
||||||
|
|
||||||
#include <assert.h>
|
|
||||||
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <stdbool.h>
|
|
||||||
#include <string.h> // memcpy
|
|
||||||
#include <math.h> // fabsf
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// static_assert should be a #define, but if it's not,
|
|
||||||
// fall back to the _Static_assert C11 keyword.
|
|
||||||
// if C99 - static_assert is noop
|
|
||||||
// ref: https://stackoverflow.com/a/53923785/4039976
|
|
||||||
#ifndef __cplusplus
|
|
||||||
#ifndef static_assert
|
|
||||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
|
||||||
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
|
||||||
#else
|
|
||||||
#define static_assert(cond, msg) struct global_scope_noop_trick
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
|
||||||
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
|
||||||
#ifndef __FMA__
|
|
||||||
#define __FMA__
|
|
||||||
#endif
|
|
||||||
#ifndef __F16C__
|
|
||||||
#define __F16C__
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
|
|
||||||
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
|
|
||||||
#ifndef __SSE3__
|
|
||||||
#define __SSE3__
|
|
||||||
#endif
|
|
||||||
#ifndef __SSSE3__
|
|
||||||
#define __SSSE3__
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// 16-bit float
|
|
||||||
// on Arm, we use __fp16
|
|
||||||
// on x86, we use uint16_t
|
|
||||||
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
|
||||||
|
|
||||||
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
|
||||||
//
|
|
||||||
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
|
||||||
//
|
|
||||||
#include <arm_neon.h>
|
|
||||||
|
|
||||||
typedef __fp16 ggml_fp16_internal_t;
|
|
||||||
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
||||||
|
|
||||||
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
||||||
|
|
||||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
||||||
ggml_fp16_internal_t tmp;
|
|
||||||
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
|
||||||
return (float)tmp;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
||||||
ggml_fp16_t res;
|
|
||||||
ggml_fp16_internal_t tmp = f;
|
|
||||||
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
typedef uint16_t ggml_fp16_internal_t;
|
|
||||||
|
|
||||||
#ifdef __wasm_simd128__
|
|
||||||
#include <wasm_simd128.h>
|
|
||||||
#else
|
|
||||||
#ifdef __POWER9_VECTOR__
|
|
||||||
#include <altivec.h>
|
|
||||||
#undef bool
|
|
||||||
#define bool _Bool
|
|
||||||
#else
|
|
||||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
|
||||||
#include <intrin.h>
|
|
||||||
#else
|
|
||||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
|
|
||||||
#if !defined(__riscv)
|
|
||||||
#include <immintrin.h>
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __riscv_v_intrinsic
|
|
||||||
#include <riscv_vector.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __F16C__
|
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
|
||||||
#else
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#elif defined(__POWER9_VECTOR__)
|
|
||||||
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
||||||
/* the inline asm below is about 12% faster than the lookup method */
|
|
||||||
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
|
||||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
|
||||||
|
|
||||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
||||||
register float f;
|
|
||||||
register double d;
|
|
||||||
__asm__(
|
|
||||||
"mtfprd %0,%2\n"
|
|
||||||
"xscvhpdp %0,%0\n"
|
|
||||||
"frsp %1,%0\n" :
|
|
||||||
/* temp */ "=d"(d),
|
|
||||||
/* out */ "=f"(f):
|
|
||||||
/* in */ "r"(h));
|
|
||||||
return f;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
||||||
register double d;
|
|
||||||
register ggml_fp16_t r;
|
|
||||||
__asm__( /* xscvdphp can work on double or single precision */
|
|
||||||
"xscvdphp %0,%2\n"
|
|
||||||
"mffprd %1,%0\n" :
|
|
||||||
/* temp */ "=d"(d),
|
|
||||||
/* out */ "=r"(r):
|
|
||||||
/* in */ "f"(f));
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
// FP16 <-> FP32
|
|
||||||
// ref: https://github.com/Maratyszcza/FP16
|
|
||||||
|
|
||||||
static inline float fp32_from_bits(uint32_t w) {
|
|
||||||
union {
|
|
||||||
uint32_t as_bits;
|
|
||||||
float as_value;
|
|
||||||
} fp32;
|
|
||||||
fp32.as_bits = w;
|
|
||||||
return fp32.as_value;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline uint32_t fp32_to_bits(float f) {
|
|
||||||
union {
|
|
||||||
float as_value;
|
|
||||||
uint32_t as_bits;
|
|
||||||
} fp32;
|
|
||||||
fp32.as_value = f;
|
|
||||||
return fp32.as_bits;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
||||||
const uint32_t w = (uint32_t) h << 16;
|
|
||||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
|
||||||
const uint32_t two_w = w + w;
|
|
||||||
|
|
||||||
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
|
|
||||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
|
||||||
const float exp_scale = 0x1.0p-112f;
|
|
||||||
#else
|
|
||||||
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
|
|
||||||
#endif
|
|
||||||
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
|
|
||||||
|
|
||||||
const uint32_t magic_mask = UINT32_C(126) << 23;
|
|
||||||
const float magic_bias = 0.5f;
|
|
||||||
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
|
|
||||||
|
|
||||||
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
|
|
||||||
const uint32_t result = sign |
|
|
||||||
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
|
|
||||||
return fp32_from_bits(result);
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
||||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
|
||||||
const float scale_to_inf = 0x1.0p+112f;
|
|
||||||
const float scale_to_zero = 0x1.0p-110f;
|
|
||||||
#else
|
|
||||||
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
|
|
||||||
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
|
|
||||||
#endif
|
|
||||||
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
|
||||||
|
|
||||||
const uint32_t w = fp32_to_bits(f);
|
|
||||||
const uint32_t shl1_w = w + w;
|
|
||||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
|
||||||
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
|
||||||
if (bias < UINT32_C(0x71000000)) {
|
|
||||||
bias = UINT32_C(0x71000000);
|
|
||||||
}
|
|
||||||
|
|
||||||
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
|
||||||
const uint32_t bits = fp32_to_bits(base);
|
|
||||||
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
|
||||||
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
|
||||||
const uint32_t nonsign = exp_bits + mantissa_bits;
|
|
||||||
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
|
||||||
}
|
|
||||||
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
||||||
|
|
||||||
#endif // __F16C__
|
|
||||||
|
|
||||||
#endif // __ARM_NEON
|
|
||||||
|
|
||||||
// precomputed f32 table for f16 (256 KB)
|
|
||||||
// defined in ggml.c, initialized in ggml_init()
|
|
||||||
extern float ggml_table_f32_f16[1 << 16];
|
|
||||||
|
|
||||||
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
|
||||||
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
|
||||||
// This is also true for POWER9.
|
|
||||||
#if !defined(GGML_FP16_TO_FP32)
|
|
||||||
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
|
||||||
uint16_t s;
|
|
||||||
memcpy(&s, &f, sizeof(uint16_t));
|
|
||||||
return ggml_table_f32_f16[s];
|
|
||||||
}
|
|
||||||
|
|
||||||
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if !defined(GGML_FP32_TO_FP16)
|
|
||||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define GGML_HASHTABLE_FULL ((size_t)-1)
|
|
||||||
#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
|
|
||||||
|
|
||||||
struct ggml_hash_set ggml_hash_set_new(size_t size);
|
|
||||||
|
|
||||||
bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
|
|
||||||
|
|
||||||
// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
|
|
||||||
size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
|
|
||||||
|
|
||||||
// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
|
|
||||||
size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key);
|
|
||||||
|
|
||||||
// return index, asserts if table is full
|
|
||||||
size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
@ -1,46 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
#include "ggml-backend.h"
|
|
||||||
|
|
||||||
#include <stdbool.h>
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct ggml_vk_device {
|
|
||||||
int index;
|
|
||||||
int type; // same as VkPhysicalDeviceType
|
|
||||||
size_t heapSize;
|
|
||||||
const char * name;
|
|
||||||
const char * vendor;
|
|
||||||
int subgroupSize;
|
|
||||||
uint64_t bufferAlignment;
|
|
||||||
uint64_t maxAlloc;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
|
|
||||||
bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
|
|
||||||
bool ggml_vk_has_vulkan(void);
|
|
||||||
bool ggml_vk_has_device(void);
|
|
||||||
struct ggml_vk_device ggml_vk_current_device(void);
|
|
||||||
|
|
||||||
//
|
|
||||||
// backend API
|
|
||||||
//
|
|
||||||
|
|
||||||
// forward declaration
|
|
||||||
typedef struct ggml_backend * ggml_backend_t;
|
|
||||||
|
|
||||||
GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
|
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
|
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
@ -1,66 +0,0 @@
|
|||||||
// An interface allowing to compute ggml_cgraph with Metal
|
|
||||||
//
|
|
||||||
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
|
|
||||||
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
|
|
||||||
//
|
|
||||||
// How it works?
|
|
||||||
//
|
|
||||||
// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
|
|
||||||
// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
|
|
||||||
// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
|
|
||||||
//
|
|
||||||
// You only need to make sure that all memory buffers that you used during the graph creation
|
|
||||||
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
|
|
||||||
// used during the graph evaluation to determine the arguments of the compute kernels.
|
|
||||||
//
|
|
||||||
// Synchronization between device and host memory (for example for input and output tensors)
|
|
||||||
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
|
|
||||||
//
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
#include "ggml-backend.h"
|
|
||||||
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <stdbool.h>
|
|
||||||
|
|
||||||
// max memory buffers that can be mapped to the device
|
|
||||||
#define GGML_METAL_MAX_BUFFERS 64
|
|
||||||
|
|
||||||
struct ggml_tensor;
|
|
||||||
struct ggml_cgraph;
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
//
|
|
||||||
// backend API
|
|
||||||
// user-code should use only these functions
|
|
||||||
//
|
|
||||||
|
|
||||||
GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
|
|
||||||
|
|
||||||
GGML_API ggml_backend_t ggml_backend_metal_init(void);
|
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
|
||||||
|
|
||||||
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
|
|
||||||
|
|
||||||
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
|
|
||||||
|
|
||||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
|
||||||
|
|
||||||
// helper to check if the device supports a specific family
|
|
||||||
// ideally, the user code should be doing these checks
|
|
||||||
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
|
|
||||||
GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
|
|
||||||
|
|
||||||
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
|
|
||||||
GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
@ -1,36 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
#include "ggml-backend.h"
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
GGML_API void ggml_cl_init(void);
|
|
||||||
|
|
||||||
GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
|
||||||
GGML_API void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
|
||||||
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
|
|
||||||
GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
|
||||||
GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
|
||||||
|
|
||||||
// GGML_API void * ggml_cl_host_malloc(size_t size);
|
|
||||||
// GGML_API void ggml_cl_host_free(void * ptr);
|
|
||||||
|
|
||||||
GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
|
|
||||||
|
|
||||||
GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
|
|
||||||
|
|
||||||
// backend API
|
|
||||||
|
|
||||||
// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
|
|
||||||
|
|
||||||
// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
|
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
|
|
||||||
// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
File diff suppressed because it is too large
Load Diff
@ -1,133 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#define GGML_COMMON_DECL_C
|
|
||||||
#include "ggml-common.h"
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
|
|
||||||
// GGML internal header
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Quantization
|
|
||||||
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
|
||||||
|
|
||||||
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
|
||||||
|
|
||||||
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
|
||||||
|
|
||||||
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
|
|
||||||
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
|
|
||||||
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
||||||
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

// Dequantization
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_s  (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq1_s  (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq1_m  (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq3_s  (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_m_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq2_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq1_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq1_m  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

void iq2xs_init_impl(enum ggml_type type);
void iq2xs_free_impl(enum ggml_type type);
void iq3xs_init_impl(int grid_size);
void iq3xs_free_impl(int grid_size);

#ifdef __cplusplus
}
#endif
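Editorial note: taken together, these declarations form a small round-trip API — quantize_* packs rows of floats into blocks, dequantize_row_* unpacks them, and ggml_vec_dot_* works directly on the packed data; the imatrix argument of the quantize_* family carries optional per-column importance weights. Below is a hedged sketch (not part of this commit) of that round trip for Q8_0. It assumes compilation inside the ggml source tree so that "ggml-quants.h" and its block types are on the include path, and that ggml_row_size() from ggml.h reports the packed row size; the error bound printed is illustrative only.

// Hedged sketch: round-trip one row of floats through Q8_0 and measure the error.
// Assumes a build within the ggml source tree; link against the ggml objects and -lm.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "ggml.h"
#include "ggml-quants.h"

int main(void) {
    const int64_t n = 64;                       // must be a multiple of the Q8_0 block size (32)
    float src[64], out[64];
    for (int64_t i = 0; i < n; ++i) src[i] = sinf((float) i * 0.1f);

    // one row of n elements; imatrix == NULL means plain (non importance-weighted) quantization
    void * packed = malloc(ggml_row_size(GGML_TYPE_Q8_0, n));
    size_t bytes  = quantize_q8_0(src, packed, /*nrows=*/1, /*n_per_row=*/n, /*imatrix=*/NULL);

    dequantize_row_q8_0((const block_q8_0 *) packed, out, n);

    float max_err = 0.0f;
    for (int64_t i = 0; i < n; ++i) max_err = fmaxf(max_err, fabsf(out[i] - src[i]));
    printf("packed %zu bytes, max abs round-trip error %.5f\n", bytes, max_err);

    free(packed);
    return 0;
}

The same pattern applies to the K-quant and IQ types, except that the IQ codebook types also require iq2xs_init_impl()/iq3xs_init_impl() to be called first and genuinely benefit from a non-NULL imatrix.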
@ -1,49 +0,0 @@
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//

#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

#define GGML_SYCL_MAX_DEVICES 48
#define GGML_SYCL_NAME "SYCL"

// backend API
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);

// devide buffer
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);

// split tensor buffer that splits matrices by rows across multiple devices
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);

// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);

GGML_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);

// TODO: these are temporary
// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();

// SYCL doesn't support registering host memory, keep here for reference
// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);

#ifdef __cplusplus
}
#endif
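Editorial note: the removed header above is the public surface of the SYCL backend — one init call per device plus helpers to enumerate devices and query memory. A hedged sketch (not part of this commit) of how a caller drives that API, assuming a ggml build configured with the SYCL backend enabled:

// Hedged sketch: probe SYCL devices and bring up a backend on device 0.
#include <stdio.h>
#include "ggml-backend.h"
#include "ggml-sycl.h"

int main(void) {
    int n_dev = ggml_backend_sycl_get_device_count();
    printf("SYCL devices: %d\n", n_dev);

    for (int i = 0; i < n_dev; ++i) {
        char desc[256];
        size_t free_mem = 0, total_mem = 0;
        ggml_sycl_get_device_description(i, desc, sizeof(desc));
        ggml_backend_sycl_get_device_memory(i, &free_mem, &total_mem);
        printf("  [%d] %s, %zu / %zu MiB free\n", i, desc,
               free_mem / (1024 * 1024), total_mem / (1024 * 1024));
    }

    if (n_dev > 0) {
        // tensors placed on this backend live in device buffers from ggml_backend_sycl_buffer_type()
        ggml_backend_t backend = ggml_backend_sycl_init(0);
        if (backend) {
            // ... build and compute a ggml graph here ...
            ggml_backend_free(backend);
        }
    }
    return 0;
}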
@ -1,29 +0,0 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

#define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16

GGML_API void ggml_vk_instance_init(void);

// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);

GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);

GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);

#ifdef __cplusplus
}
#endif
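Editorial note: the removed Vulkan header follows the same shape as the SYCL one; the extra piece worth showing is the pinned host buffer type used for staging copies. A hedged sketch (not part of this commit), assuming a ggml build with the Vulkan backend enabled:

// Hedged sketch: initialize the Vulkan backend and allocate a pinned staging buffer.
#include <stdio.h>
#include "ggml-backend.h"
#include "ggml-vulkan.h"

int main(void) {
    if (ggml_backend_vk_get_device_count() == 0) {
        fprintf(stderr, "no Vulkan devices found\n");
        return 1;
    }

    char desc[256];
    ggml_backend_vk_get_device_description(0, desc, sizeof(desc));
    printf("using Vulkan device 0: %s\n", desc);

    ggml_backend_t backend = ggml_backend_vk_init(0);
    if (backend == NULL || !ggml_backend_is_vk(backend)) {
        fprintf(stderr, "backend init failed\n");
        return 1;
    }

    // buffers allocated from the host buffer type are pinned, so copies to and
    // from the device avoid an extra bounce through pageable memory
    ggml_backend_buffer_t staging =
        ggml_backend_buft_alloc_buffer(ggml_backend_vk_host_buffer_type(), 1024 * 1024);

    // ... fill the staging buffer and run a graph here ...

    ggml_backend_buffer_free(staging);
    ggml_backend_free(backend);
    return 0;
}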
32
bindings/ruby/extsources.yaml
Normal file
@ -0,0 +1,32 @@
---
../../src:
- ext/whisper.cpp
../../include:
- ext/whisper.h
../../ggml/src:
- ext/ggml.c
- ext/ggml-impl.h
- ext/ggml-aarch64.h
- ext/ggml-aarch64.c
- ext/ggml-alloc.c
- ext/ggml-backend-impl.h
- ext/ggml-backend.cpp
- ext/ggml-common.h
- ext/ggml-quants.h
- ext/ggml-quants.c
- ext/ggml-cpu-impl.h
../../ggml/include:
- ext/ggml.h
- ext/ggml-alloc.h
- ext/ggml-backend.h
- ext/ggml-cuda.h
- ext/ggml-kompute.h
- ext/ggml-metal.h
- ext/ggml-sycl.h
- ext/ggml-vulkan.h
../../examples:
- ext/dr_wav.h
../..:
- README.md
- LICENSE
@ -1,11 +1,10 @@
 TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
-EXTDIR = File.join(TOPDIR, 'ext')
-#$LIBDIR = File.join(TOPDIR, 'lib')
-#$:.unshift(LIBDIR)
-$:.unshift(EXTDIR)
 
 require 'whisper'
 require 'test/unit'
+require 'tempfile'
+require 'tmpdir'
+require 'shellwords'
 
 class TestWhisper < Test::Unit::TestCase
   def setup
@ -128,4 +127,25 @@ class TestWhisper < Test::Unit::TestCase
     }
   end
 
+  def test_build
+    Tempfile.create do |file|
+      assert system("gem", "build", "whispercpp.gemspec", "--output", file.to_path.shellescape, exception: true)
+      assert_path_exist file.to_path
+    end
+  end
+
+  sub_test_case "Building binary on installation" do
+    def setup
+      system "rake", "build", exception: true
+    end
+
+    def test_install
+      filename = `rake -Tbuild`.match(/(whispercpp-(?:.+)\.gem)/)[1]
+      basename = "whisper.#{RbConfig::CONFIG["DLEXT"]}"
+      Dir.mktmpdir do |dir|
+        system "gem", "install", "--install-dir", dir.shellescape, "pkg/#{filename.shellescape}", exception: true
+        assert_path_exist File.join(dir, "gems/whispercpp-1.3.0/lib", basename)
+      end
+    end
+  end
 end
@ -1,3 +1,5 @@
+require "yaml"
+
 Gem::Specification.new do |s|
   s.name = "whispercpp"
   s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
@ -7,10 +9,8 @@ Gem::Specification.new do |s|
   s.email = 'todd.fisher@gmail.com'
   s.extra_rdoc_files = ['LICENSE', 'README.md']
 
-  s.files = ["LICENSE", "README.md", "Rakefile", "ext/extconf.rb", "ext/ggml.c", "ext/ruby_whisper.cpp", "ext/whisper.cpp", "ext/dr_wav.h", "ext/ggml.h", "ext/ruby_whisper.h", "ext/whisper.h"]
+  s.files = `git ls-files . -z`.split("\x0") + YAML.load_file("extsources.yaml").values.flatten
 
-  #### Load-time details
-  s.require_paths = ['lib','ext']
   s.summary = %q{Ruby whisper.cpp bindings}
   s.test_files = ["tests/test_whisper.rb"]
 