2023-07-14 23:19:43 +00:00
|
|
|
package llama
|
|
|
|
|
|
|
|
// This is a wrapper to satisfy the GRPC service interface
|
|
|
|
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
|
2023-07-14 23:19:43 +00:00
|
|
|
"github.com/go-skynet/LocalAI/pkg/grpc/base"
|
2023-07-14 23:19:43 +00:00
|
|
|
pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
|
|
|
|
"github.com/go-skynet/go-llama.cpp"
|
2023-08-18 19:23:14 +00:00
|
|
|
"github.com/rs/zerolog/log"
|
2023-07-14 23:19:43 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
type LLM struct {
|
2023-07-14 23:19:43 +00:00
|
|
|
base.Base
|
|
|
|
|
2023-07-14 23:19:43 +00:00
|
|
|
llama *llama.LLama
|
|
|
|
}
|
|
|
|
|
|
|
|
func (llm *LLM) Load(opts *pb.ModelOptions) error {
|
2023-07-29 08:40:56 +00:00
|
|
|
|
2023-08-18 19:23:14 +00:00
|
|
|
if llm.Base.State != pb.StatusResponse_UNINITIALIZED {
|
|
|
|
log.Warn().Msgf("llama backend loading %s while already in state %s!", opts.Model, llm.Base.State.String())
|
|
|
|
}
|
|
|
|
|
|
|
|
llm.Base.Lock()
|
|
|
|
defer llm.Base.Unlock()
|
|
|
|
|
2023-07-29 08:40:56 +00:00
|
|
|
ropeFreqBase := float32(10000)
|
|
|
|
ropeFreqScale := float32(1)
|
|
|
|
|
|
|
|
if opts.RopeFreqBase != 0 {
|
|
|
|
ropeFreqBase = opts.RopeFreqBase
|
|
|
|
}
|
|
|
|
if opts.RopeFreqScale != 0 {
|
|
|
|
ropeFreqScale = opts.RopeFreqScale
|
|
|
|
}
|
|
|
|
|
2023-07-27 19:56:05 +00:00
|
|
|
llamaOpts := []llama.ModelOption{
|
2023-07-29 08:40:56 +00:00
|
|
|
llama.WithRopeFreqBase(ropeFreqBase),
|
|
|
|
llama.WithRopeFreqScale(ropeFreqScale),
|
2023-07-27 19:56:05 +00:00
|
|
|
}
|
2023-07-14 23:19:43 +00:00
|
|
|
|
2023-08-02 22:51:08 +00:00
|
|
|
if opts.NGQA != 0 {
|
|
|
|
llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
|
|
|
|
}
|
|
|
|
|
|
|
|
if opts.RMSNormEps != 0 {
|
|
|
|
llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
|
|
|
|
}
|
|
|
|
|
2023-07-14 23:19:43 +00:00
|
|
|
if opts.ContextSize != 0 {
|
|
|
|
llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
|
|
|
|
}
|
|
|
|
if opts.F16Memory {
|
|
|
|
llamaOpts = append(llamaOpts, llama.EnableF16Memory)
|
|
|
|
}
|
|
|
|
if opts.Embeddings {
|
|
|
|
llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
|
|
|
|
}
|
|
|
|
if opts.NGPULayers != 0 {
|
|
|
|
llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
|
|
|
|
}
|
|
|
|
|
|
|
|
llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
|
|
|
|
llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
|
|
|
|
llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
|
|
|
|
if opts.NBatch != 0 {
|
|
|
|
llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
|
|
|
|
} else {
|
|
|
|
llamaOpts = append(llamaOpts, llama.SetNBatch(512))
|
|
|
|
}
|
|
|
|
|
|
|
|
if opts.NUMA {
|
|
|
|
llamaOpts = append(llamaOpts, llama.EnableNUMA)
|
|
|
|
}
|
|
|
|
|
|
|
|
if opts.LowVRAM {
|
|
|
|
llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
|
|
|
|
}
|
|
|
|
|
2023-08-07 20:39:10 +00:00
|
|
|
model, err := llama.New(opts.ModelFile, llamaOpts...)
|
2023-07-14 23:19:43 +00:00
|
|
|
llm.llama = model
|
2023-08-18 19:23:14 +00:00
|
|
|
|
2023-07-14 23:19:43 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
|
2023-07-29 06:37:24 +00:00
|
|
|
ropeFreqBase := float32(10000)
|
2023-07-28 22:04:25 +00:00
|
|
|
ropeFreqScale := float32(1)
|
|
|
|
|
|
|
|
if opts.RopeFreqBase != 0 {
|
|
|
|
ropeFreqBase = opts.RopeFreqBase
|
|
|
|
}
|
|
|
|
if opts.RopeFreqScale != 0 {
|
|
|
|
ropeFreqScale = opts.RopeFreqScale
|
|
|
|
}
|
2023-07-14 23:19:43 +00:00
|
|
|
predictOptions := []llama.PredictOption{
|
2023-07-27 19:56:05 +00:00
|
|
|
llama.SetTemperature(opts.Temperature),
|
|
|
|
llama.SetTopP(opts.TopP),
|
2023-07-14 23:19:43 +00:00
|
|
|
llama.SetTopK(int(opts.TopK)),
|
|
|
|
llama.SetTokens(int(opts.Tokens)),
|
|
|
|
llama.SetThreads(int(opts.Threads)),
|
2023-07-25 17:05:27 +00:00
|
|
|
llama.WithGrammar(opts.Grammar),
|
2023-07-28 22:04:25 +00:00
|
|
|
llama.SetRopeFreqBase(ropeFreqBase),
|
|
|
|
llama.SetRopeFreqScale(ropeFreqScale),
|
2023-07-27 19:56:05 +00:00
|
|
|
llama.SetNegativePromptScale(opts.NegativePromptScale),
|
2023-07-25 17:05:27 +00:00
|
|
|
llama.SetNegativePrompt(opts.NegativePrompt),
|
2023-07-14 23:19:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if opts.PromptCacheAll {
|
|
|
|
predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
|
|
|
|
}
|
|
|
|
|
|
|
|
if opts.PromptCacheRO {
|
|
|
|
predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Expected absolute path
|
|
|
|
if opts.PromptCachePath != "" {
|
|
|
|
predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
|
|
|
|
}
|
|
|
|
|
|
|
|
if opts.Mirostat != 0 {
|
|
|
|
predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
|
|
|
|
}
|
|
|
|
|
|
|
|
if opts.MirostatETA != 0 {
|
2023-07-27 19:56:05 +00:00
|
|
|
predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
|
2023-07-14 23:19:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if opts.MirostatTAU != 0 {
|
2023-07-27 19:56:05 +00:00
|
|
|
predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
|
2023-07-14 23:19:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if opts.Debug {
|
|
|
|
predictOptions = append(predictOptions, llama.Debug)
|
|
|
|
}
|
|
|
|
|
|
|
|
predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
|
|
|
|
|
|
|
|
if opts.PresencePenalty != 0 {
|
2023-07-27 19:56:05 +00:00
|
|
|
predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
|
2023-07-14 23:19:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if opts.NKeep != 0 {
|
|
|
|
predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
|
|
|
|
}
|
|
|
|
|
|
|
|
if opts.Batch != 0 {
|
|
|
|
predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
|
|
|
|
}
|
|
|
|
|
|
|
|
if opts.F16KV {
|
|
|
|
predictOptions = append(predictOptions, llama.EnableF16KV)
|
|
|
|
}
|
|
|
|
|
|
|
|
if opts.IgnoreEOS {
|
|
|
|
predictOptions = append(predictOptions, llama.IgnoreEOS)
|
|
|
|
}
|
|
|
|
|
|
|
|
if opts.Seed != 0 {
|
|
|
|
predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
|
|
|
|
}
|
|
|
|
|
|
|
|
//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
|
|
|
|
|
2023-07-27 19:56:05 +00:00
|
|
|
predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
|
2023-07-14 23:19:43 +00:00
|
|
|
predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
|
|
|
|
predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
|
|
|
|
predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
|
|
|
|
predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
|
2023-07-27 19:56:05 +00:00
|
|
|
predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
|
|
|
|
predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
|
2023-07-14 23:19:43 +00:00
|
|
|
return predictOptions
|
|
|
|
}
|
|
|
|
|
|
|
|
func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
|
2023-08-18 19:23:14 +00:00
|
|
|
llm.Base.Lock()
|
|
|
|
defer llm.Base.Unlock()
|
2023-07-14 23:19:43 +00:00
|
|
|
return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
|
|
|
|
}
|
|
|
|
|
2023-07-14 23:19:43 +00:00
|
|
|
func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
|
2023-08-18 19:23:14 +00:00
|
|
|
llm.Base.Lock()
|
|
|
|
|
2023-07-14 23:19:43 +00:00
|
|
|
predictOptions := buildPredictOptions(opts)
|
|
|
|
|
|
|
|
predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
|
|
|
|
results <- token
|
|
|
|
return true
|
|
|
|
}))
|
|
|
|
|
|
|
|
go func() {
|
|
|
|
_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
|
|
|
|
if err != nil {
|
|
|
|
fmt.Println("err: ", err)
|
|
|
|
}
|
|
|
|
close(results)
|
2023-08-18 19:23:14 +00:00
|
|
|
llm.Base.Unlock()
|
2023-07-14 23:19:43 +00:00
|
|
|
}()
|
2023-07-14 23:19:43 +00:00
|
|
|
|
|
|
|
return nil
|
2023-07-14 23:19:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
|
2023-08-18 19:23:14 +00:00
|
|
|
llm.Base.Lock()
|
|
|
|
defer llm.Base.Unlock()
|
|
|
|
|
2023-07-14 23:19:43 +00:00
|
|
|
predictOptions := buildPredictOptions(opts)
|
|
|
|
|
|
|
|
if len(opts.EmbeddingTokens) > 0 {
|
|
|
|
tokens := []int{}
|
|
|
|
for _, t := range opts.EmbeddingTokens {
|
|
|
|
tokens = append(tokens, int(t))
|
|
|
|
}
|
|
|
|
return llm.llama.TokenEmbeddings(tokens, predictOptions...)
|
|
|
|
}
|
|
|
|
|
|
|
|
return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
|
|
|
|
}
|
2023-08-18 19:23:14 +00:00
|
|
|
|
|
|
|
func (llm *LLM) TokenizeString(opts *pb.PredictOptions) (pb.TokenizationResponse, error) {
|
|
|
|
llm.Base.Lock()
|
|
|
|
defer llm.Base.Unlock()
|
|
|
|
|
|
|
|
predictOptions := buildPredictOptions(opts)
|
|
|
|
l, tokens, err := llm.llama.TokenizeString(opts.Prompt, predictOptions...)
|
|
|
|
if err != nil {
|
|
|
|
return pb.TokenizationResponse{}, err
|
|
|
|
}
|
|
|
|
return pb.TokenizationResponse{
|
|
|
|
Length: l,
|
|
|
|
Tokens: tokens,
|
|
|
|
}, nil
|
|
|
|
}
|