"""Wrapper around llama.cpp."""
import logging
from typing import Any, Dict, Generator, List, Optional

from pydantic import Field, root_validator

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM

logger = logging.getLogger(__name__)


class LLMModel(LLM):
    """Wrapper around the llama.cpp model.

    To use, you should have the llama-cpp-python library installed, and provide
    the path to the Llama model as a named parameter to the constructor.
    Check out: https://github.com/abetlen/llama-cpp-python

    Example:
        .. code-block:: python

            llm = LLMModel(model_path="/path/to/llama/model")
    """

    client: Any  #: :meta private:

    model_path: str
    """The path to the Llama model file."""

    lora_base: Optional[str] = None
    """The path to the Llama LoRA base model."""

    lora_path: Optional[str] = None
    """The path to the Llama LoRA. If None, no LoRA is loaded."""

    n_ctx: int = Field(512, alias="n_ctx")
    """Token context window."""

    n_parts: int = Field(-1, alias="n_parts")
    """Number of parts to split the model into.
    If -1, the number of parts is automatically determined."""

    seed: int = Field(-1, alias="seed")
    """Seed. If -1, a random seed is used."""

    f16_kv: bool = Field(True, alias="f16_kv")
    """Use half-precision for the key/value cache."""

    logits_all: bool = Field(False, alias="logits_all")
    """Return logits for all tokens, not just the last token."""

    vocab_only: bool = Field(False, alias="vocab_only")
    """Only load the vocabulary, no weights."""

    use_mlock: bool = Field(False, alias="use_mlock")
    """Force the system to keep the model in RAM."""

    n_threads: Optional[int] = Field(None, alias="n_threads")
    """Number of threads to use.
    If None, the number of threads is automatically determined."""

    n_batch: Optional[int] = Field(8, alias="n_batch")
    """Number of tokens to process in parallel.
    Should be a number between 1 and n_ctx."""

    n_gpu_layers: Optional[int] = Field(None, alias="n_gpu_layers")
    """Number of layers to be loaded into GPU memory. Default None."""

    suffix: Optional[str] = Field(None)
    """A suffix to append to the generated text. If None, no suffix is appended."""

    max_tokens: Optional[int] = 256
    """The maximum number of tokens to generate."""

    temperature: Optional[float] = 0.8
    """The temperature to use for sampling."""

    top_p: Optional[float] = 0.95
    """The top-p value to use for sampling."""

    logprobs: Optional[int] = Field(None)
    """The number of logprobs to return. If None, no logprobs are returned."""

    echo: Optional[bool] = False
    """Whether to echo the prompt."""

    stop: Optional[List[str]] = []
    """A list of strings to stop generation when encountered."""

    repeat_penalty: Optional[float] = 1.1
    """The penalty to apply to repeated tokens."""

    top_k: Optional[int] = 40
    """The top-k value to use for sampling."""

    last_n_tokens_size: Optional[int] = 64
    """The number of tokens to look back when applying the repeat_penalty."""

    use_mmap: Optional[bool] = True
    """Whether to keep the model loaded in RAM."""

    streaming: bool = True
    """Whether to stream the results, token by token."""

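    # The fields above mirror llama.cpp / llama-cpp-python parameters:
    # ``validate_environment`` below gathers the model-loading parameters,
    # while ``_default_params`` gathers the sampling parameters used at call time.
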
    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Collect the model parameters and store the supplied model as the client."""
        model = values["model"]
        model_param_names = [
            "lora_path",
            "lora_base",
            "n_ctx",
            "n_parts",
            "seed",
            "f16_kv",
            "logits_all",
            "vocab_only",
            "use_mlock",
            "n_threads",
            "n_batch",
            "use_mmap",
            "last_n_tokens_size",
        ]
        model_params = {k: values[k] for k in model_param_names}
        # For backwards compatibility, only include if non-null.
        if values["n_gpu_layers"] is not None:
            model_params["n_gpu_layers"] = values["n_gpu_layers"]

        values["client"] = model

        return values

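    # Note: ``validate_environment`` expects the wrapped model object to be
    # supplied under the ``model`` key and stores it in ``client``. That client
    # is then used as a callable following the llama-cpp-python completion API:
    # ``client(prompt=..., **params)`` returns a dict shaped like
    # ``{"choices": [{"text": "..."}]}`` and, with ``stream=True``, yields
    # chunks of the same shape (see ``_call`` and ``stream`` below).
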
    @property
    def _default_params(self) -> Dict[str, Any]:
        """Get the default parameters for calling llama_cpp."""
        return {
            "suffix": self.suffix,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
            "top_p": self.top_p,
            "logprobs": self.logprobs,
            "echo": self.echo,
            "stop_sequences": self.stop,  # key here is convention among LLM classes
            "repeat_penalty": self.repeat_penalty,
            "top_k": self.top_k,
        }

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Get the identifying parameters."""
        return {**{"model_path": self.model_path}, **self._default_params}

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "lollms_generic_llm"

    def _get_parameters(self, stop: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Performs a sanity check and prepares the parameters in the format needed by llama_cpp.

        Args:
            stop (Optional[List[str]]): List of stop sequences for llama_cpp.

        Returns:
            Dictionary containing the combined parameters.
        """
        # Raise an error if stop sequences are given in both the input and the default params.
        if self.stop and stop is not None:
            raise ValueError("`stop` found in both the input and default params.")

        params = self._default_params

        # llama_cpp expects the "stop" key, not "stop_sequences", so we remove it:
        params.pop("stop_sequences")

        # then set it as configured, defaulting to an empty list:
        params["stop"] = self.stop or stop or []

        return params

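    # For example, with ``self.stop == ["###"]`` and no ``stop`` argument,
    # ``_get_parameters`` returns ``_default_params`` with the
    # ``"stop_sequences"`` key removed and ``"stop": ["###"]`` added.
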
    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
    ) -> str:
        """Call the model and return the output.

        Args:
            prompt: The prompt to use for generation.
            stop: A list of strings to stop generation when encountered.

        Returns:
            The generated text.

        Example:
            .. code-block:: python

                llm = LLMModel(model_path="/path/to/local/llama/model.bin")
                llm("This is a prompt.")
        """
        if self.streaming:
            # If streaming is enabled, we use the stream method, which yields
            # chunks as they are generated, and return the concatenation of the
            # first choice's text from each chunk:
            combined_text_output = ""
            for token in self.stream(prompt=prompt, stop=stop, run_manager=run_manager):
                combined_text_output += token["choices"][0]["text"]
            return combined_text_output
        else:
            params = self._get_parameters(stop)
            result = self.client(prompt=prompt, **params)
            return result["choices"][0]["text"]

    def stream(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
    ) -> Generator[Dict, None, None]:
        """Yields result objects as they are generated in real time.

        BETA: this is a beta feature while we figure out the right abstraction.
        Once that happens, this interface could change.

        It also calls the callback manager's on_llm_new_token event with
        similar parameters to the OpenAI LLM class method of the same name.

        Args:
            prompt: The prompt to pass into the model.
            stop: Optional list of stop words to use when generating.

        Returns:
            A generator representing the stream of tokens being generated.

        Yields:
            Dictionary-like objects, each containing a string token and metadata.
            See the llama-cpp-python docs and below for more.

        Example:
            .. code-block:: python

                llm = LLMModel(
                    model_path="/path/to/local/model.bin",
                    temperature=0.5,
                )
                for chunk in llm.stream("Ask 'Hi, how are you?' like a pirate:'",
                        stop=["'", "\n"]):
                    result = chunk["choices"][0]
                    print(result["text"], end='', flush=True)
        """
        params = self._get_parameters(stop)
        result = self.client(prompt=prompt, stream=True, **params)
        for chunk in result:
            token = chunk["choices"][0]["text"]
            log_probs = chunk["choices"][0].get("logprobs", None)
            if run_manager:
                run_manager.on_llm_new_token(
                    token=token, verbose=self.verbose, log_probs=log_probs
                )
            yield chunk
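
# Usage sketch (illustrative): assuming ``my_model`` is a callable that follows
# the llama-cpp-python completion API (returning a dict such as
# ``{"choices": [{"text": "..."}]}`` and, with ``stream=True``, yielding chunks
# of that shape), the wrapper could be driven roughly like this:
#
#     llm = LLMModel(model=my_model, model_path="/path/to/model.bin")
#     print(llm("This is a prompt."))
#
#     for chunk in llm.stream("This is a prompt.", stop=["\n"]):
#         print(chunk["choices"][0]["text"], end="", flush=True)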