import inspect
import os
from typing import Dict, Any, Optional, List, Iterator
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.schema.output import GenerationChunk
from pydantic import root_validator
from langchain.llms import gpt4all
from utils import FakeTokenizer, get_ngpus_vis, url_alive, download_simple
def get_model_tokenizer_gpt4all(base_model, n_jobs=None, max_seq_len=None, llamacpp_dict=None):
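    """Build a GPT4All / llama.cpp model for local inference.

    Returns a (model, tokenizer, device) triple: the inner GPT4All/llama.cpp client
    (inner_class=True), a FakeTokenizer placeholder, and 'cpu' as the device string.
    """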
assert llamacpp_dict is not None
# defaults (some of these are generation parameters, so need to be passed in at generation time)
model_name = base_model.lower()
model = get_llm_gpt4all(model_name, model=None,
# max_new_tokens=max_new_tokens,
# temperature=temperature,
# repetition_penalty=repetition_penalty,
# top_k=top_k,
# top_p=top_p,
# callbacks=callbacks,
n_jobs=n_jobs,
# verbose=verbose,
# streaming=stream_output,
# prompter=prompter,
# context=context,
# iinput=iinput,
inner_class=True,
max_seq_len=max_seq_len,
llamacpp_dict=llamacpp_dict,
)
return model, FakeTokenizer(), 'cpu'
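# Example (sketch only; the GGUF/GGML file name is illustrative, and the llamacpp_dict key
# follows the one consumed in get_llm_gpt4all below):
#   model, tokenizer, device = get_model_tokenizer_gpt4all(
#       'llama', n_jobs=4, max_seq_len=4096,
#       llamacpp_dict=dict(model_path_llama='llama-2-7b-chat.Q4_K_M.gguf'))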
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
class H2OStreamingStdOutCallbackHandler(StreamingStdOutCallbackHandler):
def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
"""Run on new LLM token. Only available when streaming is enabled."""
# streaming to std already occurs without this
# sys.stdout.write(token)
# sys.stdout.flush()
pass
def get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=()):
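    """Assemble constructor kwargs for `cls`: start from the class's own signature defaults,
    override with our defaults, then with the user-supplied llamacpp_dict, and finally keep
    only keys the class actually accepts."""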
# default from class
model_kwargs = {k: v.default for k, v in dict(inspect.signature(cls).parameters).items() if k not in exclude_list}
# from our defaults
model_kwargs.update(default_kwargs)
# from user defaults
model_kwargs.update(llamacpp_dict)
# ensure only valid keys
func_names = list(inspect.signature(cls).parameters)
model_kwargs = {k: v for k, v in model_kwargs.items() if k in func_names}
    # coerce values (possibly passed as strings) to int or float to satisfy the class's type annotations
    for k, v in model_kwargs.items():
        if isinstance(v, bool):
            # leave booleans alone; int(True) would silently change the type
            continue
        try:
            v = float(v)
        except (TypeError, ValueError):
            continue
        model_kwargs[k] = int(v) if v.is_integer() else v
return model_kwargs
def get_gpt4all_default_kwargs(max_new_tokens=256,
temperature=0.1,
repetition_penalty=1.0,
top_k=40,
top_p=0.7,
n_jobs=None,
verbose=False,
max_seq_len=None,
):
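    """Default model/generation kwargs shared by the GPT4All and llama.cpp backends.

    Duplicated names (e.g. temp/temperature) are provided because the two backends expect
    different parameter names; get_model_kwargs() later drops whichever keys a given class
    does not accept.
    """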
if n_jobs in [None, -1]:
n_jobs = int(os.getenv('OMP_NUM_THREADS', str(os.cpu_count()//2)))
n_jobs = max(1, min(20, n_jobs)) # hurts beyond some point
    n_gpus = get_ngpus_vis()
    assert max_seq_len is not None, "max_seq_len must be set to compute max_tokens"
default_kwargs = dict(context_erase=0.5,
n_batch=1,
max_tokens=max_seq_len - max_new_tokens,
n_predict=max_new_tokens,
repeat_last_n=64 if repetition_penalty != 1.0 else 0,
repeat_penalty=repetition_penalty,
temp=temperature,
temperature=temperature,
top_k=top_k,
top_p=top_p,
use_mlock=True,
n_ctx=max_seq_len,
n_threads=n_jobs,
verbose=verbose)
if n_gpus != 0:
default_kwargs.update(dict(n_gpu_layers=100))
return default_kwargs
def get_llm_gpt4all(model_name,
model=None,
max_new_tokens=256,
temperature=0.1,
repetition_penalty=1.0,
top_k=40,
top_p=0.7,
streaming=False,
callbacks=None,
prompter=None,
context='',
iinput='',
n_jobs=None,
verbose=False,
inner_class=False,
max_seq_len=None,
llamacpp_dict=None,
):
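    """Construct a LangChain LLM (H2OLlamaCpp or H2OGPT4All) for model_name in
    ('llama', 'gpt4all_llama', 'gptj').

    If a model object is not passed in, the path/URL is taken from llamacpp_dict and
    downloaded if needed. With inner_class=True the raw backend client is returned
    instead of the LangChain wrapper (as used by get_model_tokenizer_gpt4all).
    """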
if not inner_class:
assert prompter is not None
default_kwargs = \
get_gpt4all_default_kwargs(max_new_tokens=max_new_tokens,
temperature=temperature,
repetition_penalty=repetition_penalty,
top_k=top_k,
top_p=top_p,
n_jobs=n_jobs,
verbose=verbose,
max_seq_len=max_seq_len,
)
if model_name == 'llama':
cls = H2OLlamaCpp
if model is None:
llamacpp_dict = llamacpp_dict.copy()
model_path = llamacpp_dict.pop('model_path_llama')
if os.path.isfile(os.path.basename(model_path)):
# e.g. if offline but previously downloaded
model_path = os.path.basename(model_path)
elif url_alive(model_path):
# online
ggml_path = os.getenv('GGML_PATH')
dest = os.path.join(ggml_path, os.path.basename(model_path)) if ggml_path else None
model_path = download_simple(model_path, dest=dest)
else:
model_path = model
model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs'])
model_kwargs.update(dict(model_path=model_path, callbacks=callbacks, streaming=streaming,
prompter=prompter, context=context, iinput=iinput))
        # fix for migration to newer langchain versions: these keys are dropped before constructing the class
        odd_keys = ['model_kwargs', 'grammar_path', 'grammar']
for key in odd_keys:
model_kwargs.pop(key, None)
llm = cls(**model_kwargs)
llm.client.verbose = verbose
inner_model = llm.client
elif model_name == 'gpt4all_llama':
cls = H2OGPT4All
if model is None:
llamacpp_dict = llamacpp_dict.copy()
model_path = llamacpp_dict.pop('model_name_gpt4all_llama')
if url_alive(model_path):
# online
ggml_path = os.getenv('GGML_PATH')
dest = os.path.join(ggml_path, os.path.basename(model_path)) if ggml_path else None
model_path = download_simple(model_path, dest=dest)
else:
model_path = model
model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs'])
model_kwargs.update(
dict(model=model_path, backend='llama', callbacks=callbacks, streaming=streaming,
prompter=prompter, context=context, iinput=iinput))
llm = cls(**model_kwargs)
inner_model = llm.client
elif model_name == 'gptj':
cls = H2OGPT4All
if model is None:
llamacpp_dict = llamacpp_dict.copy()
            model_path = llamacpp_dict.pop('model_name_gptj')
if url_alive(model_path):
ggml_path = os.getenv('GGML_PATH')
dest = os.path.join(ggml_path, os.path.basename(model_path)) if ggml_path else None
model_path = download_simple(model_path, dest=dest)
else:
model_path = model
model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs'])
model_kwargs.update(
dict(model=model_path, backend='gptj', callbacks=callbacks, streaming=streaming,
prompter=prompter, context=context, iinput=iinput))
llm = cls(**model_kwargs)
inner_model = llm.client
else:
raise RuntimeError("No such model_name %s" % model_name)
if inner_class:
return inner_model
else:
return llm
class H2OGPT4All(gpt4all.GPT4All):
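    """GPT4All subclass that accepts either a model path or an already-constructed client,
    and applies instruct prompting (prompter/context/iinput) before generation."""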
    model: Any
    """Path to the pre-trained GPT4All model file."""
    prompter: Any
    context: Any = ''
    iinput: Any = ''
@root_validator()
def validate_environment(cls, values: Dict) -> Dict:
"""Validate that the python package exists in the environment."""
try:
if isinstance(values["model"], str):
from gpt4all import GPT4All as GPT4AllModel
full_path = values["model"]
model_path, delimiter, model_name = full_path.rpartition("/")
model_path += delimiter
values["client"] = GPT4AllModel(
model_name=model_name,
model_path=model_path or None,
model_type=values["backend"],
allow_download=True,
)
if values["n_threads"] is not None:
# set n_threads
values["client"].model.set_thread_count(values["n_threads"])
else:
values["client"] = values["model"]
if values["n_threads"] is not None:
# set n_threads
values["client"].model.set_thread_count(values["n_threads"])
try:
values["backend"] = values["client"].model_type
except AttributeError:
# The below is for compatibility with GPT4All Python bindings <= 0.2.3.
values["backend"] = values["client"].model.model_type
except ImportError:
raise ValueError(
"Could not import gpt4all python package. "
"Please install it with `pip install gpt4all`."
)
return values
def _call(
self,
prompt: str,
stop: Optional[List[str]] = None,
run_manager: Optional[CallbackManagerForLLMRun] = None,
**kwargs,
) -> str:
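        """Truncate the raw prompt by a ~4 chars/token heuristic, wrap it with the
        instruct prompter, then delegate to the parent GPT4All _call."""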
        # roughly 4 characters per token for natural language; truncate so the prompt fits
        prompt = prompt[-self.max_tokens * 4:]
# use instruct prompting
data_point = dict(context=self.context, instruction=prompt, input=self.iinput)
prompt = self.prompter.generate_prompt(data_point)
verbose = False
if verbose:
print("_call prompt: %s" % prompt, flush=True)
        # FIXME: GPT4All doesn't support yield during generate, so we cannot support streaming except via its own output to stdout
return super()._call(prompt, stop=stop, run_manager=run_manager)
    # FIXME: unsure what uses this
#def get_token_ids(self, text: str) -> List[int]:
# return self.client.tokenize(b" " + text.encode("utf-8"))
from langchain.llms import LlamaCpp
class H2OLlamaCpp(LlamaCpp):
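    """LlamaCpp subclass that adds token-count-aware prompt truncation, instruct prompting
    via prompter/context/iinput, and prompt-echo handling for streaming."""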
    model_path: Any
    """Path to the pre-trained llama.cpp model file."""
    prompter: Any
    context: Any
    iinput: Any
@root_validator()
def validate_environment(cls, values: Dict) -> Dict:
"""Validate that llama-cpp-python library is installed."""
if isinstance(values["model_path"], str):
model_path = values["model_path"]
model_param_names = [
"lora_path",
"lora_base",
"n_ctx",
"n_parts",
"seed",
"f16_kv",
"logits_all",
"vocab_only",
"use_mlock",
"n_threads",
"n_batch",
"use_mmap",
"last_n_tokens_size",
]
model_params = {k: values[k] for k in model_param_names}
# For backwards compatibility, only include if non-null.
if values["n_gpu_layers"] is not None:
model_params["n_gpu_layers"] = values["n_gpu_layers"]
try:
try:
from llama_cpp import Llama
except ImportError:
from llama_cpp_cuda import Llama
values["client"] = Llama(model_path, **model_params)
except ImportError:
raise ModuleNotFoundError(
"Could not import llama-cpp-python library. "
"Please install the llama-cpp-python library to "
"use this embedding model: pip install llama-cpp-python"
)
except Exception as e:
raise ValueError(
f"Could not load Llama model from path: {model_path}. "
f"Received error {e}"
)
else:
values["client"] = values["model_path"]
return values
def _call(
self,
prompt: str,
stop: Optional[List[str]] = None,
run_manager: Optional[CallbackManagerForLLMRun] = None,
**kwargs,
) -> str:
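        """Truncate the prompt to fit n_ctx (using a real token count), apply the instruct
        prompter, then either stream (stripping the echoed prompt) or generate in one call."""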
        verbose = False
        # tokenize just to count tokens, since the llama-cpp-python wrapper has no way to truncate;
        # pre-trim by characters first to avoid absurd sizes that hit "llama_tokenize: too many tokens"
        # (this might still overflow, but it is not fatal)
        prompt = prompt[-self.n_ctx * 4:]
prompt_tokens = self.client.tokenize(b" " + prompt.encode("utf-8"))
num_prompt_tokens = len(prompt_tokens)
        if num_prompt_tokens > self.n_ctx:
            # conservative by using int(); keep at least 1 char per token to avoid an empty slice
            chars_per_token = max(1, int(len(prompt) / num_prompt_tokens))
            prompt = prompt[-self.n_ctx * chars_per_token:]
            if verbose:
                print("reducing tokens, assuming average of %s chars/token" % chars_per_token, flush=True)
                prompt_tokens2 = self.client.tokenize(b" " + prompt.encode("utf-8"))
                num_prompt_tokens2 = len(prompt_tokens2)
                print("reduced tokens from %d -> %d" % (num_prompt_tokens, num_prompt_tokens2), flush=True)
# use instruct prompting
data_point = dict(context=self.context, instruction=prompt, input=self.iinput)
prompt = self.prompter.generate_prompt(data_point)
if verbose:
print("_call prompt: %s" % prompt, flush=True)
        if self.streaming:
            # _stream() yields the prompt first so the parent stream handler sees it;
            # otherwise output would be "" and be lost when the prompter uses prompt=None
            text = ""
            for token in self.stream(input=prompt, stop=stop):
                # for token in self.stream(input=prompt, stop=stop, run_manager=run_manager):
                text_chunk = token  # ["choices"][0]["text"]
                # self.stream already calls text_callback
                # if text_callback:
                #     text_callback(text_chunk)
                text += text_chunk
            # strip the echoed prompt that _stream() yielded first
            return text[len(prompt):]
else:
params = self._get_parameters(stop)
params = {**params, **kwargs}
result = self.client(prompt=prompt, **params)
return result["choices"][0]["text"]
def _stream(
self,
prompt: str,
stop: Optional[List[str]] = None,
run_manager: Optional[CallbackManagerForLLMRun] = None,
**kwargs: Any,
) -> Iterator[GenerationChunk]:
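        """Yield the prompt as the first chunk, then the newly generated chunks from the
        parent implementation."""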
        # yield the prompt as the first chunk so the parent stream handler sees it;
        # otherwise output would be "" and be lost when the prompter uses prompt=None
        logprobs = 0
chunk = GenerationChunk(
text=prompt,
generation_info={"logprobs": logprobs},
)
yield chunk
if run_manager:
run_manager.on_llm_new_token(
token=chunk.text, verbose=self.verbose, log_probs=logprobs
)
# actual new tokens
for chunk in super()._stream(prompt, stop=stop, run_manager=run_manager, **kwargs):
yield chunk
def get_token_ids(self, text: str) -> List[int]:
return self.client.tokenize(b" " + text.encode("utf-8"))
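# Example usage (a sketch; `my_prompter` is a hypothetical Prompter instance from this
# project and the model file name is illustrative):
#   llamacpp_dict = dict(model_path_llama='llama-2-7b-chat.Q4_K_M.gguf')
#   llm = get_llm_gpt4all('llama', prompter=my_prompter, streaming=True,
#                         callbacks=[H2OStreamingStdOutCallbackHandler()],
#                         n_jobs=4, max_seq_len=4096, llamacpp_dict=llamacpp_dict)
#   text = llm("Explain what GGML quantization is.")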