import inspect
import os
import time
from typing import Dict, Any, Optional, List, Iterator
import filelock
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.schema.output import GenerationChunk
from langchain_community.llms import gpt4all
from pydantic.v1 import root_validator
from enums import coqui_lock_name
from utils import FakeTokenizer, url_alive, download_simple, clear_torch_cache, n_gpus_global, makedirs, get_lock_file
def get_model_tokenizer_gpt4all(base_model, n_jobs=None, gpu_id=None, n_gpus=None, max_seq_len=None,
llamacpp_dict=None,
llamacpp_path=None):
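    """Load a llama.cpp / GPT4All model and return (model, tokenizer, device_string).

    Temporarily pins CUDA_VISIBLE_DEVICES to gpu_id (if given) while loading, and when
    max_seq_len is None, loads once to auto-detect it, then unloads and reloads with that value.
    """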
cvd = os.getenv('CUDA_VISIBLE_DEVICES')
if gpu_id is not None and gpu_id != -1:
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
assert llamacpp_dict is not None
# defaults (some of these are generation parameters, so need to be passed in at generation time)
model_name = base_model.lower()
llama_kwargs = dict(model_name=model_name,
model=None,
n_jobs=n_jobs,
n_gpus=n_gpus,
main_gpu=gpu_id if gpu_id not in [None, -1, '-1'] else 0,
inner_class=True,
max_seq_len=max_seq_len,
llamacpp_dict=llamacpp_dict,
llamacpp_path=llamacpp_path)
model, tokenizer, redo, max_seq_len = get_llm_gpt4all(**llama_kwargs)
if redo:
del model
del tokenizer
clear_torch_cache()
# auto max_seq_len
llama_kwargs.update(dict(max_seq_len=max_seq_len))
model, tokenizer, redo, max_seq_len = get_llm_gpt4all(**llama_kwargs)
if cvd is not None:
os.environ['CUDA_VISIBLE_DEVICES'] = cvd
else:
os.environ.pop('CUDA_VISIBLE_DEVICES', None)
    return model, tokenizer, 'cpu' if n_gpus == 0 else 'cuda'
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
class H2OStreamingStdOutCallbackHandler(StreamingStdOutCallbackHandler):
def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
"""Run on new LLM token. Only available when streaming is enabled."""
        # streaming to stdout already occurs without this
# sys.stdout.write(token)
# sys.stdout.flush()
pass
def get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=[]):
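    """Merge the class's own defaults, our defaults, and the user-supplied llamacpp_dict,
    then keep only keyword arguments the class constructor actually accepts."""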
# default from class
model_kwargs = {k: v.default for k, v in dict(inspect.signature(cls).parameters).items() if k not in exclude_list}
# from our defaults
model_kwargs.update(default_kwargs)
# from user defaults
model_kwargs.update(llamacpp_dict)
# ensure only valid keys
func_names = list(inspect.signature(cls).parameters)
model_kwargs = {k: v for k, v in model_kwargs.items() if k in func_names}
    # coerce values to int or float where possible, to satisfy the types the class expects
    for k, v in model_kwargs.items():
        try:
            fv = float(v)
            model_kwargs[k] = int(fv) if fv == int(fv) else fv
        except (TypeError, ValueError, OverflowError):
            pass
return model_kwargs
def get_gpt4all_default_kwargs(max_new_tokens=256,
temperature=0.1,
seed=0,
repetition_penalty=1.0,
top_k=40,
top_p=0.7,
n_jobs=None,
verbose=False,
max_seq_len=None,
main_gpu=0,
):
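    """Build the default model/generation kwargs shared by the llama.cpp and gpt4all backends,
    adding GPU offload settings when GPUs are available."""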
if n_jobs in [None, -1]:
n_jobs = int(os.getenv('OMP_NUM_THREADS', str(os.cpu_count() // 2)))
n_jobs = max(1, min(20, n_jobs)) # hurts beyond some point
n_gpus = n_gpus_global
max_seq_len_local = max_seq_len if max_seq_len is not None else 2048 # fake for auto mode
default_kwargs = dict(context_erase=0.5,
n_batch=1,
max_tokens=max_new_tokens,
n_predict=max_new_tokens,
repeat_last_n=64 if repetition_penalty != 1.0 else 0,
repeat_penalty=repetition_penalty,
temp=temperature,
temperature=temperature,
seed=seed,
top_k=top_k,
top_p=top_p,
use_mlock=True,
n_ctx=max_seq_len_local,
n_threads=n_jobs,
main_gpu=main_gpu,
verbose=verbose)
if n_gpus != 0:
default_kwargs.update(dict(n_gpu_layers=100, f16_kv=True))
return default_kwargs
def get_llm_gpt4all(model_name=None,
model=None,
max_new_tokens=256,
temperature=0.1,
seed=0,
repetition_penalty=1.0,
top_k=40,
top_p=0.7,
streaming=False,
callbacks=None,
tokenizer=None,
prompter=None,
max_time=None,
context='',
iinput='',
chat_conversation=[],
user_prompt_for_fake_system_prompt=None,
n_jobs=None,
n_gpus=None,
main_gpu=0,
verbose=False,
inner_class=False,
max_seq_len=None,
llamacpp_path=None,
llamacpp_dict=None,
):
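    """Construct the LangChain LLM wrapper for model_name in ('llama', 'gpt4all_llama', 'gptj').

    With inner_class=True, returns (inner_model, inner_tokenizer, redo, max_seq_len) for use
    during model loading; otherwise returns the wrapper LLM object itself.
    """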
model_was_None = model is None
redo = False
if not inner_class:
assert prompter is not None
default_kwargs = \
get_gpt4all_default_kwargs(max_new_tokens=max_new_tokens,
temperature=temperature,
seed=seed,
repetition_penalty=repetition_penalty,
top_k=top_k,
top_p=top_p,
n_jobs=n_jobs,
verbose=verbose,
max_seq_len=max_seq_len,
main_gpu=main_gpu,
)
if model_name == 'llama':
# FIXME: streaming not thread safe due to:
# llama_cpp/utils.py: sys.stdout = self.outnull_file
# llama_cpp/utils.py: sys.stdout = self.old_stdout
cls = H2OLlamaCpp
if model is None:
llamacpp_dict = llamacpp_dict.copy()
model_path = llamacpp_dict.pop('model_path_llama')
model_file = model_path
if model_file.endswith('?download=true'):
model_file = model_file.replace('?download=true', '')
llamacpp_path = os.getenv('LLAMACPP_PATH', llamacpp_path) or './'
if os.path.isfile(os.path.basename(model_file)):
# e.g. if offline but previously downloaded
model_path = os.path.basename(model_file)
elif os.path.isfile(os.path.join(llamacpp_path, os.path.basename(model_file))):
# e.g. so don't have to point to full previously-downloaded path
model_path = os.path.join(llamacpp_path, os.path.basename(model_file))
elif url_alive(model_path):
# online
dest = os.path.join(llamacpp_path, os.path.basename(model_path)) if llamacpp_path else None
if dest.endswith('?download=true'):
dest = dest.replace('?download=true', '')
model_path = download_simple(model_path, dest=dest)
else:
model_path = model
model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs'])
model_kwargs.update(dict(model_path=model_path, callbacks=callbacks, streaming=streaming,
prompter=prompter, context=context, iinput=iinput,
tokenizer=tokenizer,
chat_conversation=chat_conversation,
user_prompt_for_fake_system_prompt=user_prompt_for_fake_system_prompt,
n_gpus=n_gpus, max_time=max_time, ))
# migration to new langchain fix:
odd_keys = ['model_kwargs', 'grammar_path', 'grammar']
for key in odd_keys:
model_kwargs.pop(key, None)
llm = cls(**model_kwargs)
llm.client.verbose = verbose
inner_model = llm.client
if max_seq_len is None:
redo = True
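            # auto mode: take max_seq_len from the loaded model (llm.client.n_embd()) and reload with it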
max_seq_len = llm.client.n_embd()
print("Auto-detected LLaMa n_ctx=%s, will unload then reload with this setting." % max_seq_len)
        if model_was_None is None:
            # Warm-up generation, intentionally disabled (the condition above is always False,
            # since model_was_None is a bool): with multiple GPUs, something goes wrong unless
            # generation occurs early before other imports:
            #   CUDA error 704 at /tmp/pip-install-khkugdmy/llama-cpp-python_8c0a9782b7604a5aaf95ec79856eac97/vendor/llama.cpp/ggml-cuda.cu:6408: peer access is already enabled
            # But don't do this in case another thread is running llama.cpp, so just get the model ready.
            inner_model("Say exactly one word", max_tokens=1)
inner_tokenizer = FakeTokenizer(tokenizer=llm.client, is_llama_cpp=True, model_max_length=max_seq_len)
elif model_name == 'gpt4all_llama':
# FIXME: streaming not thread safe due to:
# gpt4all/pyllmodel.py: sys.stdout = stream_processor
# gpt4all/pyllmodel.py: sys.stdout = old_stdout
cls = H2OGPT4All
if model is None:
llamacpp_dict = llamacpp_dict.copy()
model_path = llamacpp_dict.pop('model_name_gpt4all_llama')
if url_alive(model_path):
# online
llamacpp_path = os.getenv('LLAMACPP_PATH', llamacpp_path) or './'
dest = os.path.join(llamacpp_path, os.path.basename(model_path)) if llamacpp_path else None
model_path = download_simple(model_path, dest=dest)
else:
model_path = model
model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs'])
model_kwargs.update(
dict(model=model_path, backend='llama', callbacks=callbacks, streaming=streaming,
prompter=prompter, context=context, iinput=iinput,
tokenizer=tokenizer,
chat_conversation=chat_conversation,
user_prompt_for_fake_system_prompt=user_prompt_for_fake_system_prompt,
))
llm = cls(**model_kwargs)
inner_model = llm.client
inner_tokenizer = FakeTokenizer(model_max_length=max_seq_len)
elif model_name == 'gptj':
# FIXME: streaming not thread safe due to:
# gpt4all/pyllmodel.py: sys.stdout = stream_processor
# gpt4all/pyllmodel.py: sys.stdout = old_stdout
cls = H2OGPT4All
if model is None:
llamacpp_dict = llamacpp_dict.copy()
            model_path = llamacpp_dict.pop('model_name_gptj')
if url_alive(model_path):
llamacpp_path = os.getenv('LLAMACPP_PATH', llamacpp_path) or './'
dest = os.path.join(llamacpp_path, os.path.basename(model_path)) if llamacpp_path else None
model_path = download_simple(model_path, dest=dest)
else:
model_path = model
model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs'])
model_kwargs.update(
dict(model=model_path, backend='gptj', callbacks=callbacks, streaming=streaming,
prompter=prompter, context=context, iinput=iinput,
tokenizer=tokenizer,
chat_conversation=chat_conversation,
user_prompt_for_fake_system_prompt=user_prompt_for_fake_system_prompt,
))
llm = cls(**model_kwargs)
inner_model = llm.client
inner_tokenizer = FakeTokenizer(model_max_length=max_seq_len)
else:
raise RuntimeError("No such model_name %s" % model_name)
if inner_class:
return inner_model, inner_tokenizer, redo, max_seq_len
else:
return llm
class H2OGPT4All(gpt4all.GPT4All):
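    """GPT4All LLM wrapper that applies h2oGPT instruct prompting in _call."""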
    model: Any
    """Path to the pre-trained GPT4All model file."""
    tokenizer: Any = None
    prompter: Any
    context: Any = ''
    iinput: Any = ''
    chat_conversation: Any = []
    user_prompt_for_fake_system_prompt: Any = None
@root_validator()
def validate_environment(cls, values: Dict) -> Dict:
"""Validate that the python package exists in the environment."""
try:
if isinstance(values["model"], str):
from gpt4all import GPT4All as GPT4AllModel
full_path = values["model"]
model_path, delimiter, model_name = full_path.rpartition("/")
model_path += delimiter
values["client"] = GPT4AllModel(
model_name=model_name,
model_path=model_path or None,
model_type=values["backend"],
allow_download=True,
)
if values["n_threads"] is not None:
# set n_threads
values["client"].model.set_thread_count(values["n_threads"])
else:
values["client"] = values["model"]
if values["n_threads"] is not None:
# set n_threads
values["client"].model.set_thread_count(values["n_threads"])
try:
values["backend"] = values["client"].model_type
except AttributeError:
# The below is for compatibility with GPT4All Python bindings <= 0.2.3.
values["backend"] = values["client"].model.model_type
except ImportError:
raise ValueError(
"Could not import gpt4all python package. "
"Please install it with `pip install gpt4all`."
)
return values
def _call(
self,
prompt: str,
stop: Optional[List[str]] = None,
run_manager: Optional[CallbackManagerForLLMRun] = None,
**kwargs,
) -> str:
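        """Truncate the prompt, apply instruct prompting, then delegate to the parent _call."""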
        # Roughly 4 chars per token if natural language, so keep only the last max_tokens * 4 characters
        prompt = prompt[-self.max_tokens * 4:]
# use instruct prompting
data_point = dict(context=self.context, instruction=prompt, input=self.iinput)
prompt = self.prompter.generate_prompt(data_point,
chat_conversation=self.chat_conversation,
user_prompt_for_fake_system_prompt=self.user_prompt_for_fake_system_prompt,
)
verbose = False
if verbose:
print("_call prompt: %s" % prompt, flush=True)
        # FIXME: GPT4All doesn't support yield during generate, so cannot support streaming except via itself to stdout
return super()._call(prompt, stop=stop, run_manager=run_manager)
    # FIXME: Unsure what uses this
# def get_token_ids(self, text: str) -> List[int]:
# return self.client.tokenize(b" " + text.encode("utf-8"))
from langchain_community.llms import LlamaCpp
class H2OLlamaCpp(LlamaCpp):
"""Path to the pre-trained GPT4All model file."""
model_path: Any
tokenizer: Any = None
prompter: Any
context: Any
iinput: Any
chat_conversation = []
count_input_tokens: Any = 0
prompts: Any = []
count_output_tokens: Any = 0
n_gpus: Any = -1
max_time: Any = None
user_prompt_for_fake_system_prompt: Any = None
@root_validator()
def validate_environment(cls, values: Dict) -> Dict:
"""Validate that llama-cpp-python library is installed."""
if isinstance(values["model_path"], str):
model_path = values["model_path"]
model_param_names = [
"lora_path",
"lora_base",
"n_ctx",
"n_parts",
"seed",
"f16_kv",
"logits_all",
"vocab_only",
"use_mlock",
"n_threads",
"n_batch",
"use_mmap",
"last_n_tokens_size",
]
model_params = {k: values[k] for k in model_param_names}
# For backwards compatibility, only include if non-null.
if values["n_gpu_layers"] is not None:
model_params["n_gpu_layers"] = values["n_gpu_layers"]
try:
            try:
                from llama_cpp import Llama
            except Exception as e:
                # standard llama_cpp package unavailable; fall back to the CUDA-specific build
                print("Failed to import llama_cpp: %s, trying llama_cpp_cuda module" % str(e), flush=True)
                from llama_cpp_cuda import Llama
values["client"] = Llama(model_path, **model_params)
except ImportError:
raise ModuleNotFoundError(
"Could not import llama-cpp-python library. "
"Please install the llama-cpp-python library to "
"use this embedding model: pip install llama-cpp-python"
)
except Exception as e:
raise ValueError(
f"Could not load Llama model from path: {model_path}. "
f"Received error {e}"
)
else:
values["client"] = values["model_path"]
return values
def _call(
self,
prompt: str,
stop: Optional[List[str]] = None,
run_manager: Optional[CallbackManagerForLLMRun] = None,
**kwargs,
) -> str:
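        """Limit the prompt to the context window, apply instruct prompting, then generate
        (streaming or not), stripping stop sequences and counting tokens."""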
t0 = time.time()
verbose = False
inner_tokenizer = FakeTokenizer(tokenizer=self.client, is_llama_cpp=True, model_max_length=self.n_ctx)
assert inner_tokenizer is not None
from h2oai_pipeline import H2OTextGenerationPipeline
prompt, num_prompt_tokens = H2OTextGenerationPipeline.limit_prompt(prompt, inner_tokenizer,
max_prompt_length=self.max_tokens)
# use instruct prompting
data_point = dict(context=self.context, instruction=prompt, input=self.iinput)
prompt = self.prompter.generate_prompt(data_point,
chat_conversation=self.chat_conversation,
user_prompt_for_fake_system_prompt=self.user_prompt_for_fake_system_prompt,
)
self.count_input_tokens += self.get_num_tokens(prompt)
self.prompts.append(prompt)
if stop is None:
stop = []
stop.extend(self.prompter.stop_sequences)
if verbose:
print("_call prompt: %s" % prompt, flush=True)
# can't run llamacpp and coqui at same time, one has to win
with filelock.FileLock(get_lock_file('llamacpp')):
with filelock.FileLock(get_lock_file(coqui_lock_name)):
if self.streaming:
                    # the parent streaming handler expects to see the prompt first, else output is "" and gets lost when prompt=None in prompter
text = ""
for token in self.stream(input=prompt, stop=stop):
if self.max_time is not None and (time.time() - t0) > self.max_time:
if verbose:
print("LLaMa.cpp reached max_time=%s" % self.max_time, flush=True)
break
# for token in self.stream(input=prompt, stop=stop, run_manager=run_manager):
text_chunk = token # ["choices"][0]["text"]
text += text_chunk
self.count_output_tokens += self.get_num_tokens(text)
text = self.remove_stop_text(text, stop=stop)
return text
else:
params = self._get_parameters(stop)
params = {**params, **kwargs}
result = self.client(prompt=prompt, **params)
text = result["choices"][0]["text"]
self.count_output_tokens += self.get_num_tokens(text)
text = self.remove_stop_text(text, stop=stop)
return text
def remove_stop_text(self, text, stop=None):
# remove stop sequences from the end of the generated text
if stop is None:
return text
for stop_seq in stop:
if stop_seq in text:
text = text[:text.index(stop_seq)]
return text
def _stream(
self,
prompt: str,
stop: Optional[List[str]] = None,
run_manager: Optional[CallbackManagerForLLMRun] = None,
**kwargs: Any,
) -> Iterator[GenerationChunk]:
        # the parent expects to only see actual new tokens, not the prompt too
total_text = ''
for chunk in super()._stream(prompt, stop=stop, run_manager=run_manager, **kwargs):
            # stop yielding once any stop sequence appears in the accumulated text
total_text += chunk.text
got_stop = False
if stop:
for stop_seq in stop:
if stop_seq in total_text:
got_stop = True
if not got_stop:
yield chunk
def get_token_ids(self, text: str) -> List[int]:
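        # tokenize with llama.cpp's tokenizer; a leading space is prepended before encoding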
return self.client.tokenize(b" " + text.encode("utf-8"))