import inspect
import math
import os
import time
from typing import Dict, Any, Optional, List, Iterator

import filelock
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.schema.output import GenerationChunk
from langchain_community.llms import gpt4all
from langchain_community.llms import LlamaCpp
from pydantic.v1 import root_validator

from enums import coqui_lock_name
from utils import FakeTokenizer, url_alive, download_simple, clear_torch_cache, n_gpus_global, makedirs, get_lock_file

# Query-string suffix some download URLs carry; it must not end up in local file names.
_DOWNLOAD_SUFFIX = '?download=true'

# model_name -> (key in llamacpp_dict holding the model path/URL, GPT4All backend name)
_GPT4ALL_VARIANTS = {
    'gpt4all_llama': ('model_name_gpt4all_llama', 'llama'),
    'gptj': ('model_name_gptj', 'gptj'),
}


def get_model_tokenizer_gpt4all(base_model, n_jobs=None, gpu_id=None, n_gpus=None, max_seq_len=None,
                                llamacpp_dict=None, llamacpp_path=None):
    """Load a llama.cpp / GPT4All model and return ``(model, tokenizer, device)``.

    ``CUDA_VISIBLE_DEVICES`` is temporarily set to ``gpu_id`` (when given and not -1)
    while the model loads, and is always restored afterwards, even if loading fails.

    If ``get_llm_gpt4all`` reports ``redo`` (it had to auto-detect ``max_seq_len``),
    the first model is dropped and the model is loaded again with the detected value.

    :param base_model: model family name; lower-cased and passed as ``model_name``
    :param n_jobs: thread count forwarded to ``get_llm_gpt4all``
    :param gpu_id: GPU to expose/use; ``None``, -1 or '-1' map to ``main_gpu=0``
    :param n_gpus: forwarded to ``get_llm_gpt4all``; also decides the returned device string
    :param max_seq_len: context length, or ``None`` to auto-detect
    :param llamacpp_dict: user settings dict (required)
    :param llamacpp_path: directory used for locally stored / downloaded model files
    :return: tuple ``(model, tokenizer, device_string)``
    """
    assert llamacpp_dict is not None

    cvd = os.getenv('CUDA_VISIBLE_DEVICES')
    if gpu_id is not None and gpu_id != -1:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    try:
        # defaults (some of these are generation parameters, so need to be passed in at generation time)
        model_name = base_model.lower()
        llama_kwargs = dict(model_name=model_name,
                            model=None,
                            n_jobs=n_jobs,
                            n_gpus=n_gpus,
                            main_gpu=gpu_id if gpu_id not in [None, -1, '-1'] else 0,
                            inner_class=True,
                            max_seq_len=max_seq_len,
                            llamacpp_dict=llamacpp_dict,
                            llamacpp_path=llamacpp_path)
        model, tokenizer, redo, max_seq_len = get_llm_gpt4all(**llama_kwargs)
        if redo:
            # release the first load before reloading with the detected max_seq_len
            del model
            del tokenizer
            clear_torch_cache()
            # auto max_seq_len
            llama_kwargs.update(dict(max_seq_len=max_seq_len))
            model, tokenizer, redo, max_seq_len = get_llm_gpt4all(**llama_kwargs)
    finally:
        # always put the environment back the way we found it
        if cvd is not None:
            os.environ['CUDA_VISIBLE_DEVICES'] = cvd
        else:
            os.environ.pop('CUDA_VISIBLE_DEVICES', None)

    # NOTE(review): this reports 'cpu' when n_gpus != 0 and 'cuda' when n_gpus == 0, which looks
    # inverted.  Kept as-is because callers may depend on it -- confirm the intent before changing.
    return model, tokenizer, 'cpu' if n_gpus != 0 else 'cuda'


class H2OStreamingStdOutCallbackHandler(StreamingStdOutCallbackHandler):
    """Stdout streaming handler whose per-token hook is intentionally a no-op."""

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        """Run on new LLM token. Only available when streaming is enabled."""
        # streaming to std already occurs without this
        # sys.stdout.write(token)
        # sys.stdout.flush()
        pass


def _coerce_number(value):
    """Return ``value`` as int or float when it is cleanly numeric, else unchanged."""
    if isinstance(value, int):
        # keeps ints exact (no float round trip); bools become 0/1
        return int(value)
    try:
        as_float = float(value)
    except (TypeError, ValueError, OverflowError):
        return value
    if not math.isfinite(as_float):
        # leave inf/nan (and their string spellings) untouched
        return value
    return int(as_float) if as_float.is_integer() else as_float


def get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=None):
    """Build constructor kwargs for ``cls`` from layered defaults.

    Precedence (lowest to highest): defaults in ``cls``'s signature, ``default_kwargs``,
    ``llamacpp_dict``.  Keys that are not parameters of ``cls`` are dropped, and values
    that are cleanly numeric (including strings such as ``"2"``, ``"2.0"`` or ``"1.5"``)
    are converted to int or float.

    :param llamacpp_dict: user overrides; ``None`` is treated as empty
    :param default_kwargs: our own defaults
    :param cls: class whose signature defines the valid keys
    :param exclude_list: signature parameters whose class default should not be taken
    :return: dict of kwargs restricted to parameters of ``cls``
    """
    excluded = exclude_list or ()
    parameters = inspect.signature(cls).parameters

    # default from class
    model_kwargs = {name: param.default for name, param in parameters.items() if name not in excluded}
    # from our defaults
    model_kwargs.update(default_kwargs)
    # from user defaults
    model_kwargs.update(llamacpp_dict or {})

    # ensure only valid keys, and make int or float if can to satisfy types for class
    return {name: _coerce_number(value) for name, value in model_kwargs.items() if name in parameters}


def get_gpt4all_default_kwargs(max_new_tokens=256,
                               temperature=0.1,
                               seed=0,
                               repetition_penalty=1.0,
                               top_k=40,
                               top_p=0.7,
                               n_jobs=None,
                               verbose=False,
                               max_seq_len=None,
                               main_gpu=0,
                               ):
    """Return default model/generation kwargs shared by the llama.cpp and GPT4All wrappers.

    :param n_jobs: thread count; ``None`` or -1 means use ``OMP_NUM_THREADS`` if set,
        otherwise half the CPU count.  The result is clamped to the range 1..20.
    :param max_seq_len: context length; ``None`` uses a placeholder of 2048
    :return: dict of defaults; GPU offload keys are added when ``n_gpus_global != 0``
    """
    if n_jobs in [None, -1]:
        # os.cpu_count() may return None when the count cannot be determined
        half_cpus = (os.cpu_count() or 1) // 2
        n_jobs = int(os.getenv('OMP_NUM_THREADS', str(half_cpus)))
    n_jobs = max(1, min(20, n_jobs))  # hurts beyond some point
    n_gpus = n_gpus_global
    max_seq_len_local = max_seq_len if max_seq_len is not None else 2048  # fake for auto mode
    default_kwargs = dict(context_erase=0.5,
                          n_batch=1,
                          max_tokens=max_new_tokens,
                          n_predict=max_new_tokens,
                          repeat_last_n=64 if repetition_penalty != 1.0 else 0,
                          repeat_penalty=repetition_penalty,
                          temp=temperature,
                          temperature=temperature,
                          seed=seed,
                          top_k=top_k,
                          top_p=top_p,
                          use_mlock=True,
                          n_ctx=max_seq_len_local,
                          n_threads=n_jobs,
                          main_gpu=main_gpu,
                          verbose=verbose)
    if n_gpus != 0:
        default_kwargs.update(dict(n_gpu_layers=100, f16_kv=True))
    return default_kwargs


def _strip_download_suffix(path):
    """Remove the '?download=true' marker from ``path`` when the path ends with it."""
    if path.endswith(_DOWNLOAD_SUFFIX):
        return path.replace(_DOWNLOAD_SUFFIX, '')
    return path


def _resolve_llamacpp_dir(llamacpp_path):
    """Return the model directory: env LLAMACPP_PATH, else ``llamacpp_path``, else './'."""
    return os.getenv('LLAMACPP_PATH', llamacpp_path) or './'


def _resolve_llama_model_path(model_path, llamacpp_path):
    """Map a llama.cpp model path/URL to a local file, downloading only if necessary."""
    base_name = os.path.basename(_strip_download_suffix(model_path))
    local_candidate = os.path.join(_resolve_llamacpp_dir(llamacpp_path), base_name)
    if os.path.isfile(base_name):
        # e.g. if offline but previously downloaded
        return base_name
    if os.path.isfile(local_candidate):
        # e.g. so don't have to point to full previously-downloaded path
        return local_candidate
    if url_alive(model_path):
        # online
        return download_simple(model_path, dest=local_candidate)
    return model_path


def _download_if_url(model_path, llamacpp_path):
    """Download ``model_path`` into the model directory when it is a live URL."""
    if url_alive(model_path):
        # online
        dest = os.path.join(_resolve_llamacpp_dir(llamacpp_path), os.path.basename(model_path))
        return download_simple(model_path, dest=dest)
    return model_path


def get_llm_gpt4all(model_name=None,
                    model=None,
                    max_new_tokens=256,
                    temperature=0.1,
                    seed=0,
                    repetition_penalty=1.0,
                    top_k=40,
                    top_p=0.7,
                    streaming=False,
                    callbacks=None,
                    tokenizer=None,
                    prompter=None,
                    max_time=None,
                    context='',
                    iinput='',
                    chat_conversation=None,
                    user_prompt_for_fake_system_prompt=None,
                    n_jobs=None,
                    n_gpus=None,
                    main_gpu=0,
                    verbose=False,
                    inner_class=False,
                    max_seq_len=None,
                    llamacpp_path=None,
                    llamacpp_dict=None,
                    ):
    """Create a LangChain LLM wrapper for ``model_name`` ('llama', 'gpt4all_llama' or 'gptj').

    When ``model`` is ``None`` the model location is popped from a copy of ``llamacpp_dict``
    ('model_path_llama', 'model_name_gpt4all_llama' or 'model_name_gptj') and resolved to a
    local file, downloading if it is a live URL.  Otherwise ``model`` is used directly.

    :param inner_class: if true, return ``(inner_model, inner_tokenizer, redo, max_seq_len)``
        instead of the LangChain wrapper; ``prompter`` is then not required
    :param max_seq_len: for 'llama', ``None`` triggers auto-detection and sets ``redo=True``
        so the caller can reload with the detected value
    :param chat_conversation: prior conversation; ``None`` means empty
    :param llamacpp_dict: user settings; never mutated (a copy is used)
    :return: the LLM wrapper, or the 4-tuple described above when ``inner_class`` is true
    :raises RuntimeError: for an unknown ``model_name``
    """
    model_was_None = model is None
    redo = False
    if not inner_class:
        assert prompter is not None
    if chat_conversation is None:
        chat_conversation = []
    # private copy: keys are popped below and the caller's dict must stay intact
    llamacpp_dict = dict(llamacpp_dict or {})

    default_kwargs = \
        get_gpt4all_default_kwargs(max_new_tokens=max_new_tokens,
                                   temperature=temperature,
                                   seed=seed,
                                   repetition_penalty=repetition_penalty,
                                   top_k=top_k,
                                   top_p=top_p,
                                   n_jobs=n_jobs,
                                   verbose=verbose,
                                   max_seq_len=max_seq_len,
                                   main_gpu=main_gpu,
                                   )
    # kwargs every wrapper class receives
    shared_kwargs = dict(callbacks=callbacks,
                         streaming=streaming,
                         prompter=prompter,
                         context=context,
                         iinput=iinput,
                         tokenizer=tokenizer,
                         chat_conversation=chat_conversation,
                         user_prompt_for_fake_system_prompt=user_prompt_for_fake_system_prompt,
                         )

    if model_name == 'llama':
        # FIXME: streaming not thread safe due to:
        # llama_cpp/utils.py:        sys.stdout = self.outnull_file
        # llama_cpp/utils.py:        sys.stdout = self.old_stdout
        cls = H2OLlamaCpp
        if model is None:
            model_path = _resolve_llama_model_path(llamacpp_dict.pop('model_path_llama'), llamacpp_path)
        else:
            model_path = model
        model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs'])
        model_kwargs.update(shared_kwargs)
        model_kwargs.update(dict(model_path=model_path, n_gpus=n_gpus, max_time=max_time))

        # migration to new langchain fix:
        for key in ('model_kwargs', 'grammar_path', 'grammar'):
            model_kwargs.pop(key, None)

        llm = cls(**model_kwargs)
        llm.client.verbose = verbose
        inner_model = llm.client

        if max_seq_len is None:
            redo = True
            # NOTE(review): n_embd() is used here as the context length, as the message below
            # states -- confirm against the llama-cpp-python API that this is the intended value.
            max_seq_len = llm.client.n_embd()
            print("Auto-detected LLaMa n_ctx=%s, will unload then reload with this setting." % max_seq_len)

        if model_was_None:
            # with multiple GPUs, something goes wrong unless generation occurs early before other imports
            # CUDA error 704 at /tmp/pip-install-khkugdmy/llama-cpp-python_8c0a9782b7604a5aaf95ec79856eac97/vendor/llama.cpp/ggml-cuda.cu:6408: peer access is already enabled
            # But don't do this action in case another thread doing llama.cpp, so just getting model ready.
            inner_model("Say exactly one word", max_tokens=1)
        inner_tokenizer = FakeTokenizer(tokenizer=llm.client, is_llama_cpp=True, model_max_length=max_seq_len)
    elif model_name in _GPT4ALL_VARIANTS:
        # FIXME: streaming not thread safe due to:
        # gpt4all/pyllmodel.py:            sys.stdout = stream_processor
        # gpt4all/pyllmodel.py:            sys.stdout = old_stdout
        path_key, backend = _GPT4ALL_VARIANTS[model_name]
        cls = H2OGPT4All
        if model is None:
            model_path = _download_if_url(llamacpp_dict.pop(path_key), llamacpp_path)
        else:
            model_path = model
        model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs'])
        model_kwargs.update(shared_kwargs)
        model_kwargs.update(dict(model=model_path, backend=backend))
        llm = cls(**model_kwargs)
        inner_model = llm.client
        inner_tokenizer = FakeTokenizer(model_max_length=max_seq_len)
    else:
        raise RuntimeError("No such model_name %s" % model_name)

    if inner_class:
        return inner_model, inner_tokenizer, redo, max_seq_len
    return llm


class H2OGPT4All(gpt4all.GPT4All):
    """LangChain GPT4All wrapper that applies instruct prompting via ``prompter`` before generating.

    ``model`` may be a path string (a gpt4all client is created from it) or an
    already-constructed client object, which is then used as-is.
    """
    model: Any
    tokenizer: Any = None
    prompter: Any
    context: Any = ''
    iinput: Any = ''
    # left un-annotated on purpose: annotating would change how the field is declared
    chat_conversation = []
    user_prompt_for_fake_system_prompt: Any = None

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Create (or adopt) the gpt4all client and record its backend.

        :raises ValueError: if the gpt4all package cannot be imported
        """
        try:
            if isinstance(values["model"], str):
                from gpt4all import GPT4All as GPT4AllModel

                # split ".../dir/name" into directory (with trailing slash) and file name
                model_dir, delimiter, model_name = values["model"].rpartition("/")
                model_dir += delimiter

                values["client"] = GPT4AllModel(
                    model_name=model_name,
                    model_path=model_dir or None,
                    model_type=values["backend"],
                    allow_download=True,
                )
            else:
                # an already-loaded client was passed in
                values["client"] = values["model"]

            if values["n_threads"] is not None:
                # set n_threads
                values["client"].model.set_thread_count(values["n_threads"])

            try:
                values["backend"] = values["client"].model_type
            except AttributeError:
                # The below is for compatibility with GPT4All Python bindings <= 0.2.3.
                values["backend"] = values["client"].model.model_type
        except ImportError as e:
            raise ValueError(
                "Could not import gpt4all python package. "
                "Please install it with `pip install gpt4all`."
            ) from e
        return values

    def _call(
            self,
            prompt: str,
            stop: Optional[List[str]] = None,
            run_manager: Optional[CallbackManagerForLLMRun] = None,
            **kwargs,
    ) -> str:
        """Truncate ``prompt``, wrap it with the instruct template and generate a completion."""
        # Roughly 4 chars per token if natural language
        prompt = prompt[-self.max_tokens * 4:]

        # use instruct prompting
        data_point = dict(context=self.context, instruction=prompt, input=self.iinput)
        prompt = self.prompter.generate_prompt(
            data_point,
            chat_conversation=self.chat_conversation,
            user_prompt_for_fake_system_prompt=self.user_prompt_for_fake_system_prompt,
        )

        verbose = False
        if verbose:
            print("_call prompt: %s" % prompt, flush=True)
        # FIXME: GPT4ALl doesn't support yield during generate, so cannot support streaming except via itself to stdout
        return super()._call(prompt, stop=stop, run_manager=run_manager)

    # FIXME: Unsure what uses
    # def get_token_ids(self, text: str) -> List[int]:
    #    return self.client.tokenize(b" " + text.encode("utf-8"))


class H2OLlamaCpp(LlamaCpp):
    """LangChain LlamaCpp wrapper with instruct prompting, token counting and a time limit.

    ``model_path`` may be a path string (a ``Llama`` client is created from it) or an
    already-constructed client object, which is then used as-is.
    """
    model_path: Any
    tokenizer: Any = None
    prompter: Any
    context: Any
    iinput: Any
    # left un-annotated on purpose: annotating would change how the field is declared
    chat_conversation = []
    count_input_tokens: Any = 0
    prompts: Any = []
    count_output_tokens: Any = 0
    n_gpus: Any = -1
    max_time: Any = None
    user_prompt_for_fake_system_prompt: Any = None

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Create (or adopt) the llama.cpp client.

        :raises ModuleNotFoundError: if no llama-cpp module can be imported
        :raises ValueError: if the model cannot be loaded from ``model_path``
        """
        if not isinstance(values["model_path"], str):
            # an already-loaded client was passed in
            values["client"] = values["model_path"]
            return values

        model_path = values["model_path"]
        model_param_names = [
            "lora_path",
            "lora_base",
            "n_ctx",
            "n_parts",
            "seed",
            "f16_kv",
            "logits_all",
            "vocab_only",
            "use_mlock",
            "n_threads",
            "n_batch",
            "use_mmap",
            "last_n_tokens_size",
        ]
        model_params = {k: values[k] for k in model_param_names}
        # For backwards compatibility, only include if non-null.
        if values["n_gpu_layers"] is not None:
            model_params["n_gpu_layers"] = values["n_gpu_layers"]

        try:
            try:
                from llama_cpp import Llama
            except Exception as e:
                print("Failed to listen to n_gpus: %s, trying llama_cpp module" % str(e), flush=True)
                try:
                    from llama_cpp import Llama
                except ImportError:
                    from llama_cpp_cuda import Llama

            values["client"] = Llama(model_path, **model_params)
        except ImportError as e:
            raise ModuleNotFoundError(
                "Could not import llama-cpp-python library. "
                "Please install the llama-cpp-python library to "
                "use this embedding model: pip install llama-cpp-python"
            ) from e
        except Exception as e:
            raise ValueError(
                f"Could not load Llama model from path: {model_path}. "
                f"Received error {e}"
            ) from e
        return values

    def _call(
            self,
            prompt: str,
            stop: Optional[List[str]] = None,
            run_manager: Optional[CallbackManagerForLLMRun] = None,
            **kwargs,
    ) -> str:
        """Limit and template ``prompt``, then generate text under the llamacpp and coqui file locks.

        The prompter's stop sequences are added to ``stop`` (the caller's list is not
        modified).  In streaming mode generation is abandoned once ``max_time`` seconds
        have elapsed since the call started.  Input and output token counters are updated.

        :return: generated text, cut at the first occurrence of any stop sequence
        """
        t0 = time.time()
        verbose = False

        inner_tokenizer = FakeTokenizer(tokenizer=self.client, is_llama_cpp=True, model_max_length=self.n_ctx)
        from h2oai_pipeline import H2OTextGenerationPipeline
        prompt, _ = H2OTextGenerationPipeline.limit_prompt(prompt, inner_tokenizer,
                                                           max_prompt_length=self.max_tokens)

        # use instruct prompting
        data_point = dict(context=self.context, instruction=prompt, input=self.iinput)
        prompt = self.prompter.generate_prompt(
            data_point,
            chat_conversation=self.chat_conversation,
            user_prompt_for_fake_system_prompt=self.user_prompt_for_fake_system_prompt,
        )
        self.count_input_tokens += self.get_num_tokens(prompt)
        self.prompts.append(prompt)

        # build a fresh list so the caller's `stop` argument is never mutated
        stop = list(stop or []) + list(self.prompter.stop_sequences)

        if verbose:
            print("_call prompt: %s" % prompt, flush=True)

        # can't run llamacpp and coqui at same time, one has to win
        with filelock.FileLock(get_lock_file('llamacpp')), filelock.FileLock(get_lock_file(coqui_lock_name)):
            if self.streaming:
                # parent handler of streamer expects to see prompt first else output="" and lose if prompt=None in prompter
                pieces = []
                for token in self.stream(input=prompt, stop=stop):
                    if self.max_time is not None and (time.time() - t0) > self.max_time:
                        if verbose:
                            print("LLaMa.cpp reached max_time=%s" % self.max_time, flush=True)
                        break
                    pieces.append(token)
                text = "".join(pieces)
            else:
                params = {**self._get_parameters(stop), **kwargs}
                result = self.client(prompt=prompt, **params)
                text = result["choices"][0]["text"]

            self.count_output_tokens += self.get_num_tokens(text)
            return self.remove_stop_text(text, stop=stop)

    def remove_stop_text(self, text, stop=None):
        """Cut ``text`` at the first occurrence of each stop sequence, applied in order.

        Empty stop strings are ignored; they would otherwise match at index 0 and
        erase the whole text.
        """
        if stop is None:
            return text
        for stop_seq in stop:
            if stop_seq and stop_seq in text:
                text = text[:text.index(stop_seq)]
        return text

    def _stream(
            self,
            prompt: str,
            stop: Optional[List[str]] = None,
            run_manager: Optional[CallbackManagerForLLMRun] = None,
            **kwargs: Any,
    ) -> Iterator[GenerationChunk]:
        """Yield chunks from the parent stream until any stop sequence appears in the output so far.

        The chunk that completes a stop sequence is not yielded, and the stream ends there:
        the accumulated text only grows, so no later chunk could be yielded either.
        """
        # parent expects only see actual new tokens, not prompt too
        stop_seqs = [s for s in (stop or []) if s]  # an empty string would match everything
        total_text = ''
        for chunk in super()._stream(prompt, stop=stop, run_manager=run_manager, **kwargs):
            total_text += chunk.text
            if any(stop_seq in total_text for stop_seq in stop_seqs):
                return
            yield chunk

    def get_token_ids(self, text: str) -> List[int]:
        """Tokenize ``text`` (UTF-8 encoded, prefixed with a space) using the llama.cpp client."""
        return self.client.tokenize(b" " + text.encode("utf-8"))