# tools/chatfuncs.py
from typing import TypeVar
import torch.cuda
import os
import time
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from tools.helper_functions import RUN_LOCAL_MODEL
torch.cuda.empty_cache()
PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
model_type = None # global variable setup
full_text = "" # Define dummy source text (full text) just to enable highlight function to load
model = [] # Define empty list for model functions to run
tokenizer = [] # Define empty list for model functions to run
local_model_type = "Gemma 2b"
# Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
# Check for torch cuda
print("Is CUDA enabled? ", torch.cuda.is_available())
print("Is a CUDA device available on this computer?", torch.backends.cudnn.enabled)
if torch.cuda.is_available():
torch_device = "cuda"
gpu_layers = -1
os.system("nvidia-smi")
else:
torch_device = "cpu"
gpu_layers = 0
print("Device used is: ", torch_device)
print("Running on device:", torch_device)
threads = torch.get_num_threads() # 8
print("CPU threads:", threads)
# Default generation and model-loading parameters
temperature: float = 0.1
top_k: int = 3
top_p: float = 1
repetition_penalty: float = 1.2  # Mild repetition penalty to prevent repeating table rows
last_n_tokens: int = 512
max_new_tokens: int = 4096
seed: int = 42
reset: bool = True
stream: bool = False
threads: int = threads
batch_size: int = 256
context_length: int = 16384
sample = True
class llama_cpp_init_config_gpu:
    def __init__(self,
                 last_n_tokens=last_n_tokens,
                 seed=seed,
                 n_threads=threads,
                 n_batch=batch_size,
                 n_ctx=context_length,
                 n_gpu_layers=gpu_layers):

        self.last_n_tokens = last_n_tokens
        self.seed = seed
        self.n_threads = n_threads
        self.n_batch = n_batch
        self.n_ctx = n_ctx
        self.n_gpu_layers = n_gpu_layers

    def update_gpu(self, new_value):
        self.n_gpu_layers = new_value

    def update_context(self, new_value):
        self.n_ctx = new_value

class llama_cpp_init_config_cpu(llama_cpp_init_config_gpu):
    def __init__(self):
        super().__init__()
        self.n_gpu_layers = gpu_layers
        self.n_ctx = context_length

gpu_config = llama_cpp_init_config_gpu()
cpu_config = llama_cpp_init_config_cpu()
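
# Illustrative usage (a sketch, not executed on import): the shared config objects
# can be adjusted before load_model() is called, which is also what load_model()
# does internally via the slider values. The numbers below are examples only.
# gpu_config.update_gpu(-1)        # offload every layer to the GPU
# gpu_config.update_context(8192)  # smaller context window to reduce VRAM use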
class LlamaCPPGenerationConfig:
    def __init__(self, temperature=temperature,
                 top_k=top_k,
                 top_p=top_p,
                 repeat_penalty=repetition_penalty,
                 seed=seed,
                 stream=stream,
                 max_tokens=max_new_tokens):

        self.temperature = temperature
        self.top_k = top_k
        self.top_p = top_p
        self.repeat_penalty = repeat_penalty
        self.seed = seed
        self.max_tokens = max_tokens
        self.stream = stream

    def update_temp(self, new_value):
        self.temperature = new_value
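
# Illustrative usage: build a generation config from the module defaults and lower
# the temperature for more deterministic output. The value below is an example only.
# gen_config = LlamaCPPGenerationConfig()
# gen_config.update_temp(0.05)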
###
# Load local model
###
def get_model_path():
    # Model repo and file can be overridden with the REPO_ID and MODEL_FILE environment variables.
    # Previously used alternatives: "bartowski/Llama-3.2-3B-Instruct-GGUF", "QuantFactory/Phi-3-mini-128k-instruct-GGUF"
    repo_id = os.environ.get("REPO_ID", "lmstudio-community/gemma-2-2b-it-GGUF")
    # Previously used alternatives: "Llama-3.2-3B-Instruct-Q5_K_M.gguf", "Phi-3-mini-128k-instruct.Q4_K_M.gguf"
    filename = os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q8_0.gguf")
    model_dir = "model/gemma"  # Intended local model directory

    # Construct the expected local path and use it if the file is already present
    local_path = os.path.join(model_dir, filename)

    if os.path.exists(local_path):
        print(f"Model already exists at: {local_path}")
        return local_path
    else:
        print("Model not found locally. Checking the default Hugging Face cache and downloading from the Hugging Face Hub if necessary.")
        return hf_hub_download(repo_id=repo_id, filename=filename)
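
# Illustrative usage: point the app at a different GGUF model via environment
# variables before get_model_path() runs. The repo and filename below are taken
# from the alternatives noted above and are examples only.
# os.environ["REPO_ID"] = "bartowski/Llama-3.2-3B-Instruct-GGUF"
# os.environ["MODEL_FILE"] = "Llama-3.2-3B-Instruct-Q5_K_M.gguf"
# model_path = get_model_path()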
def load_model(local_model_type:str=local_model_type, gpu_layers:int=gpu_layers, max_context_length:int=context_length, gpu_config:llama_cpp_init_config_gpu=gpu_config, cpu_config:llama_cpp_init_config_cpu=cpu_config, torch_device:str=torch_device):
    '''
    Load a model by downloading a GGUF file from the Hugging Face Hub (if not already present locally) and initialising it with llama-cpp-python.
    '''
    print("Loading model ", local_model_type)

    if local_model_type == "Gemma 2b":
        if torch_device == "cuda":
            gpu_config.update_gpu(gpu_layers)
            gpu_config.update_context(max_context_length)
            print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU and a maximum context length of", gpu_config.n_ctx)
        else:
            gpu_config.update_gpu(gpu_layers)
            cpu_config.update_gpu(gpu_layers)

            # Update context length according to slider
            gpu_config.update_context(max_context_length)
            cpu_config.update_context(max_context_length)
            print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU and a maximum context length of", gpu_config.n_ctx)

        model_path = get_model_path()

        try:
            print("GPU load variables:", vars(gpu_config))
            llama_model = Llama(model_path=model_path, **vars(gpu_config))  # type_k=8, type_v=8, flash_attn=True
        except Exception as e:
            print("GPU load failed")
            print(e)
            llama_model = Llama(model_path=model_path, type_k=8, **vars(cpu_config))  # type_v=8, flash_attn=True

        tokenizer = []  # llama-cpp-python tokenises internally, so no separate tokenizer is returned

    model = llama_model

    load_confirmation = "Finished loading model: " + local_model_type
    print(load_confirmation)

    return model, tokenizer
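
# Illustrative usage: load the default local model once at app initialisation so the
# GGUF file is downloaded up front rather than on the first user request.
# model, tokenizer = load_model()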
def call_llama_cpp_model(formatted_string:str, gen_config:LlamaCPPGenerationConfig, model=model):
    """
    Calls the llama.cpp model with parameters from a LlamaCPPGenerationConfig object.

    Args:
        formatted_string (str): The formatted input text for the model.
        gen_config (LlamaCPPGenerationConfig): An object containing generation parameters.
        model: The loaded llama-cpp-python model to call.
    """
    # Extract parameters from the gen_config object
    temperature = gen_config.temperature
    top_k = gen_config.top_k
    top_p = gen_config.top_p
    repeat_penalty = gen_config.repeat_penalty
    seed = gen_config.seed
    max_tokens = gen_config.max_tokens
    stream = gen_config.stream

    # Call the model directly, passing the extracted parameters
    output = model(
        formatted_string,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repeat_penalty=repeat_penalty,
        seed=seed,
        max_tokens=max_tokens,
        stream=stream
        # stop=["<|eot_id|>", "\n\n"]
    )

    return output
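
# Illustrative usage, assuming the model has been loaded with load_model() above.
# For non-streaming calls, llama-cpp-python returns a completion dict whose generated
# text sits in output["choices"][0]["text"], the same structure iterated over in
# llama_cpp_streaming below. The prompt here is an example only.
# model, tokenizer = load_model()
# gen_config = LlamaCPPGenerationConfig()
# response = call_llama_cpp_model("Summarise the following topics: ...", gen_config, model=model)
# print(response["choices"][0]["text"])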
# This function is not used in this app
def llama_cpp_streaming(history, full_prompt, temperature=temperature):

    gen_config = LlamaCPPGenerationConfig()
    gen_config.update_temp(temperature)

    print(vars(gen_config))

    # Pull the generated text from the streamer, and update the model output.
    start = time.time()
    NUM_TOKENS = 0
    print('-'*4+'Start Generation'+'-'*4)

    output = model(
        full_prompt, **vars(gen_config))

    history[-1][1] = ""
    for out in output:
        if "choices" in out and len(out["choices"]) > 0 and "text" in out["choices"][0]:
            history[-1][1] += out["choices"][0]["text"]
            NUM_TOKENS += 1
            yield history
        else:
            print(f"Unexpected output structure: {out}")

    time_generate = time.time() - start
    print('\n')
    print('-'*4+'End Generation'+'-'*4)
    print(f'Num of generated tokens: {NUM_TOKENS}')
    print(f'Time for complete generation: {time_generate}s')
    print(f'Tokens per second: {NUM_TOKENS/time_generate}')
    print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')