from typing import TypeVar
import os
import time
import torch
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from tools.helper_functions import RUN_LOCAL_MODEL

torch.cuda.empty_cache()

PandasDataFrame = TypeVar('pd.core.frame.DataFrame')

model_type = None # Global variable setup
full_text = "" # Dummy source text (full text) just to enable the highlight function to load
model = [] # Empty placeholder until a model is loaded
tokenizer = [] # Empty placeholder until a tokenizer is loaded
local_model_type = "Gemma 2b"

# Models are loaded on app initialisation so that users don't have to wait for them to be downloaded

# Check for torch CUDA support
print("Is CUDA enabled?", torch.cuda.is_available())
print("Is cuDNN enabled?", torch.backends.cudnn.enabled)

if torch.cuda.is_available():
    torch_device = "cuda"
    gpu_layers = -1  # Offload all model layers to the GPU
    os.system("nvidia-smi")
else:
    torch_device = "cpu"
    gpu_layers = 0

print("Running on device:", torch_device)

threads = torch.get_num_threads()
print("CPU threads:", threads)

temperature: float = 0.1
top_k: int = 3
top_p: float = 1
repetition_penalty: float = 1.2 # Mild repetition penalty to prevent repeating table rows
last_n_tokens: int = 512
max_new_tokens: int = 4096
seed: int = 42
reset: bool = True
stream: bool = False
batch_size: int = 256
context_length: int = 16384
sample = True

class llama_cpp_init_config_gpu:
    def __init__(self,
                 last_n_tokens_size=last_n_tokens,
                 seed=seed,
                 n_threads=threads,
                 n_batch=batch_size,
                 n_ctx=context_length,
                 n_gpu_layers=gpu_layers):

        # Attribute names match llama-cpp-python's Llama() keyword arguments so that the
        # whole config can be passed with **vars(config)
        self.last_n_tokens_size = last_n_tokens_size
        self.seed = seed
        self.n_threads = n_threads
        self.n_batch = n_batch
        self.n_ctx = n_ctx
        self.n_gpu_layers = n_gpu_layers
        # self.stop: list[str] = field(default_factory=lambda: [stop_string])

    def update_gpu(self, new_value):
        self.n_gpu_layers = new_value

    def update_context(self, new_value):
        self.n_ctx = new_value

class llama_cpp_init_config_cpu(llama_cpp_init_config_gpu):
    def __init__(self):
        super().__init__()
        self.n_gpu_layers = 0  # CPU config never offloads layers to the GPU
        self.n_ctx = context_length

gpu_config = llama_cpp_init_config_gpu()
cpu_config = llama_cpp_init_config_cpu()

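# Illustrative sketch (not called anywhere in the app): how the two config objects can be
# adjusted before loading, e.g. to shrink the context window on a low-memory machine.
# The 8192 value is an arbitrary example, not a recommendation from the original code.
def _example_adjust_configs(n_layers: int = 0, n_ctx: int = 8192):
    gpu_config.update_gpu(n_layers)
    gpu_config.update_context(n_ctx)
    cpu_config.update_gpu(0)
    cpu_config.update_context(n_ctx)
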
class LlamaCPPGenerationConfig:
    def __init__(self, temperature=temperature,
                 top_k=top_k,
                 top_p=top_p,
                 repeat_penalty=repetition_penalty,
                 seed=seed,
                 stream=stream,
                 max_tokens=max_new_tokens):

        self.temperature = temperature
        self.top_k = top_k
        self.top_p = top_p
        self.repeat_penalty = repeat_penalty
        self.seed = seed
        self.max_tokens = max_tokens
        self.stream = stream

    def update_temp(self, new_value):
        self.temperature = new_value

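# Illustrative sketch: build a generation config with a higher temperature than the module
# default. update_temp is the only setter the class defines; other fields can be assigned
# directly on the instance. The 0.7 value is an arbitrary example.
def _example_generation_config(new_temperature: float = 0.7) -> LlamaCPPGenerationConfig:
    gen_config = LlamaCPPGenerationConfig()
    gen_config.update_temp(new_temperature)
    return gen_config
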
###
# Load local model
###

def get_model_path():
    repo_id = os.environ.get("REPO_ID", "lmstudio-community/gemma-2-2b-it-GGUF") # Alternatives: "bartowski/Llama-3.2-3B-Instruct-GGUF", "QuantFactory/Phi-3-mini-128k-instruct-GGUF"
    filename = os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q8_0.gguf") # Alternatives: "Llama-3.2-3B-Instruct-Q5_K_M.gguf", "Phi-3-mini-128k-instruct.Q4_K_M.gguf"
    model_dir = "model/gemma" # Local directory checked before falling back to the Hugging Face cache

    # Construct the expected local path
    local_path = os.path.join(model_dir, filename)

    if os.path.exists(local_path):
        print(f"Model already exists at: {local_path}")
        return local_path
    else:
        print("Model not found locally. Checking the default Hugging Face cache and downloading from the Hub if needed.")
        return hf_hub_download(repo_id=repo_id, filename=filename)

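# Illustrative usage (requires network access or a previously cached download): REPO_ID and
# MODEL_FILE can be set before launch to point at a different GGUF build. The values below
# are taken from the commented-out alternatives above.
def _example_model_path_override():
    os.environ["REPO_ID"] = "bartowski/Llama-3.2-3B-Instruct-GGUF"
    os.environ["MODEL_FILE"] = "Llama-3.2-3B-Instruct-Q5_K_M.gguf"
    return get_model_path()
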
def load_model(local_model_type:str=local_model_type, gpu_layers:int=gpu_layers, max_context_length:int=context_length, gpu_config:llama_cpp_init_config_gpu=gpu_config, cpu_config:llama_cpp_init_config_cpu=cpu_config, torch_device:str=torch_device):
    '''
    Load a GGUF model with llama-cpp-python, downloading it from the Hugging Face Hub if it is not already available locally.
    '''
    print("Loading model", local_model_type)

    if local_model_type == "Gemma 2b":
        if torch_device == "cuda":
            gpu_config.update_gpu(gpu_layers)
        else:
            gpu_config.update_gpu(gpu_layers)
            cpu_config.update_gpu(gpu_layers)

        # Update context length according to slider
        gpu_config.update_context(max_context_length)
        cpu_config.update_context(max_context_length)

        print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU and a maximum context length of", gpu_config.n_ctx)

        model_path = get_model_path()

        try:
            print("GPU load variables:", vars(gpu_config))
            llama_model = Llama(model_path=model_path, **vars(gpu_config)) # type_k=8, type_v=8, flash_attn=True
        except Exception as e:
            print("GPU load failed, falling back to CPU config")
            print(e)
            llama_model = Llama(model_path=model_path, type_k=8, **vars(cpu_config)) # type_v=8, flash_attn=True

        model = llama_model
        tokenizer = [] # No separate tokenizer object is needed; llama-cpp-python tokenises internally

    load_confirmation = "Finished loading model: " + local_model_type
    print(load_confirmation)

    return model, tokenizer

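# Illustrative usage (not executed on import): load the default Gemma 2b GGUF with the
# device settings detected above. On CPU-only machines gpu_layers will be 0.
def _example_load():
    loaded_model, loaded_tokenizer = load_model(
        local_model_type="Gemma 2b",
        gpu_layers=gpu_layers,
        max_context_length=context_length,
    )
    return loaded_model, loaded_tokenizer
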
def call_llama_cpp_model(formatted_string:str, gen_config:LlamaCPPGenerationConfig, model=model):
    """
    Call the loaded llama.cpp model with parameters from a LlamaCPPGenerationConfig object.

    Args:
        formatted_string (str): The formatted input text for the model.
        gen_config (LlamaCPPGenerationConfig): An object containing generation parameters.
        model: The loaded llama.cpp model object returned by load_model.
    """
    # Extract parameters from the gen_config object
    temperature = gen_config.temperature
    top_k = gen_config.top_k
    top_p = gen_config.top_p
    repeat_penalty = gen_config.repeat_penalty
    seed = gen_config.seed
    max_tokens = gen_config.max_tokens
    stream = gen_config.stream

    # Call the model directly, passing the parameters
    output = model(
        formatted_string,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repeat_penalty=repeat_penalty,
        seed=seed,
        max_tokens=max_tokens,
        stream=stream
        # stop=["<|eot_id|>", "\n\n"]
    )

    return output

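# Illustrative end-to-end call (not executed on import): generate a completion and pull the
# text out of the llama-cpp-python response dict. The prompt is a placeholder, not part of
# the original code, and assumes stream=False (the module default).
def _example_call(prompt: str = "Summarise the following text: ..."):
    loaded_model, _ = load_model()
    gen_config = LlamaCPPGenerationConfig()
    output = call_llama_cpp_model(prompt, gen_config, model=loaded_model)
    return output["choices"][0]["text"]
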
# This function is not used in this app
def llama_cpp_streaming(history, full_prompt, temperature=temperature, model=model):

    gen_config = LlamaCPPGenerationConfig()
    gen_config.update_temp(temperature)
    gen_config.stream = True # Streaming output is required so tokens can be yielded as they arrive

    print(vars(gen_config))

    # Pull the generated text from the streamer, and update the model output.
    start = time.time()
    NUM_TOKENS = 0
    print('-'*4+'Start Generation'+'-'*4)

    output = model(full_prompt, **vars(gen_config))

    history[-1][1] = ""
    for out in output:
        if "choices" in out and len(out["choices"]) > 0 and "text" in out["choices"][0]:
            history[-1][1] += out["choices"][0]["text"]
            NUM_TOKENS += 1
            yield history
        else:
            print(f"Unexpected output structure: {out}")

    time_generate = time.time() - start
    print('\n')
    print('-'*4+'End Generation'+'-'*4)
    print(f'Num of generated tokens: {NUM_TOKENS}')
    print(f'Time for complete generation: {time_generate}s')
    print(f'Tokens per second: {NUM_TOKENS/time_generate}')
    print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
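
# Illustrative sketch of driving the streaming helper above (hypothetical; the function is
# unused in this app). history follows the [[user_message, assistant_message], ...] layout
# the function expects, and loaded_model is the object returned by load_model.
def _example_stream(loaded_model, prompt: str = "Hello"):
    history = [[prompt, ""]]
    for history in llama_cpp_streaming(history, prompt, model=loaded_model):
        pass
    return history[-1][1]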