Commit 1f0d087
Parent(s): 3db2499

Moved model load to chatfuncs submodule to hopefully avoid gpu run issues

Files changed:
- app.py: +2 -88
- tools/chatfuncs.py: +94 -11
- tools/llm_api_call.py: +2 -2
app.py
CHANGED
@@ -9,11 +9,6 @@ from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt,
 #from tools.aws_functions import load_data_from_aws
 import gradio as gr
 import pandas as pd
-import tools.chatfuncs as chatf
-from tools.chatfuncs import llama_cpp_init_config_gpu, llama_cpp_init_config_cpu
-from llama_cpp import Llama
-from huggingface_hub import hf_hub_download
-from torch import cuda, backends
 from datetime import datetime

 today_rev = datetime.now().strftime("%Y%m%d")
@@ -27,90 +22,9 @@ access_logs_data_folder = 'logs/' + today_rev + '/' + host_name + '/'
 feedback_data_folder = 'feedback/' + today_rev + '/' + host_name + '/'
 usage_data_folder = 'usage/' + today_rev + '/' + host_name + '/'

-
-# Load local model
-###
-
-# Check for torch cuda
-print("Is CUDA enabled? ", cuda.is_available())
-print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
-if cuda.is_available():
-    torch_device = "cuda"
-    os.system("nvidia-smi")
-else:
-    torch_device = "cpu"
-
-print("Device used is: ", torch_device)
-
-
-@spaces.GPU
-def load_model(local_model_type:str, gpu_layers:int, max_context_length:int, gpu_config:llama_cpp_init_config_gpu=chatf.gpu_config, cpu_config:llama_cpp_init_config_cpu=chatf.cpu_config, torch_device:str=chatf.torch_device):
-    '''
-    Load in a model from Hugging Face hub via the transformers package, or using llama_cpp_python by downloading a GGUF file from Huggingface Hub.
-    '''
-    print("Loading model ", local_model_type)
-
-    if local_model_type == "Gemma 2b":
-        if torch_device == "cuda":
-            gpu_config.update_gpu(gpu_layers)
-            gpu_config.update_context(max_context_length)
-            print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU. And a maximum context length of ", gpu_config.n_ctx)
-        else:
-            gpu_config.update_gpu(gpu_layers)
-            cpu_config.update_gpu(gpu_layers)
-
-            # Update context length according to slider
-            gpu_config.update_context(max_context_length)
-            cpu_config.update_context(max_context_length)
-
-            print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU. And a maximum context length of ", gpu_config.n_ctx)
-
-        #print(vars(gpu_config))
-        #print(vars(cpu_config))
-
-        def get_model_path():
-            repo_id = os.environ.get("REPO_ID", "lmstudio-community/gemma-2-2b-it-GGUF")# "bartowski/Llama-3.2-3B-Instruct-GGUF") # "lmstudio-community/gemma-2-2b-it-GGUF")#"QuantFactory/Phi-3-mini-128k-instruct-GGUF")
-            filename = os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q8_0.gguf") # )"Llama-3.2-3B-Instruct-Q5_K_M.gguf") #"gemma-2-2b-it-Q8_0.gguf") #"Phi-3-mini-128k-instruct.Q4_K_M.gguf")
-            model_dir = "model/gemma" #"model/phi" # Assuming this is your intended directory
-
-            # Construct the expected local path
-            local_path = os.path.join(model_dir, filename)
-
-            if os.path.exists(local_path):
-                print(f"Model already exists at: {local_path}")
-                return local_path
-            else:
-                print(f"Checking default Hugging Face folder. Downloading model from Hugging Face Hub if not found")
-                return hf_hub_download(repo_id=repo_id, filename=filename)
-
-        model_path = get_model_path()
-
-        try:
-            print(vars(gpu_config))
-            llama_model = Llama(model_path=model_path, **vars(gpu_config)) # type_k=8, type_v = 8, flash_attn=True,
-
-        except Exception as e:
-            print("GPU load failed")
-            print(e)
-            llama_model = Llama(model_path=model_path, type_k=8, **vars(cpu_config)) # type_v = 8, flash_attn=True,
-
-        tokenizer = []
-
-    chatf.model = llama_model
-    chatf.tokenizer = tokenizer
-    chatf.local_model_type = local_model_type
+print("RUN_LOCAL_MODEL is:", RUN_LOCAL_MODEL)

-    load_confirmation = "Finished loading model: " + local_model_type
-
-    print(load_confirmation)
-    return local_model_type, load_confirmation, local_model_type
-
-
-# Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
-local_model_type = "Gemma 2b"
 if RUN_LOCAL_MODEL == "1":
-    load_model(local_model_type, chatf.gpu_layers, chatf.context_length, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
-
     default_model_choice = "gemma_2b_it_local"

 elif RUN_AWS_FUNCTIONS == "1":
@@ -351,7 +265,7 @@ COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')

 MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
-print(f'The value of
+print(f'The value of MAX_QUEUE_SIZE is {MAX_QUEUE_SIZE}')

 MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '100mb')
 print(f'The value of MAX_FILE_SIZE is {MAX_FILE_SIZE}')
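Note: the block removed from app.py above is essentially a GPU-first llama.cpp load with a CPU fallback, now relocated to tools/chatfuncs.py. The stripped-down sketch below is for reference only and is not part of the commit; the model path and the n_gpu_layers / n_ctx values are illustrative placeholders rather than the values defined in the repository's config objects.

from llama_cpp import Llama

def load_gguf(model_path: str, n_gpu_layers: int = -1, n_ctx: int = 4096) -> Llama:
    # Try to offload all layers to the GPU first; fall back to a CPU-only load.
    try:
        return Llama(model_path=model_path, n_gpu_layers=n_gpu_layers, n_ctx=n_ctx)
    except Exception as e:
        print("GPU load failed, falling back to CPU:", e)
        return Llama(model_path=model_path, n_gpu_layers=0, n_ctx=n_ctx)

llama_model = load_gguf("model/gemma/gemma-2-2b-it-Q8_0.gguf")  # placeholder path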
tools/chatfuncs.py
CHANGED
@@ -1,11 +1,11 @@
-
 from typing import TypeVar
-
-# Model packages
 import torch.cuda
-
+import os
 import time
 import spaces
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
+from tools.helper_functions import RUN_LOCAL_MODEL

 torch.cuda.empty_cache()

@@ -16,17 +16,26 @@ model_type = None # global variable setup
 full_text = "" # Define dummy source text (full text) just to enable highlight function to load

 model = [] # Define empty list for model functions to run
-tokenizer = [] # Define empty list for model functions to run
+tokenizer = [] #[] # Define empty list for model functions to run
+
+local_model_type = "Gemma 2b"

+# Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded

-#
+# Check for torch cuda
+print("Is CUDA enabled? ", torch.cuda.is_available())
+print("Is a CUDA device available on this computer?", torch.backends.cudnn.enabled)
 if torch.cuda.is_available():
     torch_device = "cuda"
     gpu_layers = -1
+    os.system("nvidia-smi")
 else:
     torch_device = "cpu"
     gpu_layers = 0

+print("Device used is: ", torch_device)
+
+
 print("Running on device:", torch_device)
 threads = torch.get_num_threads() # 8
 print("CPU threads:", threads)
@@ -79,7 +88,7 @@ gpu_config = llama_cpp_init_config_gpu()
 cpu_config = llama_cpp_init_config_cpu()


-class CtransGenGenerationConfig:
+class LlamaCPPGenerationConfig:
     def __init__(self, temperature=temperature,
                  top_k=top_k,
                  top_p=top_p,
@@ -99,10 +108,84 @@ class CtransGenGenerationConfig:
     def update_temp(self, new_value):
         self.temperature = new_value

+###
+# Load local model
+###
+
+@spaces.GPU
+def load_model(local_model_type:str, gpu_layers:int, max_context_length:int, gpu_config:llama_cpp_init_config_gpu=gpu_config, cpu_config:llama_cpp_init_config_cpu=cpu_config, torch_device:str=torch_device):
+    '''
+    Load in a model from Hugging Face hub via the transformers package, or using llama_cpp_python by downloading a GGUF file from Huggingface Hub.
+    '''
+    print("Loading model ", local_model_type)
+
+    if local_model_type == "Gemma 2b":
+        if torch_device == "cuda":
+            gpu_config.update_gpu(gpu_layers)
+            gpu_config.update_context(max_context_length)
+            print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU. And a maximum context length of ", gpu_config.n_ctx)
+        else:
+            gpu_config.update_gpu(gpu_layers)
+            cpu_config.update_gpu(gpu_layers)
+
+            # Update context length according to slider
+            gpu_config.update_context(max_context_length)
+            cpu_config.update_context(max_context_length)
+
+            print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU. And a maximum context length of ", gpu_config.n_ctx)
+
+        #print(vars(gpu_config))
+        #print(vars(cpu_config))
+
+        def get_model_path():
+            repo_id = os.environ.get("REPO_ID", "lmstudio-community/gemma-2-2b-it-GGUF")# "bartowski/Llama-3.2-3B-Instruct-GGUF") # "lmstudio-community/gemma-2-2b-it-GGUF")#"QuantFactory/Phi-3-mini-128k-instruct-GGUF")
+            filename = os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q8_0.gguf") # )"Llama-3.2-3B-Instruct-Q5_K_M.gguf") #"gemma-2-2b-it-Q8_0.gguf") #"Phi-3-mini-128k-instruct.Q4_K_M.gguf")
+            model_dir = "model/gemma" #"model/phi" # Assuming this is your intended directory
+
+            # Construct the expected local path
+            local_path = os.path.join(model_dir, filename)
+
+            if os.path.exists(local_path):
+                print(f"Model already exists at: {local_path}")
+                return local_path
+            else:
+                print(f"Checking default Hugging Face folder. Downloading model from Hugging Face Hub if not found")
+                return hf_hub_download(repo_id=repo_id, filename=filename)
+
+        model_path = get_model_path()
+
+        try:
+            print(vars(gpu_config))
+            llama_model = Llama(model_path=model_path, **vars(gpu_config)) # type_k=8, type_v = 8, flash_attn=True,
+
+        except Exception as e:
+            print("GPU load failed")
+            print(e)
+            llama_model = Llama(model_path=model_path, type_k=8, **vars(cpu_config)) # type_v = 8, flash_attn=True,
+
+        tokenizer = []
+
+    model = llama_model
+    tokenizer = tokenizer
+    local_model_type = local_model_type
+
+    load_confirmation = "Finished loading model: " + local_model_type
+
+    print(load_confirmation)
+    return local_model_type, load_confirmation, local_model_type, model, tokenizer
+
+###
+# Load local model
+###
+if RUN_LOCAL_MODEL == "1":
+    print("Loading model")
+    local_model_type, load_confirmation, local_model_type, model, tokenizer = load_model(local_model_type, gpu_layers, context_length, gpu_config, cpu_config, torch_device)
+    print("model loaded:", model)
+

 def llama_cpp_streaming(history, full_prompt, temperature=temperature):

-    gen_config = CtransGenGenerationConfig()
+    gen_config = LlamaCPPGenerationConfig()
     gen_config.update_temp(temperature)

     print(vars(gen_config))
@@ -134,13 +217,13 @@ def llama_cpp_streaming(history, full_prompt, temperature=temperature):
     print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')

 @spaces.GPU
-def call_llama_cpp_model(formatted_string, gen_config):
+def call_llama_cpp_model(formatted_string:str, gen_config:str):
     """
-    Calls your generation model with parameters from the CtransGenGenerationConfig object.
+    Calls your generation model with parameters from the LlamaCPPGenerationConfig object.

     Args:
         formatted_string (str): The formatted input text for the model.
-        gen_config (CtransGenGenerationConfig): An object containing generation parameters.
+        gen_config (LlamaCPPGenerationConfig): An object containing generation parameters.
     """
     # Extracting parameters from the gen_config object
     temperature = gen_config.temperature
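Note: after this change the model is loaded as a side effect of importing tools.chatfuncs (when RUN_LOCAL_MODEL is "1"), and generation goes through the renamed LlamaCPPGenerationConfig class. A minimal usage sketch follows for reference; it is not part of the commit, and the prompt string and temperature value are purely illustrative.

from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model

# Build a generation config with the module defaults, then adjust temperature
# with the same setter used in tools/llm_api_call.py.
gen_config = LlamaCPPGenerationConfig()
gen_config.update_temp(0.1)

response = call_llama_cpp_model("Summarise the responses in the table above.", gen_config)
print(response)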
tools/llm_api_call.py
CHANGED
@@ -19,7 +19,7 @@ GradioFileData = gr.FileData

 from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
 from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var, model_name_map, put_columns_in_df
-from tools.chatfuncs import model, CtransGenGenerationConfig, temperature, context_length, call_llama_cpp_model
+from tools.chatfuncs import model, LlamaCPPGenerationConfig, temperature, context_length, call_llama_cpp_model

 # ResponseObject class for AWS Bedrock calls
 class ResponseObject:
@@ -409,7 +409,7 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
         try:
             print("Calling Gemma 2B Instruct model, attempt", i + 1)

-            gen_config = CtransGenGenerationConfig()
+            gen_config = LlamaCPPGenerationConfig()
             gen_config.update_temp(temperature)

             response = call_llama_cpp_model(prompt, gen_config)