seanpedrickcase committed on
Commit 1f0d087 · 1 Parent(s): 3db2499

Moved model load to the chatfuncs submodule to hopefully avoid GPU run issues

Files changed (3):
  1. app.py +2 -88
  2. tools/chatfuncs.py +94 -11
  3. tools/llm_api_call.py +2 -2
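In short, this commit moves the llama.cpp model load out of app.py and into tools/chatfuncs.py, which now downloads the GGUF file and loads the model at import time when RUN_LOCAL_MODEL is "1"; the generation config class is also renamed from CtransGenGenerationConfig to LlamaCPPGenerationConfig. A minimal sketch of the intended caller-side usage after the move (the prompt text and temperature value are illustrative only):

import tools.chatfuncs as chatf

# Importing the module is now what triggers the GGUF download and llama.cpp load
# (when RUN_LOCAL_MODEL == "1"); app.py no longer does this itself.
gen_config = chatf.LlamaCPPGenerationConfig()
gen_config.update_temp(0.1)  # illustrative temperature

response = chatf.call_llama_cpp_model("Reply with the single word OK.", gen_config)
print(response)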
app.py CHANGED
@@ -9,11 +9,6 @@ from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt,
 #from tools.aws_functions import load_data_from_aws
 import gradio as gr
 import pandas as pd
-import tools.chatfuncs as chatf
-from tools.chatfuncs import llama_cpp_init_config_gpu, llama_cpp_init_config_cpu
-from llama_cpp import Llama
-from huggingface_hub import hf_hub_download
-from torch import cuda, backends
 from datetime import datetime

 today_rev = datetime.now().strftime("%Y%m%d")
@@ -27,90 +22,9 @@ access_logs_data_folder = 'logs/' + today_rev + '/' + host_name + '/'
 feedback_data_folder = 'feedback/' + today_rev + '/' + host_name + '/'
 usage_data_folder = 'usage/' + today_rev + '/' + host_name + '/'

-###
-# Load local model
-###
-
-# Check for torch cuda
-print("Is CUDA enabled? ", cuda.is_available())
-print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
-if cuda.is_available():
-    torch_device = "cuda"
-    os.system("nvidia-smi")
-else:
-    torch_device = "cpu"
-
-print("Device used is: ", torch_device)
-
-
-@spaces.GPU
-def load_model(local_model_type:str, gpu_layers:int, max_context_length:int, gpu_config:llama_cpp_init_config_gpu=chatf.gpu_config, cpu_config:llama_cpp_init_config_cpu=chatf.cpu_config, torch_device:str=chatf.torch_device):
-    '''
-    Load in a model from Hugging Face hub via the transformers package, or using llama_cpp_python by downloading a GGUF file from Huggingface Hub.
-    '''
-    print("Loading model ", local_model_type)
-
-    if local_model_type == "Gemma 2b":
-        if torch_device == "cuda":
-            gpu_config.update_gpu(gpu_layers)
-            gpu_config.update_context(max_context_length)
-            print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU. And a maximum context length of ", gpu_config.n_ctx)
-        else:
-            gpu_config.update_gpu(gpu_layers)
-            cpu_config.update_gpu(gpu_layers)
-
-            # Update context length according to slider
-            gpu_config.update_context(max_context_length)
-            cpu_config.update_context(max_context_length)
-
-            print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU. And a maximum context length of ", gpu_config.n_ctx)
-
-        #print(vars(gpu_config))
-        #print(vars(cpu_config))
-
-        def get_model_path():
-            repo_id = os.environ.get("REPO_ID", "lmstudio-community/gemma-2-2b-it-GGUF") # "bartowski/Llama-3.2-3B-Instruct-GGUF") # "lmstudio-community/gemma-2-2b-it-GGUF") # "QuantFactory/Phi-3-mini-128k-instruct-GGUF")
-            filename = os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q8_0.gguf") # "Llama-3.2-3B-Instruct-Q5_K_M.gguf") # "gemma-2-2b-it-Q8_0.gguf") # "Phi-3-mini-128k-instruct.Q4_K_M.gguf")
-            model_dir = "model/gemma" # "model/phi" # Assuming this is your intended directory
-
-            # Construct the expected local path
-            local_path = os.path.join(model_dir, filename)
-
-            if os.path.exists(local_path):
-                print(f"Model already exists at: {local_path}")
-                return local_path
-            else:
-                print(f"Checking default Hugging Face folder. Downloading model from Hugging Face Hub if not found")
-                return hf_hub_download(repo_id=repo_id, filename=filename)
-
-        model_path = get_model_path()
-
-        try:
-            print(vars(gpu_config))
-            llama_model = Llama(model_path=model_path, **vars(gpu_config)) # type_k=8, type_v = 8, flash_attn=True,
-
-        except Exception as e:
-            print("GPU load failed")
-            print(e)
-            llama_model = Llama(model_path=model_path, type_k=8, **vars(cpu_config)) # type_v = 8, flash_attn=True,
-
-        tokenizer = []
-
-    chatf.model = llama_model
-    chatf.tokenizer = tokenizer
-    chatf.local_model_type = local_model_type
-
-    load_confirmation = "Finished loading model: " + local_model_type
-
-    print(load_confirmation)
-    return local_model_type, load_confirmation, local_model_type
-
-
-# Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
-local_model_type = "Gemma 2b"
+print("RUN_LOCAL_MODEL is:", RUN_LOCAL_MODEL)

 if RUN_LOCAL_MODEL == "1":
-    load_model(local_model_type, chatf.gpu_layers, chatf.context_length, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
-
     default_model_choice = "gemma_2b_it_local"

 elif RUN_AWS_FUNCTIONS == "1":
@@ -351,7 +265,7 @@ COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')

 MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
-print(f'The value of RUN_DIRECT_MODE is {MAX_QUEUE_SIZE}')
+print(f'The value of MAX_QUEUE_SIZE is {MAX_QUEUE_SIZE}')

 MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '100mb')
 print(f'The value of MAX_FILE_SIZE is {MAX_FILE_SIZE}')
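With the load logic gone, the only model-related work left at the top of app.py is picking a default model from the environment flags. A self-contained sketch of that branch, with the flag values hard-coded purely so the snippet runs on its own (in the app they come from environment variables via the helper functions), and a placeholder for the AWS branch, which this diff does not show:

RUN_LOCAL_MODEL = "1"      # normally read from the environment
RUN_AWS_FUNCTIONS = "0"    # normally read from the environment

print("RUN_LOCAL_MODEL is:", RUN_LOCAL_MODEL)

if RUN_LOCAL_MODEL == "1":
    default_model_choice = "gemma_2b_it_local"
elif RUN_AWS_FUNCTIONS == "1":
    # The AWS/Bedrock default lies outside this hunk; placeholder only.
    default_model_choice = "aws_bedrock_default_placeholder"

print("Default model choice:", default_model_choice)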
tools/chatfuncs.py CHANGED
@@ -1,11 +1,11 @@
-
 from typing import TypeVar
-
-# Model packages
 import torch.cuda
-from transformers import pipeline
+import os
 import time
 import spaces
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
+from tools.helper_functions import RUN_LOCAL_MODEL

 torch.cuda.empty_cache()

@@ -16,17 +16,26 @@ model_type = None # global variable setup
 full_text = "" # Define dummy source text (full text) just to enable highlight function to load

 model = [] # Define empty list for model functions to run
-tokenizer = [] # Define empty list for model functions to run
+tokenizer = [] #[] # Define empty list for model functions to run
+
+local_model_type = "Gemma 2b"

-# Currently set gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
+# Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
+
+# Check for torch cuda
+print("Is CUDA enabled? ", torch.cuda.is_available())
+print("Is a CUDA device available on this computer?", torch.backends.cudnn.enabled)
 if torch.cuda.is_available():
     torch_device = "cuda"
     gpu_layers = -1
+    os.system("nvidia-smi")
 else:
     torch_device = "cpu"
     gpu_layers = 0

+print("Device used is: ", torch_device)
+
+
 print("Running on device:", torch_device)
 threads = torch.get_num_threads() # 8
 print("CPU threads:", threads)
@@ -79,7 +88,7 @@ gpu_config = llama_cpp_init_config_gpu()
 cpu_config = llama_cpp_init_config_cpu()


-class CtransGenGenerationConfig:
+class LlamaCPPGenerationConfig:
     def __init__(self, temperature=temperature,
                  top_k=top_k,
                  top_p=top_p,
@@ -99,10 +108,84 @@ class CtransGenGenerationConfig:
     def update_temp(self, new_value):
         self.temperature = new_value

+###
+# Load local model
+###
+
+@spaces.GPU
+def load_model(local_model_type:str, gpu_layers:int, max_context_length:int, gpu_config:llama_cpp_init_config_gpu=gpu_config, cpu_config:llama_cpp_init_config_cpu=cpu_config, torch_device:str=torch_device):
+    '''
+    Load in a model from Hugging Face hub via the transformers package, or using llama_cpp_python by downloading a GGUF file from Huggingface Hub.
+    '''
+    print("Loading model ", local_model_type)
+
+    if local_model_type == "Gemma 2b":
+        if torch_device == "cuda":
+            gpu_config.update_gpu(gpu_layers)
+            gpu_config.update_context(max_context_length)
+            print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU. And a maximum context length of ", gpu_config.n_ctx)
+        else:
+            gpu_config.update_gpu(gpu_layers)
+            cpu_config.update_gpu(gpu_layers)
+
+            # Update context length according to slider
+            gpu_config.update_context(max_context_length)
+            cpu_config.update_context(max_context_length)
+
+            print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU. And a maximum context length of ", gpu_config.n_ctx)
+
+        #print(vars(gpu_config))
+        #print(vars(cpu_config))
+
+        def get_model_path():
+            repo_id = os.environ.get("REPO_ID", "lmstudio-community/gemma-2-2b-it-GGUF") # "bartowski/Llama-3.2-3B-Instruct-GGUF") # "lmstudio-community/gemma-2-2b-it-GGUF") # "QuantFactory/Phi-3-mini-128k-instruct-GGUF")
+            filename = os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q8_0.gguf") # "Llama-3.2-3B-Instruct-Q5_K_M.gguf") # "gemma-2-2b-it-Q8_0.gguf") # "Phi-3-mini-128k-instruct.Q4_K_M.gguf")
+            model_dir = "model/gemma" # "model/phi" # Assuming this is your intended directory
+
+            # Construct the expected local path
+            local_path = os.path.join(model_dir, filename)
+
+            if os.path.exists(local_path):
+                print(f"Model already exists at: {local_path}")
+                return local_path
+            else:
+                print(f"Checking default Hugging Face folder. Downloading model from Hugging Face Hub if not found")
+                return hf_hub_download(repo_id=repo_id, filename=filename)
+
+        model_path = get_model_path()
+
+        try:
+            print(vars(gpu_config))
+            llama_model = Llama(model_path=model_path, **vars(gpu_config)) # type_k=8, type_v = 8, flash_attn=True,
+
+        except Exception as e:
+            print("GPU load failed")
+            print(e)
+            llama_model = Llama(model_path=model_path, type_k=8, **vars(cpu_config)) # type_v = 8, flash_attn=True,
+
+        tokenizer = []
+
+    model = llama_model
+    tokenizer = tokenizer
+    local_model_type = local_model_type
+
+    load_confirmation = "Finished loading model: " + local_model_type
+
+    print(load_confirmation)
+    return local_model_type, load_confirmation, local_model_type, model, tokenizer
+
+###
+# Load local model
+###
+if RUN_LOCAL_MODEL == "1":
+    print("Loading model")
+    local_model_type, load_confirmation, local_model_type, model, tokenizer = load_model(local_model_type, gpu_layers, context_length, gpu_config, cpu_config, torch_device)
+    print("model loaded:", model)
+

 def llama_cpp_streaming(history, full_prompt, temperature=temperature):

-    gen_config = CtransGenGenerationConfig()
+    gen_config = LlamaCPPGenerationConfig()
     gen_config.update_temp(temperature)

     print(vars(gen_config))
@@ -134,13 +217,13 @@ def llama_cpp_streaming(history, full_prompt, temperature=temperature):
     print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')

 @spaces.GPU
-def call_llama_cpp_model(formatted_string, gen_config):
+def call_llama_cpp_model(formatted_string:str, gen_config:str):
     """
-    Calls your generation model with parameters from the CtransGenGenerationConfig object.
+    Calls your generation model with parameters from the LlamaCPPGenerationConfig object.

     Args:
         formatted_string (str): The formatted input text for the model.
-        gen_config (CtransGenGenerationConfig): An object containing generation parameters.
+        gen_config (LlamaCPPGenerationConfig): An object containing generation parameters.
     """
     # Extracting parameters from the gen_config object
     temperature = gen_config.temperature
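The core of the relocated load_model is a GPU-first load with a CPU fallback, pointed at a GGUF file that is used from a local folder if present and otherwise fetched from the Hugging Face Hub. A standalone sketch of that pattern follows; the function names here are illustrative, and in the diff the keyword arguments come from the llama_cpp_init_config_* objects rather than being spelled out:

import os
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

def resolve_gguf_path(model_dir="model/gemma",
                      repo_id="lmstudio-community/gemma-2-2b-it-GGUF",
                      filename="gemma-2-2b-it-Q8_0.gguf"):
    # Prefer a locally bundled copy; otherwise fall back to the Hub cache,
    # downloading on first use.
    local_path = os.path.join(model_dir, filename)
    if os.path.exists(local_path):
        return local_path
    return hf_hub_download(repo_id=repo_id, filename=filename)

def load_gguf(model_path, n_ctx=4096):  # n_ctx is illustrative; the app reads context_length from its config
    try:
        # Try to offload all layers to the GPU first.
        return Llama(model_path=model_path, n_gpu_layers=-1, n_ctx=n_ctx)
    except Exception as e:
        print("GPU load failed:", e)
        # Fall back to a CPU-only load rather than crashing the Space.
        return Llama(model_path=model_path, n_gpu_layers=0, n_ctx=n_ctx)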
tools/llm_api_call.py CHANGED
@@ -19,7 +19,7 @@ GradioFileData = gr.FileData

 from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt
 from tools.helper_functions import output_folder, detect_file_type, get_file_path_end, read_file, get_or_create_env_var, model_name_map, put_columns_in_df
-from tools.chatfuncs import model, CtransGenGenerationConfig, temperature, context_length, call_llama_cpp_model
+from tools.chatfuncs import model, LlamaCPPGenerationConfig, temperature, context_length, call_llama_cpp_model

 # ResponseObject class for AWS Bedrock calls
 class ResponseObject:
@@ -409,7 +409,7 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
     try:
         print("Calling Gemma 2B Instruct model, attempt", i + 1)

-        gen_config = CtransGenGenerationConfig()
+        gen_config = LlamaCPPGenerationConfig()
         gen_config.update_temp(temperature)

         response = call_llama_cpp_model(prompt, gen_config)
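In tools/llm_api_call.py only the class name changes; the local-model call path is otherwise the same. A hedged sketch of that path wrapped in the retry loop that send_request uses around it (the loop bound, the wrapper name, and the exception handling here are illustrative, since the diff only shows the body of a single attempt):

from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model

def call_local_model_with_retries(prompt, temperature, max_attempts=3):
    last_error = None
    for i in range(max_attempts):
        try:
            print("Calling Gemma 2B Instruct model, attempt", i + 1)
            # Build a fresh generation config and override only the temperature.
            gen_config = LlamaCPPGenerationConfig()
            gen_config.update_temp(temperature)
            return call_llama_cpp_model(prompt, gen_config)
        except Exception as e:
            last_error = e
    raise RuntimeError("Local model call failed after retries") from last_error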