seanpedrickcase committed
Commit d9427a2 · Parent: cc6683a

Trying to move the @spaces.GPU decorators onto the specific Gemma (local model) calls so that the local model is used more efficiently

Files changed (2):
  1. tools/chatfuncs.py (+35 / -40)
  2. tools/llm_api_call.py (+1 / -3)
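
On a ZeroGPU Space, @spaces.GPU reserves a GPU only while the decorated function runs, so moving the decorator from the long-running pipeline functions (extract_topics, summarise_output_topics) onto the functions that actually run the local model keeps GPU allocations short. A minimal sketch of the pattern this commit adopts, using illustrative helper names rather than the repo's real code:

# A minimal, illustrative sketch (assumed names, not this repo's code) of the
# decorator placement: only the function that actually runs the local model
# requests a GPU. Assumes a Hugging Face ZeroGPU Space where the `spaces`
# package is installed.
import spaces

def fake_generate(formatted_string: str) -> str:
    # Stand-in for the real llama.cpp call in tools/chatfuncs.py
    return f"echo: {formatted_string}"

@spaces.GPU  # GPU is allocated only for the duration of this call
def call_local_model(formatted_string: str) -> str:
    return fake_generate(formatted_string)

def run_pipeline(prompt: str) -> str:
    # Orchestration (file parsing, prompt building, logging) stays undecorated,
    # so it no longer holds a GPU for its whole runtime.
    formatted = f"### Instruction:\n{prompt}\n### Response:\n"
    return call_local_model(formatted)
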
tools/chatfuncs.py CHANGED
@@ -2,6 +2,7 @@ from typing import TypeVar
 import torch.cuda
 import os
 import time
+import spaces
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 from tools.helper_functions import RUN_LOCAL_MODEL
@@ -125,7 +126,7 @@ def get_model_path():
     print(f"Checking default Hugging Face folder. Downloading model from Hugging Face Hub if not found")
     return hf_hub_download(repo_id=repo_id, filename=filename)
 
-
+@spaces.GPU
 def load_model(local_model_type:str=local_model_type, gpu_layers:int=gpu_layers, max_context_length:int=context_length, gpu_config:llama_cpp_init_config_gpu=gpu_config, cpu_config:llama_cpp_init_config_cpu=cpu_config, torch_device:str=torch_device):
     '''
     Load in a model from Hugging Face hub via the transformers package, or using llama_cpp_python by downloading a GGUF file from Huggingface Hub.
@@ -172,15 +173,41 @@ def load_model(local_model_type:str=local_model_type, gpu_layers:int=gpu_layers,
     print(load_confirmation)
     return model, tokenizer
 
-###
-# Load local model
-###
-# if RUN_LOCAL_MODEL == "1":
-#     print("Loading model")
-#     local_model_type, load_confirmation, local_model_type, model, tokenizer = load_model(local_model_type, gpu_layers, context_length, gpu_config, cpu_config, torch_device)
-#     print("model loaded:", model)
+@spaces.GPU
+def call_llama_cpp_model(formatted_string:str, gen_config:str, model=model):
+    """
+    Calls your generation model with parameters from the LlamaCPPGenerationConfig object.
+
+    Args:
+        formatted_string (str): The formatted input text for the model.
+        gen_config (LlamaCPPGenerationConfig): An object containing generation parameters.
+    """
+    # Extracting parameters from the gen_config object
+    temperature = gen_config.temperature
+    top_k = gen_config.top_k
+    top_p = gen_config.top_p
+    repeat_penalty = gen_config.repeat_penalty
+    seed = gen_config.seed
+    max_tokens = gen_config.max_tokens
+    stream = gen_config.stream
+
+    # Now you can call your model directly, passing the parameters:
+    output = model(
+        formatted_string,
+        temperature=temperature,
+        top_k=top_k,
+        top_p=top_p,
+        repeat_penalty=repeat_penalty,
+        seed=seed,
+        max_tokens=max_tokens,
+        stream=stream#,
+        #stop=["<|eot_id|>", "\n\n"]
+    )
 
+    return output
 
+
+# This function is not used in this app
 def llama_cpp_streaming(history, full_prompt, temperature=temperature):
 
     gen_config = LlamaCPPGenerationConfig()
@@ -213,35 +240,3 @@ def llama_cpp_streaming(history, full_prompt, temperature=temperature):
     print(f'Time for complete generation: {time_generate}s')
     print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
     print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
-
-def call_llama_cpp_model(formatted_string:str, gen_config:str, model=model):
-    """
-    Calls your generation model with parameters from the LlamaCPPGenerationConfig object.
-
-    Args:
-        formatted_string (str): The formatted input text for the model.
-        gen_config (LlamaCPPGenerationConfig): An object containing generation parameters.
-    """
-    # Extracting parameters from the gen_config object
-    temperature = gen_config.temperature
-    top_k = gen_config.top_k
-    top_p = gen_config.top_p
-    repeat_penalty = gen_config.repeat_penalty
-    seed = gen_config.seed
-    max_tokens = gen_config.max_tokens
-    stream = gen_config.stream
-
-    # Now you can call your model directly, passing the parameters:
-    output = model(
-        formatted_string,
-        temperature=temperature,
-        top_k=top_k,
-        top_p=top_p,
-        repeat_penalty=repeat_penalty,
-        seed=seed,
-        max_tokens=max_tokens,
-        stream=stream#,
-        #stop=["<|eot_id|>", "\n\n"]
-    )
-
-    return output
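
With @spaces.GPU now on call_llama_cpp_model, callers only need to build a prompt and a LlamaCPPGenerationConfig; the GPU request happens inside chatfuncs.py. A hedged usage sketch, assuming the model has been loaded via load_model() and that LlamaCPPGenerationConfig exposes the attributes read in the diff above:

# Illustrative caller-side usage, not code from this commit. Assumes
# tools.chatfuncs defines LlamaCPPGenerationConfig and that a model has
# already been loaded by load_model().
from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model

gen_config = LlamaCPPGenerationConfig()
gen_config.stream = False  # non-streaming, so llama-cpp-python returns a completion dict

output = call_llama_cpp_model("Summarise the following table of topics:\n...", gen_config)
print(output["choices"][0]["text"])  # standard llama-cpp-python completion format
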
tools/llm_api_call.py CHANGED
@@ -10,7 +10,6 @@ import json
 import math
 import string
 import re
-import spaces
 from rapidfuzz import process, fuzz
 from tqdm import tqdm
 from gradio import Progress
@@ -880,7 +879,6 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
     return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
 
 
-@spaces.GPU
 def extract_topics(in_data_file,
                    file_data:pd.DataFrame,
                    existing_topics_table:pd.DataFrame,
@@ -1616,7 +1614,7 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
 
     return latest_response_text, conversation_history, whole_conversation_metadata
 
-@spaces.GPU
+
 def summarise_output_topics(summarised_references:pd.DataFrame,
                             unique_table_df:pd.DataFrame,
                             reference_table_df:pd.DataFrame,