Commit d9427a2
Parent(s): cc6683a

Trying to move calls to @spaces.GPU to specific gemma calls to use the local model more efficiently

Files changed:
- tools/chatfuncs.py +35 -40
- tools/llm_api_call.py +1 -3
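
On Hugging Face ZeroGPU Spaces, a GPU is allocated only for the duration of a function decorated with @spaces.GPU. Moving the decorator from the long-running extract_topics pipeline onto the functions that actually run the local Gemma model (load_model and call_llama_cpp_model) should therefore reserve the GPU only while the model itself is running. A minimal sketch of the pattern, assuming a hypothetical split between CPU-side prompt building and the GPU-side model call (the model path, settings, and helper names below are illustrative, not taken from this repo):

    import spaces
    from llama_cpp import Llama

    # Illustrative placeholder model path and settings, not this repo's configuration
    model = Llama(model_path="gemma-2b-it.Q4_K_M.gguf", n_gpu_layers=-1)

    @spaces.GPU  # a ZeroGPU device is reserved only while this function executes
    def run_local_model(prompt: str) -> str:
        # only the actual inference call needs the GPU
        output = model(prompt, max_tokens=512)
        return output["choices"][0]["text"]

    def full_pipeline(raw_text: str) -> str:
        # CPU-only preparation stays outside the decorated function,
        # so the GPU is not held while the prompt is being built
        prompt = "Summarise the following text:\n" + raw_text
        return run_local_model(prompt)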
tools/chatfuncs.py CHANGED
@@ -2,6 +2,7 @@ from typing import TypeVar
 import torch.cuda
 import os
 import time
+import spaces
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 from tools.helper_functions import RUN_LOCAL_MODEL
@@ -125,7 +126,7 @@ def get_model_path():
     print(f"Checking default Hugging Face folder. Downloading model from Hugging Face Hub if not found")
     return hf_hub_download(repo_id=repo_id, filename=filename)

-
+@spaces.GPU
 def load_model(local_model_type:str=local_model_type, gpu_layers:int=gpu_layers, max_context_length:int=context_length, gpu_config:llama_cpp_init_config_gpu=gpu_config, cpu_config:llama_cpp_init_config_cpu=cpu_config, torch_device:str=torch_device):
     '''
     Load in a model from Hugging Face hub via the transformers package, or using llama_cpp_python by downloading a GGUF file from Huggingface Hub.
@@ -172,15 +173,41 @@ def load_model(local_model_type:str=local_model_type, gpu_layers:int=gpu_layers,
     print(load_confirmation)
     return model, tokenizer

-
-
-
-
-
-
-
+@spaces.GPU
+def call_llama_cpp_model(formatted_string:str, gen_config:str, model=model):
+    """
+    Calls your generation model with parameters from the LlamaCPPGenerationConfig object.
+
+    Args:
+        formatted_string (str): The formatted input text for the model.
+        gen_config (LlamaCPPGenerationConfig): An object containing generation parameters.
+    """
+    # Extracting parameters from the gen_config object
+    temperature = gen_config.temperature
+    top_k = gen_config.top_k
+    top_p = gen_config.top_p
+    repeat_penalty = gen_config.repeat_penalty
+    seed = gen_config.seed
+    max_tokens = gen_config.max_tokens
+    stream = gen_config.stream
+
+    # Now you can call your model directly, passing the parameters:
+    output = model(
+        formatted_string,
+        temperature=temperature,
+        top_k=top_k,
+        top_p=top_p,
+        repeat_penalty=repeat_penalty,
+        seed=seed,
+        max_tokens=max_tokens,
+        stream=stream#,
+        #stop=["<|eot_id|>", "\n\n"]
+    )

+    return output

+
+# This function is not used in this app
 def llama_cpp_streaming(history, full_prompt, temperature=temperature):

     gen_config = LlamaCPPGenerationConfig()
@@ -213,35 +240,3 @@ def llama_cpp_streaming(history, full_prompt, temperature=temperature):
     print(f'Time for complete generation: {time_generate}s')
     print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
     print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
-
-def call_llama_cpp_model(formatted_string:str, gen_config:str, model=model):
-    """
-    Calls your generation model with parameters from the LlamaCPPGenerationConfig object.
-
-    Args:
-        formatted_string (str): The formatted input text for the model.
-        gen_config (LlamaCPPGenerationConfig): An object containing generation parameters.
-    """
-    # Extracting parameters from the gen_config object
-    temperature = gen_config.temperature
-    top_k = gen_config.top_k
-    top_p = gen_config.top_p
-    repeat_penalty = gen_config.repeat_penalty
-    seed = gen_config.seed
-    max_tokens = gen_config.max_tokens
-    stream = gen_config.stream
-
-    # Now you can call your model directly, passing the parameters:
-    output = model(
-        formatted_string,
-        temperature=temperature,
-        top_k=top_k,
-        top_p=top_p,
-        repeat_penalty=repeat_penalty,
-        seed=seed,
-        max_tokens=max_tokens,
-        stream=stream#,
-        #stop=["<|eot_id|>", "\n\n"]
-    )
-
-    return output
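
For context, a minimal sketch of how the relocated, GPU-decorated call_llama_cpp_model could be invoked from the rest of the app (the prompt text is a placeholder, and the defaults of LlamaCPPGenerationConfig are assumed to be defined in chatfuncs.py rather than shown in this diff):

    from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model

    # generation parameters (temperature, top_k, top_p, ...) come from the config object
    gen_config = LlamaCPPGenerationConfig()

    # with the @spaces.GPU decorator, a GPU is allocated only for the duration of this call
    output = call_llama_cpp_model("Extract the main topics from: ...", gen_config)
    print(output)  # llama_cpp returns a completion dict, or an iterator when stream=True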
tools/llm_api_call.py CHANGED
@@ -10,7 +10,6 @@ import json
 import math
 import string
 import re
-import spaces
 from rapidfuzz import process, fuzz
 from tqdm import tqdm
 from gradio import Progress
@@ -880,7 +879,6 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
     return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error


-@spaces.GPU
 def extract_topics(in_data_file,
                    file_data:pd.DataFrame,
                    existing_topics_table:pd.DataFrame,
@@ -1616,7 +1614,7 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:

     return latest_response_text, conversation_history, whole_conversation_metadata

-
+
 def summarise_output_topics(summarised_references:pd.DataFrame,
                             unique_table_df:pd.DataFrame,
                             reference_table_df:pd.DataFrame,