Merge branch 'main' into aysu
- requirements.txt +1 -0
- src/deepeval/base_task.py +8 -3
- src/deepeval/bias_task.py +1 -1
- src/deepeval/commonsense_reasoning_task.py +2 -2
- src/deepeval/complex_reasoning.py +1 -1
- src/deepeval/deepeval_task_manager.py +10 -6
- src/deepeval/faithfulness_task.py +1 -1
- src/deepeval/instruction_following_task.py +1 -1
- src/deepeval/nli.py +2 -2
- src/deepeval/reading_comp_mc.py +4 -3
- src/deepeval/reading_comprehension_task.py +1 -1
- src/deepeval/sentiment_analysis_task.py +1 -1
- src/deepeval/summarization_task.py +1 -1
- src/deepeval/toxicity_task.py +1 -1
- src/deepeval/truthfulness_task.py +1 -1
- src/deepeval/turkish_general_knowledge_task.py +1 -1
- svc/router.py +86 -44
requirements.txt
CHANGED
@@ -7,5 +7,6 @@ python-jose
 python-multipart
 deepeval
 --extra-index-url https://download.pytorch.org/whl/cu113
+huggingface-hub>=0.29.1
 torch
 sentencepiece
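
The only dependency change pins huggingface-hub at 0.29.1 or newer; svc/router.py below imports HfApi from it for the new results upload. A quick runtime check that the pin is satisfied (a minimal sketch, not part of the commit):

    import huggingface_hub

    # Prints the installed client version, e.g. "0.29.1".
    print(huggingface_hub.__version__)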
src/deepeval/base_task.py
CHANGED
@@ -76,8 +76,8 @@ class BaseTask(ABC):
         answer = self.tokenizer.decode(output[0][-1])
 
         return answer
-
-    def generate_response_mcqa_multi_token(self, msg, max_new_tokens=
+
+    def generate_response_mcqa_multi_token(self, msg, max_new_tokens=2, choices: list = []):
         """
         Handles multiple-choice questions where answers might have multiple tokens.
         """
@@ -146,7 +146,7 @@ class BaseTask(ABC):
             {"role": "assistant", "content": "I am here to help you with any questions you may have."},
             {"role": "user", "content": prompt},
         ]
-
+
         formatted_chat = self.tokenizer.apply_chat_template(
             chat,
             tokenize=False,
@@ -188,6 +188,11 @@ class BaseTask(ABC):
         print("Loading dataset from Hugging Face.")
         dataset = load_dataset(self.dataset_repo, token=HF_TOKEN, split="train")
         print("Dataset loaded.")
+
+        # Load %25 of each dataset
+        print("Original dataset size: ", len(dataset))
+        dataset = dataset.shuffle(seed=42).select(range(int(len(dataset) * 0.25)))
+        print("Reduced dataset size: ", len(dataset))
         return dataset
 
     @abstractmethod
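
The subsampling added here uses the Hugging Face datasets API: shuffle with a fixed seed, then select the first quarter of the shuffled indices, so every run evaluates the same 25% slice. A minimal stand-alone sketch of the pattern (the repo id is a placeholder; the real code uses self.dataset_repo and an HF token):

    from datasets import load_dataset

    # Hypothetical public dataset; the tasks load their own repos.
    dataset = load_dataset("user/some-eval-dataset", split="train")

    # Deterministic shuffle, then keep 25% of the rows.
    subset = dataset.shuffle(seed=42).select(range(int(len(dataset) * 0.25)))
    print(len(dataset), "->", len(subset))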
src/deepeval/bias_task.py
CHANGED
@@ -10,7 +10,7 @@ class BiasTask(BaseTask):
 
     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset
+        return dataset
 
     def evaluate(self) -> dict[str, Any]:
 
src/deepeval/commonsense_reasoning_task.py
CHANGED
@@ -10,7 +10,7 @@ class CommonsenseReasoningTask(BaseTask):
 
     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset
+        return dataset
 
 
     def evaluate(self) -> dict[str, Any]:
@@ -57,7 +57,7 @@ class CommonsenseReasoningTask(BaseTask):
         message = prompt
 
         # Get/format answer of the model
-        model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=
+        model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
         responses.append(model_answer)
         model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()
 
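
With max_new_tokens=2 the model can emit the option letter plus a stray whitespace or newline token, so the raw answer still needs normalizing before it is compared against the choices. A small illustration of the cleaning line above (pure string handling, no model involved):

    def clean_mcqa_answer(model_answer: str) -> str:
        # Drop whitespace/newlines and uppercase, so " a\n", "A " and "a" all compare equal.
        return model_answer.strip().replace('\n', '').replace(' ', '').upper()

    assert clean_mcqa_answer(" a\n") == "A"
    assert clean_mcqa_answer("B ") == "B"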
src/deepeval/complex_reasoning.py
CHANGED
@@ -11,7 +11,7 @@ class ComplexReasoningTask(BaseTask):
 
     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset
+        return dataset
 
 
     def evaluate(self) -> dict[str, Any]:
src/deepeval/deepeval_task_manager.py
CHANGED
@@ -74,11 +74,15 @@ class DeepEvalTaskManager:
         """Execute validated tasks in order."""
         results = {}
         for task_name, task_method in self.tasks_to_run.items():
-
-
-
-
-
+            try:
+                print("Running task: ", task_name)
+                task_enum = getattr(Task, task_name)
+                task_value = task_enum.value
+                results[task_value] = task_method()  # Call the stored method reference
+            except Exception as e:
+                print(f"Error At Task: {task_name} - {e}")
+                continue
+        print("All tasks completed.")
         return results
 
     def sentiment_analysis_tr(self):
@@ -182,6 +186,6 @@ class DeepEvalTaskManager:
         return res
 
 if __name__ == "__main__":
-    des = DeepEvalTaskManager("google/gemma-2b-it", ["
+    des = DeepEvalTaskManager("google/gemma-2-2b-it", ["TOXICITY", "BIAS"])
    res = des.run_tasks()
    print(res)
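
The rewritten loop wraps each task in try/except so one failing benchmark no longer aborts the whole suite, and it maps the enum member name back to its value for the results key. A self-contained sketch of the same dispatch pattern, with a made-up Task enum and dummy task callables standing in for the real methods:

    from enum import Enum

    class Task(Enum):  # hypothetical stand-in for the real Task enum
        TOXICITY = "toxicity_tr"
        BIAS = "bias_tr"

    def run_tasks(tasks_to_run: dict) -> dict:
        results = {}
        for task_name, task_method in tasks_to_run.items():
            try:
                print("Running task: ", task_name)
                results[getattr(Task, task_name).value] = task_method()
            except Exception as e:
                # Log and keep going; a broken task should not kill the suite.
                print(f"Error At Task: {task_name} - {e}")
                continue
        print("All tasks completed.")
        return results

    # BIAS raises, TOXICITY still lands in the results dict.
    print(run_tasks({"TOXICITY": lambda: 0.9, "BIAS": lambda: 1 / 0}))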
src/deepeval/faithfulness_task.py
CHANGED
@@ -9,7 +9,7 @@ class FaithfulnessTask(BaseTask):
 
     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset
+        return dataset
 
     def evaluate(self) -> dict[str, Any]:
 
src/deepeval/instruction_following_task.py
CHANGED
@@ -10,7 +10,7 @@ class InstructionFollowingTask(BaseTask):
 
     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset
+        return dataset
 
     def evaluate(self) -> dict[str, Any]:
         results = []
src/deepeval/nli.py
CHANGED
@@ -10,7 +10,7 @@ class NLITask(BaseTask):
 
     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset
+        return dataset
 
 
     def evaluate(self) -> dict[str, Any]:
@@ -48,7 +48,7 @@ class NLITask(BaseTask):
         message = prompt
 
         # Get/format answer of the model
-        model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=
+        model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
         responses.append(model_answer)
         model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()
 
src/deepeval/reading_comp_mc.py
CHANGED
@@ -11,7 +11,7 @@ class ReadingComprehensionMCTask(BaseTask):
 
     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset
+        return dataset
 
 
     def evaluate(self) -> dict[str, Any]:
@@ -41,8 +41,9 @@ class ReadingComprehensionMCTask(BaseTask):
             answer_index = answer
         else:
            answer_index = int(answer)
-
-
+
+        answer_index = answer_index - 1  # Because the answer is 1-indexed
+        correct_answer_letter = chr(65 + answer_index)
 
         # Construct the prompt/message
         instruction = ""
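
The two added lines convert the dataset's 1-indexed answer field into the letter the model is asked to produce: subtract one to get a 0-based index, then offset from ASCII "A" (code 65). A quick demonstration of the mapping:

    # Dataset stores answers as 1-indexed positions; options are labeled A, B, C, ...
    for answer in (1, 2, 3, 4):
        answer_index = answer - 1          # 1-indexed -> 0-indexed
        letter = chr(65 + answer_index)    # 0 -> 'A', 1 -> 'B', ...
        print(answer, "->", letter)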
src/deepeval/reading_comprehension_task.py
CHANGED
@@ -28,7 +28,7 @@ class ReadingComprehensionTask(BaseTask):
 
     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset
+        return dataset
 
     def evaluate(self) -> dict[str, Any]:
         results = []
src/deepeval/sentiment_analysis_task.py
CHANGED
@@ -9,7 +9,7 @@ class SentimentAnalysisTask(BaseTask):
     def load_dataset_from_hf(self):
         print("Loading the dataset")
         dataset = super().load_dataset_from_hf()
-        return dataset
+        return dataset
 
 
     def evaluate(self) -> dict[str, Any]:
src/deepeval/summarization_task.py
CHANGED
@@ -9,7 +9,7 @@ class SummarizationTask(BaseTask):
 
     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset
+        return dataset
 
     def evaluate(self) -> dict[str, Any]:
         results = []
src/deepeval/toxicity_task.py
CHANGED
@@ -9,7 +9,7 @@ class ToxicityTask(BaseTask):
 
     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset
+        return dataset
 
 
     def evaluate(self) -> dict[str, Any]:
src/deepeval/truthfulness_task.py
CHANGED
@@ -26,7 +26,7 @@ class TruthfulnessTask(BaseTask):
 
     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset
+        return dataset
 
     def evaluate(self) -> dict[str, Any]:
         results = []
src/deepeval/turkish_general_knowledge_task.py
CHANGED
@@ -9,7 +9,7 @@ class TurkishGeneralKnowledgeTask(BaseTask):
 
     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset
+        return dataset
 
     def evaluate(self):
         responses = []
svc/router.py
CHANGED
@@ -1,3 +1,4 @@
+from datetime import datetime, timedelta
 from fastapi import APIRouter, HTTPException, Depends
 import logging
 
@@ -8,11 +9,13 @@ from auth.authentication import get_current_user, create_access_token
 from dotenv import load_dotenv
 import os
 import json
+from pathlib import Path
 from src.deepeval.deepeval_task_manager import DeepEvalTaskManager
 import torch
 import gc
 from time import time
 from huggingface_hub import HfApi, ModelInfo
+import threading
 
 
 router = APIRouter()
@@ -25,7 +28,6 @@ HF_TOKEN = os.getenv("HF_TOKEN")
 
 # Or configure a HfApi client
 hf_api = HfApi(
-    endpoint="https://huggingface.co", # Can be a Private Hub endpoint.
     token=HF_TOKEN, # Token is not persisted on the machine.
 )
 
@@ -48,6 +50,11 @@ async def deep_eval_status():
     #Return running with 200 status code
     return {"status": "running"}
 
+@router.get("/deepeval/hardware")
+def hardware_status():
+    info = get_gpu_tier()
+    return info
+
 @router.post("/chat", response_model=TaskResponse)
 def inference_model(request: LMHarnessTaskRequest, username: str = Depends(get_current_user)):
     logger.info(request)
@@ -82,48 +89,83 @@ def inference_model(request: LMHarnessTaskRequest, username: str = Depends(get_current_user)):
     return TaskResponse(results=dumped)
 
 
-@router.post("/deepeval/eval", response_model=TaskResponse)
-async def deep_eval_suite(request: DeepEvalSuiteRequest):
-    des = DeepEvalTaskManager(request.model_name, request.tasks)
-    start_time = time()
-    results = des.run_tasks() #TODO: format should be different. Check metunlp/results repo for the correct format
-    end_time = time()
-    duration = round(end_time - start_time, 2) # total_evaluation_time_seconds
-
-    model_info: ModelInfo = hf_api.model_info(request.model_name)
-
-    config = {
-        "model_source": "hf",
-        "num_fewshot": 0,
-        "batch_size": 8,
-        "batch_sizes": [],
-        "device": "cuda:0", # TODO: take this from requests
-        # "no_cache": true,
-        # "limit": null,
-        # "bootstrap_iters": 100000,
-        # "description_dict": null,
-        "model_dtype": "torch.float16", # TODO: take this from requests
-        "model_name": request.model_name,
-        "model_sha": model_info.sha
-    }
-
-    tbr_dict = {
-        "results": results,
-        "config": config,
-        "total_evaluation_time_seconds": duration,
-        "start_time": start_time,
-        "end_time": end_time
-    }
-
-    json_results = json.dumps(tbr_dict)
-
-    #Free up VRAM
-    torch.cuda.empty_cache()
-
-    #Free up RAM
-    des = None
-    gc.collect()
-
-    return TaskResponse(results=json_results)
-
-
+
+
+@router.post("/deepeval/eval", response_model=TaskResponse)
+def deep_eval_suite(request: DeepEvalSuiteRequest):
+    def run_in_background():
+        try:
+            torch.cuda.empty_cache()
+            des = DeepEvalTaskManager(request.model_name, request.tasks)
+
+            start_time = time()
+            results = des.run_tasks()
+            end_time = time()
+            duration = round(end_time - start_time, 2)
+
+            model_info: ModelInfo = hf_api.model_info(request.model_name)
+
+            config = {
+                "model_source": "hf",
+                "num_fewshot": 0,
+                "batch_size": 8,
+                "device": "cuda:0",
+                "model_dtype": "torch.float16",
+                "model_name": request.model_name,
+                "model_sha": model_info.sha,
+            }
+
+            final_results = {
+                "results": results,
+                "config": config,
+                "total_evaluation_time_seconds": duration,
+                "start_time": start_time,
+                "end_time": end_time
+            }
+
+            # Save and upload
+            dumped = json.dumps(final_results, indent=2)
+            path = Path("/tmp", request.model_name, f"results_{datetime.now()}.json")
+            path.parent.mkdir(parents=True, exist_ok=True)
+            path.write_text(dumped)
+
+            RESULTS_REPO = "metunlp/results"
+            hf_api.upload_file(
+                path_or_fileobj=path,
+                path_in_repo=path.relative_to("/tmp").as_posix(),
+                repo_id=RESULTS_REPO,
+                repo_type="dataset",
+            )
+
+            logger.info(f"✅ Uploaded results to HF Hub for {request.model_name}")
+
+        except Exception as e:
+            logger.exception(f"❌ Background evaluation failed: {e}")
+
+    # 🚀 Start evaluation in background
+    threading.Thread(target=run_in_background, daemon=True).start()
+
+    # ✅ Immediately respond
+    return TaskResponse(results=json.dumps({"status": "Evaluation started in background"}))
+
+
+
+
+def get_gpu_tier():
+    if not torch.cuda.is_available():
+        return {"gpu": "CPU", "tier": "cpu"}
+
+    gpu_name = torch.cuda.get_device_name(0).lower()
+
+    # Normalize GPU model to your custom tier system
+    if "t4" in gpu_name:
+        # You can improve this by checking memory or other context
+        return {"gpu": "Tesla T4", "tier": "t4-medium"}
+    elif "l4" in gpu_name:
+        return {"gpu": "NVIDIA L4", "tier": "l4x1"}
+    elif "l40s" in gpu_name:
+        return {"gpu": "NVIDIA L40S", "tier": "l40sx1"}
+    elif "a10g" in gpu_name:
+        return {"gpu": "NVIDIA A10G", "tier": "a10g"}
+    else:
+        return {"gpu": gpu_name, "tier": "unknown"}
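
The rewritten endpoint is now fire-and-forget: it spawns a daemon thread, returns a 200 immediately, and the thread persists results to the Hub when evaluation finishes. A minimal sketch of the same pattern outside FastAPI (slow_eval is a placeholder for run_tasks plus the upload):

    import json
    import threading
    import time

    def slow_eval():
        # Placeholder for DeepEvalTaskManager.run_tasks() + the HF Hub upload.
        time.sleep(2)
        print("evaluation finished")

    # daemon=True means the thread will not block process shutdown,
    # so results must be persisted (here: the Hub upload) before exit.
    threading.Thread(target=slow_eval, daemon=True).start()
    print(json.dumps({"status": "Evaluation started in background"}))
    time.sleep(3)  # keep this demo process alive long enough for the thread to finish

One caveat of this design, visible in the diff: the caller gets no task results back, so downstream consumers must read them from the metunlp/results dataset repo rather than the HTTP response.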