Ahmet Kaan Sever committed on
Commit 66ad5b0 · 2 Parent(s): 495c135 dbf76bc

Merge branch 'main' into aysu

requirements.txt CHANGED
@@ -7,5 +7,6 @@ python-jose
 python-multipart
 deepeval
 --extra-index-url https://download.pytorch.org/whl/cu113
+huggingface-hub>=0.29.1
 torch
 sentencepiece
src/deepeval/base_task.py CHANGED
@@ -76,8 +76,8 @@ class BaseTask(ABC):
         answer = self.tokenizer.decode(output[0][-1])

         return answer
-
-    def generate_response_mcqa_multi_token(self, msg, max_new_tokens=5, choices: list = []):
+
+    def generate_response_mcqa_multi_token(self, msg, max_new_tokens=2, choices: list = []):
         """
         Handles multiple-choice questions where answers might have multiple tokens.
         """
@@ -146,7 +146,7 @@ class BaseTask(ABC):
             {"role": "assistant", "content": "I am here to help you with any questions you may have."},
             {"role": "user", "content": prompt},
         ]
-
+
         formatted_chat = self.tokenizer.apply_chat_template(
             chat,
             tokenize=False,
@@ -188,6 +188,11 @@ class BaseTask(ABC):
         print("Loading dataset from Hugging Face.")
         dataset = load_dataset(self.dataset_repo, token=HF_TOKEN, split="train")
         print("Dataset loaded.")
+
+        # Load %25 of each dataset
+        print("Original dataset size: ", len(dataset))
+        dataset = dataset.shuffle(seed=42).select(range(int(len(dataset) * 0.25)))
+        print("Reduced dataset size: ", len(dataset))
         return dataset

     @abstractmethod
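
The new block in load_dataset_from_hf cuts every task's dataset to a deterministic 25% sample using the datasets library's shuffle and select. A minimal sketch of the same pattern on a toy in-memory dataset (the column names and sizes below are invented for illustration):

from datasets import Dataset

# Toy stand-in for the dataset pulled from the Hub (illustrative only).
full = Dataset.from_dict({
    "question": [f"q{i}" for i in range(100)],
    "answer": [f"a{i}" for i in range(100)],
})

# Same pattern as the commit: shuffle with a fixed seed, then keep the first 25%.
reduced = full.shuffle(seed=42).select(range(int(len(full) * 0.25)))

print(len(full), len(reduced))  # 100 25

Because the seed is fixed, every run of the evaluation sees the same 25% subset.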
src/deepeval/bias_task.py CHANGED
@@ -10,7 +10,7 @@ class BiasTask(BaseTask):

     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(3, len(dataset))))
+        return dataset

     def evaluate(self) -> dict[str, Any]:

src/deepeval/commonsense_reasoning_task.py CHANGED
@@ -10,7 +10,7 @@ class CommonsenseReasoningTask(BaseTask):

     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(10, len(dataset))))
+        return dataset


     def evaluate(self) -> dict[str, Any]:
@@ -57,7 +57,7 @@ class CommonsenseReasoningTask(BaseTask):
             message = prompt

             # Get/format answer of the model
-            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=10)
+            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
             responses.append(model_answer)
             model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()

src/deepeval/complex_reasoning.py CHANGED
@@ -11,7 +11,7 @@ class ComplexReasoningTask(BaseTask):

     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(10, len(dataset))))
+        return dataset


     def evaluate(self) -> dict[str, Any]:
src/deepeval/deepeval_task_manager.py CHANGED
@@ -74,11 +74,15 @@ class DeepEvalTaskManager:
         """Execute validated tasks in order."""
         results = {}
         for task_name, task_method in self.tasks_to_run.items():
-            print("Running task: ", task_name)
-            task_enum = getattr(Task, task_name)
-            task_value = task_enum.value
-            results[task_value] = task_method()  # Call the stored method reference
-
+            try:
+                print("Running task: ", task_name)
+                task_enum = getattr(Task, task_name)
+                task_value = task_enum.value
+                results[task_value] = task_method()  # Call the stored method reference
+            except Exception as e:
+                print(f"Error At Task: {task_name} - {e}")
+                continue
+        print("All tasks completed.")
         return results

     def sentiment_analysis_tr(self):
@@ -182,6 +186,6 @@ class DeepEvalTaskManager:
        return res

if __name__ == "__main__":
-    des = DeepEvalTaskManager("google/gemma-2b-it", ["BIAS_MC"])
+    des = DeepEvalTaskManager("google/gemma-2-2b-it", ["TOXICITY", "BIAS"])
    res = des.run_tasks()
    print(res)
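
Wrapping each task in try/except means a single failing task no longer aborts the whole run: the error is printed and the loop moves on. A rough standalone sketch of that pattern (the task names and lambdas below are made up for illustration):

def run_all(tasks_to_run: dict) -> dict:
    results = {}
    for task_name, task_method in tasks_to_run.items():
        try:
            results[task_name] = task_method()
        except Exception as e:
            # Log the failure and keep going so the remaining tasks still run.
            print(f"Error At Task: {task_name} - {e}")
            continue
    print("All tasks completed.")
    return results

# The second task raises, but the third still runs.
print(run_all({
    "SENTIMENT": lambda: {"acc": 0.9},
    "BROKEN": lambda: 1 / 0,
    "NLI": lambda: {"acc": 0.8},
}))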
src/deepeval/faithfulness_task.py CHANGED
@@ -9,7 +9,7 @@ class FaithfulnessTask(BaseTask):

     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(3, len(dataset))))
+        return dataset

     def evaluate(self) -> dict[str, Any]:

src/deepeval/instruction_following_task.py CHANGED
@@ -10,7 +10,7 @@ class InstructionFollowingTask(BaseTask):

     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(3, len(dataset))))
+        return dataset

     def evaluate(self) -> dict[str, Any]:
         results = []
src/deepeval/nli.py CHANGED
@@ -10,7 +10,7 @@ class NLITask(BaseTask):

     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(10, len(dataset))))
+        return dataset


     def evaluate(self) -> dict[str, Any]:
@@ -48,7 +48,7 @@ class NLITask(BaseTask):
             message = prompt

             # Get/format answer of the model
-            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=10)
+            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
             responses.append(model_answer)
             model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()

src/deepeval/reading_comp_mc.py CHANGED
@@ -11,7 +11,7 @@ class ReadingComprehensionMCTask(BaseTask):

     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(10, len(dataset))))
+        return dataset


     def evaluate(self) -> dict[str, Any]:
@@ -41,8 +41,9 @@ class ReadingComprehensionMCTask(BaseTask):
                 answer_index = answer
             else:
                 answer_index = int(answer)
-            correct_answer_letter = chr(64 + answer_index) # 65 - 1 since we need the indexing to start from 0
-
+
+            answer_index = answer_index - 1 # Because the answer is 1-indexed
+            correct_answer_letter = chr(65 + answer_index)

             # Construct the prompt/message
             instruction = ""
src/deepeval/reading_comprehension_task.py CHANGED
@@ -28,7 +28,7 @@ class ReadingComprehensionTask(BaseTask):

     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(3, len(dataset))))
+        return dataset

     def evaluate(self) -> dict[str, Any]:
         results = []
src/deepeval/sentiment_analysis_task.py CHANGED
@@ -9,7 +9,7 @@ class SentimentAnalysisTask(BaseTask):
     def load_dataset_from_hf(self):
         print("Loading the dataset")
         dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(10, len(dataset))))
+        return dataset


     def evaluate(self) -> dict[str, Any]:
src/deepeval/summarization_task.py CHANGED
@@ -9,7 +9,7 @@ class SummarizationTask(BaseTask):

     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(3, len(dataset))))
+        return dataset

     def evaluate(self) -> dict[str, Any]:
         results = []
src/deepeval/toxicity_task.py CHANGED
@@ -9,7 +9,7 @@ class ToxicityTask(BaseTask):

     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(3, len(dataset))))
+        return dataset


     def evaluate(self) -> dict[str, Any]:
src/deepeval/truthfulness_task.py CHANGED
@@ -26,7 +26,7 @@ class TruthfulnessTask(BaseTask):

     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(3, len(dataset))))
+        return dataset

     def evaluate(self) -> dict[str, Any]:
         results = []
src/deepeval/turkish_general_knowledge_task.py CHANGED
@@ -9,7 +9,7 @@ class TurkishGeneralKnowledgeTask(BaseTask):

     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(10, len(dataset))))
+        return dataset

     def evaluate(self):
         responses = []
svc/router.py CHANGED
@@ -1,3 +1,4 @@
+from datetime import datetime, timedelta
 from fastapi import APIRouter, HTTPException, Depends
 import logging

@@ -8,11 +9,13 @@ from auth.authentication import get_current_user, create_access_token
 from dotenv import load_dotenv
 import os
 import json
+from pathlib import Path
 from src.deepeval.deepeval_task_manager import DeepEvalTaskManager
 import torch
 import gc
 from time import time
 from huggingface_hub import HfApi, ModelInfo
+import threading


 router = APIRouter()
@@ -25,7 +28,6 @@ HF_TOKEN = os.getenv("HF_TOKEN")

 # Or configure a HfApi client
 hf_api = HfApi(
-    endpoint="https://huggingface.co", # Can be a Private Hub endpoint.
     token=HF_TOKEN, # Token is not persisted on the machine.
 )

@@ -48,6 +50,11 @@ async def deep_eval_status():
     #Return running with 200 status code
     return {"status": "running"}

+@router.get("/deepeval/hardware")
+def hardware_status():
+    info = get_gpu_tier()
+    return info
+
 @router.post("/chat", response_model=TaskResponse)
 def inference_model(request: LMHarnessTaskRequest, username: str = Depends(get_current_user)):
     logger.info(request)
@@ -82,48 +89,83 @@ def inference_model(request: LMHarnessTaskRequest, username: str = Depends(get_c
     return TaskResponse(results=dumped)


-@router.post("/deepeval/eval", response_model=TaskResponse)
-async def deep_eval_suite(request: DeepEvalSuiteRequest):
-    des = DeepEvalTaskManager(request.model_name, request.tasks)
-    start_time = time()
-    results = des.run_tasks() #TODO: format should be different. Check metunlp/results repo for the correct format
-    end_time = time()
-    duration = round(end_time - start_time, 2) # total_evaluation_time_seconds
-
-    model_info: ModelInfo = hf_api.model_info(request.model_name)
-
-    config = {
-        "model_source": "hf",
-        "num_fewshot": 0,
-        "batch_size": 8,
-        "batch_sizes": [],
-        "device": "cuda:0", # TODO: take this from requests
-        # "no_cache": true,
-        # "limit": null,
-        # "bootstrap_iters": 100000,
-        # "description_dict": null,
-        "model_dtype": "torch.float16", # TODO: take this from requests
-        "model_name": request.model_name,
-        "model_sha": model_info.sha
-    }
-
-    tbr_dict = {
-        "results": results,
-        "config": config,
-        "total_evaluation_time_seconds": duration,
-        "start_time": start_time,
-        "end_time": end_time
-    }
-
-    json_results = json.dumps(tbr_dict)
-
-    #Free up VRAM
-    torch.cuda.empty_cache()
-
-    #Free up RAM
-    des = None
-    gc.collect()
-
-    return TaskResponse(results=json_results)


+@router.post("/deepeval/eval", response_model=TaskResponse)
+def deep_eval_suite(request: DeepEvalSuiteRequest):
+    def run_in_background():
+        try:
+            torch.cuda.empty_cache()
+            des = DeepEvalTaskManager(request.model_name, request.tasks)
+
+            start_time = time()
+            results = des.run_tasks()
+            end_time = time()
+            duration = round(end_time - start_time, 2)
+
+            model_info: ModelInfo = hf_api.model_info(request.model_name)
+
+            config = {
+                "model_source": "hf",
+                "num_fewshot": 0,
+                "batch_size": 8,
+                "device": "cuda:0",
+                "model_dtype": "torch.float16",
+                "model_name": request.model_name,
+                "model_sha": model_info.sha,
+            }
+
+            final_results = {
+                "results": results,
+                "config": config,
+                "total_evaluation_time_seconds": duration,
+                "start_time": start_time,
+                "end_time": end_time
+            }
+
+            # Save and upload
+            dumped = json.dumps(final_results, indent=2)
+            path = Path("/tmp", request.model_name, f"results_{datetime.now()}.json")
+            path.parent.mkdir(parents=True, exist_ok=True)
+            path.write_text(dumped)
+
+            RESULTS_REPO = "metunlp/results"
+            hf_api.upload_file(
+                path_or_fileobj=path,
+                path_in_repo=path.relative_to("/tmp").as_posix(),
+                repo_id=RESULTS_REPO,
+                repo_type="dataset",
+            )
+
+            logger.info(f"✅ Uploaded results to HF Hub for {request.model_name}")
+
+        except Exception as e:
+            logger.exception(f"❌ Background evaluation failed: {e}")
+
+    # Start evaluation in background
+    threading.Thread(target=run_in_background, daemon=True).start()
+
+    # ✅ Immediately respond
+    return TaskResponse(results=json.dumps({"status": "Evaluation started in background"}))
+
+
+
+
+def get_gpu_tier():
+    if not torch.cuda.is_available():
+        return {"gpu": "CPU", "tier": "cpu"}
+
+    gpu_name = torch.cuda.get_device_name(0).lower()
+
+    # Normalize GPU model to your custom tier system
+    if "t4" in gpu_name:
+        # You can improve this by checking memory or other context
+        return {"gpu": "Tesla T4", "tier": "t4-medium"}
+    elif "l4" in gpu_name:
+        return {"gpu": "NVIDIA L4", "tier": "l4x1"}
+    elif "l40s" in gpu_name:
+        return {"gpu": "NVIDIA L40S", "tier": "l40sx1"}
+    elif "a10g" in gpu_name:
+        return {"gpu": "NVIDIA A10G", "tier": "a10g"}
+    else:
+        return {"gpu": gpu_name, "tier": "unknown"}