aacengiz committed on
Commit
7b3d3a5
·
1 Parent(s): 847b372
auth/authentication.py DELETED
@@ -1,33 +0,0 @@
1
- from fastapi.security import OAuth2PasswordBearer
2
- from fastapi import HTTPException, Depends
3
- from jose import JWTError, jwt
4
- from datetime import datetime, timedelta
5
-
6
-
7
- SECRET_KEY = "llmbenchmark_tr" # your secret key
8
- ALGORITHM = "HS256"
9
- ACCESS_TOKEN_EXPIRE_MINUTES = 30
10
-
11
- oauth2_scheme = OAuth2PasswordBearer(tokenUrl="api/token")
12
-
13
- def create_access_token(data: dict):
14
- to_encode = data.copy()
15
- expire = datetime.now() + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
16
- to_encode.update({"exp": expire})
17
- encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
18
- return encoded_jwt
19
-
20
- def get_current_user(token: str = Depends(oauth2_scheme)):
21
- credentials_exception = HTTPException(
22
- status_code=401,
23
- detail="Could not validate credentials",
24
- headers={"WWW-Authenticate": "Bearer"},
25
- )
26
- try:
27
- payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
28
- username: str = payload.get("sub")
29
- if username is None:
30
- raise credentials_exception
31
- return username
32
- except JWTError:
33
- raise credentials_exception
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/deepeval/base_task.py CHANGED
@@ -3,7 +3,7 @@ import itertools
3
  from datasets import load_dataset
4
  import os
5
  from dotenv import load_dotenv
6
- from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList, LogitsProcessor, Gemma3ForCausalLM
7
  import torch
8
  from typing import List
9
  load_dotenv()
@@ -29,7 +29,7 @@ class BaseTask(ABC):
29
  @staticmethod
30
  def load_model(model_name: str, device):
31
  """Loads model and tokenizer once and caches it."""
32
- if "gemma-3" in model_name:
33
  model = Gemma3ForCausalLM.from_pretrained(
34
  model_name,
35
  #device_map=device, #Gives Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device. error
 
3
  from datasets import load_dataset
4
  import os
5
  from dotenv import load_dotenv
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList, LogitsProcessor#, Gemma3ForCausalLM
7
  import torch
8
  from typing import List
9
  load_dotenv()
 
29
  @staticmethod
30
  def load_model(model_name: str, device):
31
  """Loads model and tokenizer once and caches it."""
32
+ if False:#"gemma-3" in model_name:
33
  model = Gemma3ForCausalLM.from_pretrained(
34
  model_name,
35
  #device_map=device, #Gives Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device. error
src/deepeval/commonsense_reasoning_task.py CHANGED
@@ -2,13 +2,14 @@ from src.deepeval.base_task import BaseTask
2
  from src.deepeval.utils import accuracy, accuracy_standard_error
3
  from typing import Any
4
 
5
- class SentimentAnalysisTask(BaseTask):
6
  def __init__(self, model_name):
7
  super().__init__("metunlp/commonsense", model_name=model_name)
8
 
9
  def load_dataset_from_hf(self):
 
10
  dataset = super().load_dataset_from_hf()
11
- return dataset.select(range(min(10, len(dataset))))
12
 
13
 
14
  def evaluate(self) -> dict[str, Any]:
@@ -16,7 +17,7 @@ class SentimentAnalysisTask(BaseTask):
16
  total_count = len(self.dataset)
17
  n_correct = 0
18
  for row in self.dataset:
19
- sentence = row["sentence"]
20
  label = row["label"]
21
  choices=[row["choice1"], row["choice2"]]
22
  formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
@@ -28,17 +29,19 @@ class SentimentAnalysisTask(BaseTask):
28
  else:
29
  question = "Seçeneklerden hangisi uygun?" # Alternatif
30
 
31
- prompt = f"Premise:\n{line["text"]}\nSoru:{question}\nSeçenekler:\n{formatted_choices}"
32
 
33
  messages = prompt
34
-
35
- answer = self.generate_response_mcqa_multi_token(messages, choices=choices)
36
- print("Answer:", answer)
37
- responses.append(answer)
38
- correct_answer_letter = "A" if row["sentiment"] == "positive" else "B" if row["sentiment"] == "negative" else "C" if row["sentiment"] == "neutral" else None
39
- model_answer_cleaned = answer.strip().replace('\n', '').replace(' ', '').upper()
40
  if correct_answer_letter == model_answer_cleaned:
41
  n_correct += 1
 
 
 
42
 
43
  acc = accuracy(n_correct, total_count)
44
  acc_stderr = accuracy_standard_error(acc, total_count)
 
2
  from src.deepeval.utils import accuracy, accuracy_standard_error
3
  from typing import Any
4
 
5
+ class CommonsenseReasoningTask(BaseTask):
6
  def __init__(self, model_name):
7
  super().__init__("metunlp/commonsense", model_name=model_name)
8
 
9
  def load_dataset_from_hf(self):
10
+ print("Loading the dataset")
11
  dataset = super().load_dataset_from_hf()
12
+ return dataset.select(range(min(1, len(dataset))))
13
 
14
 
15
  def evaluate(self) -> dict[str, Any]:
 
17
  total_count = len(self.dataset)
18
  n_correct = 0
19
  for row in self.dataset:
20
+ print(row)
21
  label = row["label"]
22
  choices=[row["choice1"], row["choice2"]]
23
  formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
 
29
  else:
30
  question = "Seçeneklerden hangisi uygun?" # Alternatif
31
 
32
+ prompt = f"Bağlam:\n{row["text"]}\nÖnerme:\n{row["context"]}\nSoru:{question}\nSeçenekler:\n{formatted_choices}"
33
 
34
  messages = prompt
35
+
36
+ model_answer = self.generate_response_mcqa_multi_token(messages, choices=choices)
37
+
38
+ correct_answer_letter = "A" if row["answer"] == 1 else "B" if row["answer"] == 2 else None
39
+ model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()
 
40
  if correct_answer_letter == model_answer_cleaned:
41
  n_correct += 1
42
+ print(f"Correct Answer: {correct_answer_letter}")
43
+ print(f"Model Answer: {model_answer}")
44
+ print(f"Model Answer Cleaned: {model_answer_cleaned}")
45
 
46
  acc = accuracy(n_correct, total_count)
47
  acc_stderr = accuracy_standard_error(acc, total_count)
src/deepeval/deepeval_task_manager.py CHANGED
@@ -3,6 +3,7 @@ from dotenv import load_dotenv
3
  from enum import Enum
4
  from src.deepeval.turkish_general_knowledge_task import TurkishGeneralKnowledgeTask
5
  from src.deepeval.sentiment_analysis_task import SentimentAnalysisTask
 
6
  from typing import List
7
  load_dotenv()
8
 
@@ -12,6 +13,7 @@ class Task(Enum):
12
  # SUMMARIZATION = "summarization"
13
  SENTIMENT_ANALYSIS = "sentiment_analysis_tr"
14
  TURKISH_GENERAL_KNOWLEDGE = "turkish_general_knowledge"
 
15
 
16
 
17
  class DeepEvalTaskManager:
@@ -51,8 +53,13 @@ class DeepEvalTaskManager:
51
  res = turkish_general_knowledge_task.evaluate()
52
  return res
53
 
 
 
 
 
 
54
 
55
  if __name__ == "__main__":
56
- des = DeepEvalTaskManager("google/gemma-3-4b-it", ["TURKISH_GENERAL_KNOWLEDGE"])
57
  res = des.run_tasks()
58
  print(res)
 
3
  from enum import Enum
4
  from src.deepeval.turkish_general_knowledge_task import TurkishGeneralKnowledgeTask
5
  from src.deepeval.sentiment_analysis_task import SentimentAnalysisTask
6
+ from src.deepeval.commonsense_reasoning_task import CommonsenseReasoningTask
7
  from typing import List
8
  load_dotenv()
9
 
 
13
  # SUMMARIZATION = "summarization"
14
  SENTIMENT_ANALYSIS = "sentiment_analysis_tr"
15
  TURKISH_GENERAL_KNOWLEDGE = "turkish_general_knowledge"
16
+ COMMONSENSE_REASONING = "commonsense_reasoning"
17
 
18
 
19
  class DeepEvalTaskManager:
 
53
  res = turkish_general_knowledge_task.evaluate()
54
  return res
55
 
56
+ def commonsense_reasoning(self):
57
+ commonsense_reasoning_task = CommonsenseReasoningTask(self.model_name)
58
+ res = commonsense_reasoning_task.evaluate()
59
+ return res
60
+
61
 
62
  if __name__ == "__main__":
63
+ des = DeepEvalTaskManager("google/gemma-2-2b-it", ["TURKISH_GENERAL_KNOWLEDGE","COMMONSENSE_REASONING"])
64
  res = des.run_tasks()
65
  print(res)
src/deepeval/sentiment_analysis_task.py CHANGED
@@ -7,6 +7,7 @@ class SentimentAnalysisTask(BaseTask):
7
  super().__init__("metunlp/sentiment_analysis_tr", model_name=model_name)
8
 
9
  def load_dataset_from_hf(self):
 
10
  dataset = super().load_dataset_from_hf()
11
  return dataset.select(range(min(10, len(dataset))))
12
 
 
7
  super().__init__("metunlp/sentiment_analysis_tr", model_name=model_name)
8
 
9
  def load_dataset_from_hf(self):
10
+ print("Loading the dataset")
11
  dataset = super().load_dataset_from_hf()
12
  return dataset.select(range(min(10, len(dataset))))
13
 
src/deepeval/turkish_general_knowledge_task.py CHANGED
@@ -8,7 +8,7 @@ class TurkishGeneralKnowledgeTask(BaseTask):
8
 
9
  def load_dataset_from_hf(self):
10
  dataset = super().load_dataset_from_hf()
11
- return dataset.select(range(min(10, len(dataset))))
12
 
13
  def evaluate(self):
14
  responses = []
 
8
 
9
  def load_dataset_from_hf(self):
10
  dataset = super().load_dataset_from_hf()
11
+ return dataset.select(range(min(1, len(dataset))))
12
 
13
  def evaluate(self):
14
  responses = []