from datetime import datetime
from typing import Any

from deepeval.metrics import PromptAlignmentMetric
from deepeval.test_case import LLMTestCase

from src.deepeval.base_task import BaseTask


class InstructionFollowingTask(BaseTask):
    def __init__(self, model_name: str):
        super().__init__("metunlp/instruction_following_tr", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_lmjudge_from_hf()
        return dataset

    def evaluate(self) -> dict[str, Any]:
        results = []
        total_model_time = 0
        total_judge_time = 0

        for i, row in enumerate(self.dataset):
            # Generate the model's answer for the Turkish input/instruction pair.
            start_model = datetime.now()
            input_text = row.get("input", "")
            instruction_text = row.get("instruction", "")
            prompt = (
                f"Girdi: {input_text}\n"
                f"Talimat: {instruction_text}\n"
                f"Çıktı:"
            )
            output = self.generate_response(prompt, max_new_tokens=200)
            end_model = datetime.now()
            total_model_time += (end_model - start_model).total_seconds()

            # Score the answer against the instruction with the LLM judge.
            start_judge = datetime.now()
            test_case = LLMTestCase(
                input=input_text,
                actual_output=output,
            )
            metric = PromptAlignmentMetric(
                prompt_instructions=[instruction_text],
                model="gpt-4o-mini",
                include_reason=True,
            )
            metric.measure(test_case)
            end_judge = datetime.now()
            total_judge_time += (end_judge - start_judge).total_seconds()

            results.append({
                "index": i,
                "score": metric.score,
                "reason": metric.reason,
                "score_breakdown": metric.score_breakdown,
                "input": input_text,
                "instruction": instruction_text,
                "output": output,
            })

        # Average the per-sample scores and scale to a 0-100 overall score.
        overall_score = (sum(result["score"] for result in results) / len(results)) * 100

        print(f"Total model time: {total_model_time} seconds")
        print(f"Total judge time: {total_judge_time} seconds")

        return {"results": overall_score}