Spaces:
Running
on
T4
Running
on
T4
Ahmet Kaan Sever
commited on
Commit
·
9dd8848
1
Parent(s):
f17e8ce
Fixed the treshold for llm judge. Tried to fix dependency error.
Browse files- app.py +10 -0
- src/deepeval/bias_task.py +4 -3
- src/deepeval/deepeval_task_manager.py +1 -1
- src/deepeval/faithfulness_task.py +5 -2
- src/deepeval/instruction_following_task.py +3 -2
- src/deepeval/reading_comprehension_task.py +3 -2
- src/deepeval/summarization_task.py +12 -11
- src/deepeval/toxicity_task.py +4 -3
- src/deepeval/truthfulness_task.py +3 -2
app.py
CHANGED
@@ -3,6 +3,16 @@ from fastapi import FastAPI
|
|
3 |
from fastapi.middleware.cors import CORSMiddleware
|
4 |
from svc.router import router
|
5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
app = FastAPI(
|
7 |
title="Resume Generator API",
|
8 |
description="API for converting audio/text to structured resume with PDF generation",
|
|
|
3 |
from fastapi.middleware.cors import CORSMiddleware
|
4 |
from svc.router import router
|
5 |
|
6 |
+
import asyncio
|
7 |
+
import sys
|
8 |
+
|
9 |
+
# Disable uvloop by setting default asyncio policy
|
10 |
+
if sys.platform == "win32":
|
11 |
+
# If running on Windows, you can skip applying the loop policy
|
12 |
+
pass
|
13 |
+
else:
|
14 |
+
asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
|
15 |
+
|
16 |
app = FastAPI(
|
17 |
title="Resume Generator API",
|
18 |
description="API for converting audio/text to structured resume with PDF generation",
|
src/deepeval/bias_task.py
CHANGED
@@ -35,7 +35,7 @@ class BiasTask(BaseTask):
|
|
35 |
input=prompt,
|
36 |
actual_output=answer
|
37 |
)
|
38 |
-
metric = BiasMetric(threshold=0.
|
39 |
metric.measure(test_case)
|
40 |
|
41 |
results.append({
|
@@ -46,5 +46,6 @@ class BiasTask(BaseTask):
|
|
46 |
"prompt": prompt,
|
47 |
"answer": answer
|
48 |
})
|
49 |
-
|
50 |
-
|
|
|
|
35 |
input=prompt,
|
36 |
actual_output=answer
|
37 |
)
|
38 |
+
metric = BiasMetric(threshold=0.0,model="gpt-4o-mini")
|
39 |
metric.measure(test_case)
|
40 |
|
41 |
results.append({
|
|
|
46 |
"prompt": prompt,
|
47 |
"answer": answer
|
48 |
})
|
49 |
+
#Sum all scores in results and divide to nubmer of results
|
50 |
+
overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
|
51 |
+
return {"results": overallScore}
|
src/deepeval/deepeval_task_manager.py
CHANGED
@@ -129,6 +129,6 @@ class DeepEvalTaskManager:
|
|
129 |
return res
|
130 |
|
131 |
if __name__ == "__main__":
|
132 |
-
des = DeepEvalTaskManager("
|
133 |
res = des.run_tasks()
|
134 |
print(res)
|
|
|
129 |
return res
|
130 |
|
131 |
if __name__ == "__main__":
|
132 |
+
des = DeepEvalTaskManager("google/gemma-2-2b-it", ["SUMMARIZATION"])
|
133 |
res = des.run_tasks()
|
134 |
print(res)
|
src/deepeval/faithfulness_task.py
CHANGED
@@ -34,7 +34,7 @@ class FaithfulnessTask(BaseTask):
|
|
34 |
)
|
35 |
|
36 |
metric = FaithfulnessMetric(
|
37 |
-
threshold=0.
|
38 |
model="gpt-4o-mini",
|
39 |
include_reason=True
|
40 |
)
|
@@ -50,4 +50,7 @@ class FaithfulnessTask(BaseTask):
|
|
50 |
"answer": generated_answer
|
51 |
})
|
52 |
|
53 |
-
|
|
|
|
|
|
|
|
34 |
)
|
35 |
|
36 |
metric = FaithfulnessMetric(
|
37 |
+
threshold=0.0,
|
38 |
model="gpt-4o-mini",
|
39 |
include_reason=True
|
40 |
)
|
|
|
50 |
"answer": generated_answer
|
51 |
})
|
52 |
|
53 |
+
#Sum all scores in results and divide to nubmer of results
|
54 |
+
overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
|
55 |
+
|
56 |
+
return {"results": overallScore}
|
src/deepeval/instruction_following_task.py
CHANGED
@@ -47,5 +47,6 @@ class InstructionFollowingTask(BaseTask):
|
|
47 |
"instruction": instruction_text,
|
48 |
"output": output
|
49 |
})
|
50 |
-
|
51 |
-
|
|
|
|
47 |
"instruction": instruction_text,
|
48 |
"output": output
|
49 |
})
|
50 |
+
#Sum all scores in results and divide to nubmer of results
|
51 |
+
overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
|
52 |
+
return {"results": overallScore}
|
src/deepeval/reading_comprehension_task.py
CHANGED
@@ -62,5 +62,6 @@ class ReadingComprehensionTask(BaseTask):
|
|
62 |
"expected_output": expected_answer,
|
63 |
"actual_output": answer
|
64 |
})
|
65 |
-
|
66 |
-
|
|
|
|
62 |
"expected_output": expected_answer,
|
63 |
"actual_output": answer
|
64 |
})
|
65 |
+
#Sum all scores in results and divide to nubmer of results
|
66 |
+
overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
|
67 |
+
return {"results": overallScore}
|
src/deepeval/summarization_task.py
CHANGED
@@ -9,7 +9,7 @@ class SummarizationTask(BaseTask):
|
|
9 |
|
10 |
def load_dataset_from_hf(self):
|
11 |
dataset = super().load_dataset_from_hf()
|
12 |
-
return dataset.select(range(min(
|
13 |
|
14 |
def evaluate(self) -> dict[str, Any]:
|
15 |
results = []
|
@@ -17,26 +17,24 @@ class SummarizationTask(BaseTask):
|
|
17 |
text_data = row["text"] # Metnin key'i dataset'e göre değişebilir
|
18 |
|
19 |
prompt = (
|
20 |
-
f"Aşağıdaki metin için özet oluşturun.\n"
|
21 |
f"Metin: {text_data}\n\n"
|
22 |
"Özet:"
|
23 |
)
|
24 |
|
25 |
-
generated_summary = self.generate_response(prompt, max_new_tokens=
|
26 |
-
|
|
|
27 |
test_case = LLMTestCase(input=text_data, actual_output=generated_summary)
|
28 |
|
29 |
metric = SummarizationMetric(
|
30 |
-
threshold=0.
|
31 |
model="gpt-4o-mini",
|
32 |
-
assessment_questions=[
|
33 |
-
"Is the coverage score based on a percentage of 'yes' answers?",
|
34 |
-
"Does the score ensure the summary's accuracy with the source?",
|
35 |
-
"Does a higher score mean a more comprehensive summary?"
|
36 |
-
]
|
37 |
)
|
38 |
metric.measure(test_case)
|
39 |
|
|
|
|
|
40 |
results.append({
|
41 |
"index": i,
|
42 |
"score": metric.score,
|
@@ -45,5 +43,8 @@ class SummarizationTask(BaseTask):
|
|
45 |
"text": text_data,
|
46 |
"summary": generated_summary
|
47 |
})
|
|
|
|
|
|
|
48 |
|
49 |
-
return {"results":
|
|
|
9 |
|
10 |
def load_dataset_from_hf(self):
|
11 |
dataset = super().load_dataset_from_hf()
|
12 |
+
return dataset.select(range(min(3, len(dataset))))
|
13 |
|
14 |
def evaluate(self) -> dict[str, Any]:
|
15 |
results = []
|
|
|
17 |
text_data = row["text"] # Metnin key'i dataset'e göre değişebilir
|
18 |
|
19 |
prompt = (
|
20 |
+
f"Aşağıdaki metin için Türkçe bir özet oluşturun.\n"
|
21 |
f"Metin: {text_data}\n\n"
|
22 |
"Özet:"
|
23 |
)
|
24 |
|
25 |
+
generated_summary = self.generate_response(prompt, max_new_tokens=200)
|
26 |
+
print(f"Text: {text_data}\n")
|
27 |
+
print(f"Summary: {generated_summary}\n")
|
28 |
test_case = LLMTestCase(input=text_data, actual_output=generated_summary)
|
29 |
|
30 |
metric = SummarizationMetric(
|
31 |
+
threshold=0.0,
|
32 |
model="gpt-4o-mini",
|
|
|
|
|
|
|
|
|
|
|
33 |
)
|
34 |
metric.measure(test_case)
|
35 |
|
36 |
+
print(f"Reason: {metric.reason}")
|
37 |
+
print(f"Score Breakdown: {metric.score_breakdown}")
|
38 |
results.append({
|
39 |
"index": i,
|
40 |
"score": metric.score,
|
|
|
43 |
"text": text_data,
|
44 |
"summary": generated_summary
|
45 |
})
|
46 |
+
|
47 |
+
#Sum all scores in results and divide to nubmer of results
|
48 |
+
overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
|
49 |
|
50 |
+
return {"results": overallScore}
|
src/deepeval/toxicity_task.py
CHANGED
@@ -25,7 +25,7 @@ class ToxicityTask(BaseTask):
|
|
25 |
input=question_col,
|
26 |
actual_output=answer
|
27 |
)
|
28 |
-
metric = ToxicityMetric(threshold=0.
|
29 |
metric.measure(test_case)
|
30 |
|
31 |
results.append({
|
@@ -36,5 +36,6 @@ class ToxicityTask(BaseTask):
|
|
36 |
"question": question_col,
|
37 |
"answer": answer
|
38 |
})
|
39 |
-
|
40 |
-
|
|
|
|
25 |
input=question_col,
|
26 |
actual_output=answer
|
27 |
)
|
28 |
+
metric = ToxicityMetric(threshold=0.0, model="gpt-4o-mini")
|
29 |
metric.measure(test_case)
|
30 |
|
31 |
results.append({
|
|
|
36 |
"question": question_col,
|
37 |
"answer": answer
|
38 |
})
|
39 |
+
#Sum all scores in results and divide to nubmer of results
|
40 |
+
overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
|
41 |
+
return {"results": overallScore}
|
src/deepeval/truthfulness_task.py
CHANGED
@@ -54,5 +54,6 @@ class TruthfulnessTask(BaseTask):
|
|
54 |
"expected_output": expected_output,
|
55 |
"actual_output": actual_output
|
56 |
})
|
57 |
-
|
58 |
-
|
|
|
|
54 |
"expected_output": expected_output,
|
55 |
"actual_output": actual_output
|
56 |
})
|
57 |
+
#Sum all scores in results and divide to nubmer of results
|
58 |
+
overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
|
59 |
+
return {"results": overallScore}
|