Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
pminervini
committed on
Commit
·
b3fd791
1
Parent(s):
7f12787
cleanup
Browse files
halueval-cli.py
CHANGED
@@ -37,12 +37,12 @@ def main():
|
|
37 |
|
38 |
task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
|
39 |
|
40 |
-
|
|
|
|
|
|
|
|
|
41 |
|
42 |
-
results = evaluator.simple_evaluate(model="hf-auto", model_args=eval_request.get_model_args(), tasks=task_names, num_fewshot=0,
|
43 |
-
batch_size=4, device=DEVICE, use_cache=None, limit=8, write_out=True)
|
44 |
-
|
45 |
-
print('AAA', results)
|
46 |
|
47 |
if __name__ == "__main__":
|
48 |
main()
|
|
|
37 |
|
38 |
task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
|
39 |
|
40 |
+
for task_name in task_names:
|
41 |
+
print(f"Selected Tasks: [{task_name}]")
|
42 |
+
results = evaluator.simple_evaluate(model="hf-auto", model_args=eval_request.get_model_args(), tasks=[task_name], num_fewshot=0,
|
43 |
+
batch_size=1, device=DEVICE, use_cache=None, limit=8, write_out=True)
|
44 |
+
print('AAA', results)
|
45 |
|
|
|
|
|
|
|
|
|
46 |
|
47 |
if __name__ == "__main__":
|
48 |
main()
|
src/backend/tasks/halueval/halueval_dialogue.yaml
CHANGED
@@ -5,7 +5,7 @@ output_type: generate_until
|
|
5 |
training_split: data
|
6 |
validation_split: data
|
7 |
doc_to_text: !function utils.doc_to_text_dialogue
|
8 |
-
doc_to_target: !function utils.
|
9 |
process_results: !function utils.process_results_qa
|
10 |
fewshot_delimiter: "\n"
|
11 |
generation_kwargs:
|
|
|
5 |
training_split: data
|
6 |
validation_split: data
|
7 |
doc_to_text: !function utils.doc_to_text_dialogue
|
8 |
+
doc_to_target: !function utils.doc_to_target_qa
|
9 |
process_results: !function utils.process_results_qa
|
10 |
fewshot_delimiter: "\n"
|
11 |
generation_kwargs:
|
src/backend/tasks/halueval/halueval_summarization.yaml
CHANGED
@@ -5,7 +5,7 @@ output_type: generate_until
|
|
5 |
training_split: data
|
6 |
validation_split: data
|
7 |
doc_to_text: !function utils.doc_to_text_summarization
|
8 |
-
doc_to_target: !function utils.
|
9 |
process_results: !function utils.process_results_qa
|
10 |
fewshot_delimiter: "\n"
|
11 |
generation_kwargs:
|
|
|
5 |
training_split: data
|
6 |
validation_split: data
|
7 |
doc_to_text: !function utils.doc_to_text_summarization
|
8 |
+
doc_to_target: !function utils.doc_to_target_qa
|
9 |
process_results: !function utils.process_results_qa
|
10 |
fewshot_delimiter: "\n"
|
11 |
generation_kwargs:
|
src/backend/tasks/halueval/utils.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
-
|
2 |
-
import transformers.data.metrics.squad_metrics as squad_metrics
|
3 |
|
4 |
QA_INSTURCTIONS = """I want you act as an answer judge. Given a question and an answer, your objective is to determine if the provided answer contains non-factual or hallucinated information. You SHOULD give your judgement based on the following hallucination types and the world knowledge.
|
5 |
|
@@ -83,21 +82,18 @@ You should try your best to determine if the summary contains non-factual or hal
|
|
83 |
|
84 |
|
85 |
def doc_to_text_qa(doc: dict[str, str]) -> str:
|
86 |
-
# print('XXX doc_to_text_qa')
|
87 |
# prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
|
88 |
doc_text = QA_INSTURCTIONS + "\n\n#Question#: " + doc["question"] + "\n#Answer#: " + doc["answer"] + "\n#Your Judgement#:"
|
89 |
return doc_text
|
90 |
|
91 |
|
92 |
def doc_to_text_dialogue(doc: dict[str, str]) -> str:
|
93 |
-
# print('XXX doc_to_text_dialogue')
|
94 |
# prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
|
95 |
doc_text = DIALOGUE_INSTRUCTIONS + "\n\n#Dialogue History#: " + doc["dialogue_history"] + "\n#Response#: " + doc["response"] + "\n#Your Judgement#:"
|
96 |
return doc_text
|
97 |
|
98 |
|
99 |
def doc_to_text_summarization(doc: dict[str, str]) -> str:
|
100 |
-
# print('XXX doc_to_text_dialogue')
|
101 |
# prompt1 = instruction + "\n\n#Document#: " + document
|
102 |
# prompt2 = "\n#Summary#: " + summary + "\n#Your Judgement#:"
|
103 |
doc_text_1 = SUMMARIZATION_INSTRUCTIONS + "\n\n#Document#: " + doc["document"]
|
@@ -106,15 +102,7 @@ def doc_to_text_summarization(doc: dict[str, str]) -> str:
|
|
106 |
return doc_text
|
107 |
|
108 |
|
109 |
-
def doc_to_text_summarization(doc: dict[str, str]) -> str:
|
110 |
-
# print('XXX doc_to_text_dialogue')
|
111 |
-
# prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
|
112 |
-
doc_text = DIALOGUE_INSTRUCTIONS + "\n\n#Dialogue History#: " + doc["question"] + "\n#Answer#: " + doc["answer"] + "\n#Your Judgement#:"
|
113 |
-
return doc_text
|
114 |
-
|
115 |
-
|
116 |
def doc_to_target_qa(doc: dict[str, str]) -> str:
|
117 |
-
# print('XXX doc_to_target_qa')
|
118 |
return doc['hallucination']
|
119 |
|
120 |
|
|
|
1 |
+
# Main reference: https://github.com/RUCAIBox/HaluEval/blob/main/evaluation/evaluate.py
|
|
|
2 |
|
3 |
QA_INSTURCTIONS = """I want you act as an answer judge. Given a question and an answer, your objective is to determine if the provided answer contains non-factual or hallucinated information. You SHOULD give your judgement based on the following hallucination types and the world knowledge.
|
4 |
|
|
|
82 |
|
83 |
|
84 |
def doc_to_text_qa(doc: dict[str, str]) -> str:
|
|
|
85 |
# prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
|
86 |
doc_text = QA_INSTURCTIONS + "\n\n#Question#: " + doc["question"] + "\n#Answer#: " + doc["answer"] + "\n#Your Judgement#:"
|
87 |
return doc_text
|
88 |
|
89 |
|
90 |
def doc_to_text_dialogue(doc: dict[str, str]) -> str:
|
|
|
91 |
# prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
|
92 |
doc_text = DIALOGUE_INSTRUCTIONS + "\n\n#Dialogue History#: " + doc["dialogue_history"] + "\n#Response#: " + doc["response"] + "\n#Your Judgement#:"
|
93 |
return doc_text
|
94 |
|
95 |
|
96 |
def doc_to_text_summarization(doc: dict[str, str]) -> str:
|
|
|
97 |
# prompt1 = instruction + "\n\n#Document#: " + document
|
98 |
# prompt2 = "\n#Summary#: " + summary + "\n#Your Judgement#:"
|
99 |
doc_text_1 = SUMMARIZATION_INSTRUCTIONS + "\n\n#Document#: " + doc["document"]
|
|
|
102 |
return doc_text
|
103 |
|
104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
def doc_to_target_qa(doc: dict[str, str]) -> str:
|
|
|
106 |
return doc['hallucination']
|
107 |
|
108 |
|