pminervini committed on
Commit
b3fd791
·
1 Parent(s): 7f12787
halueval-cli.py CHANGED
@@ -37,12 +37,12 @@ def main():
37
 
38
  task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
39
 
40
- print(f"Selected Tasks: {task_names}")
 
 
 
 
41
 
42
- results = evaluator.simple_evaluate(model="hf-auto", model_args=eval_request.get_model_args(), tasks=task_names, num_fewshot=0,
43
- batch_size=4, device=DEVICE, use_cache=None, limit=8, write_out=True)
44
-
45
- print('AAA', results)
46
 
47
  if __name__ == "__main__":
48
  main()
 
37
 
38
  task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
39
 
40
+ for task_name in task_names:
41
+ print(f"Selected Tasks: [{task_name}]")
42
+ results = evaluator.simple_evaluate(model="hf-auto", model_args=eval_request.get_model_args(), tasks=[task_name], num_fewshot=0,
43
+ batch_size=1, device=DEVICE, use_cache=None, limit=8, write_out=True)
44
+ print('AAA', results)
45
 
 
 
 
 
46
 
47
  if __name__ == "__main__":
48
  main()
src/backend/tasks/halueval/halueval_dialogue.yaml CHANGED
@@ -5,7 +5,7 @@ output_type: generate_until
5
  training_split: data
6
  validation_split: data
7
  doc_to_text: !function utils.doc_to_text_dialogue
8
- doc_to_target: !function utils.doc_to_target_dialogue
9
  process_results: !function utils.process_results_qa
10
  fewshot_delimiter: "\n"
11
  generation_kwargs:
 
5
  training_split: data
6
  validation_split: data
7
  doc_to_text: !function utils.doc_to_text_dialogue
8
+ doc_to_target: !function utils.doc_to_target_qa
9
  process_results: !function utils.process_results_qa
10
  fewshot_delimiter: "\n"
11
  generation_kwargs:
src/backend/tasks/halueval/halueval_summarization.yaml CHANGED
@@ -5,7 +5,7 @@ output_type: generate_until
5
  training_split: data
6
  validation_split: data
7
  doc_to_text: !function utils.doc_to_text_summarization
8
- doc_to_target: !function utils.doc_to_target_summarization
9
  process_results: !function utils.process_results_qa
10
  fewshot_delimiter: "\n"
11
  generation_kwargs:
 
5
  training_split: data
6
  validation_split: data
7
  doc_to_text: !function utils.doc_to_text_summarization
8
+ doc_to_target: !function utils.doc_to_target_qa
9
  process_results: !function utils.process_results_qa
10
  fewshot_delimiter: "\n"
11
  generation_kwargs:
src/backend/tasks/halueval/utils.py CHANGED
@@ -1,5 +1,4 @@
1
- from itertools import zip_longest
2
- import transformers.data.metrics.squad_metrics as squad_metrics
3
 
4
  QA_INSTURCTIONS = """I want you act as an answer judge. Given a question and an answer, your objective is to determine if the provided answer contains non-factual or hallucinated information. You SHOULD give your judgement based on the following hallucination types and the world knowledge.
5
 
@@ -83,21 +82,18 @@ You should try your best to determine if the summary contains non-factual or hal
83
 
84
 
85
  def doc_to_text_qa(doc: dict[str, str]) -> str:
86
- # print('XXX doc_to_text_qa')
87
  # prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
88
  doc_text = QA_INSTURCTIONS + "\n\n#Question#: " + doc["question"] + "\n#Answer#: " + doc["answer"] + "\n#Your Judgement#:"
89
  return doc_text
90
 
91
 
92
  def doc_to_text_dialogue(doc: dict[str, str]) -> str:
93
- # print('XXX doc_to_text_dialogue')
94
  # prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
95
  doc_text = DIALOGUE_INSTRUCTIONS + "\n\n#Dialogue History#: " + doc["dialogue_history"] + "\n#Response#: " + doc["response"] + "\n#Your Judgement#:"
96
  return doc_text
97
 
98
 
99
  def doc_to_text_summarization(doc: dict[str, str]) -> str:
100
- # print('XXX doc_to_text_dialogue')
101
  # prompt1 = instruction + "\n\n#Document#: " + document
102
  # prompt2 = "\n#Summary#: " + summary + "\n#Your Judgement#:"
103
  doc_text_1 = SUMMARIZATION_INSTRUCTIONS + "\n\n#Document#: " + doc["document"]
@@ -106,15 +102,7 @@ def doc_to_text_summarization(doc: dict[str, str]) -> str:
106
  return doc_text
107
 
108
 
109
- def doc_to_text_summarization(doc: dict[str, str]) -> str:
110
- # print('XXX doc_to_text_dialogue')
111
- # prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
112
- doc_text = DIALOGUE_INSTRUCTIONS + "\n\n#Dialogue History#: " + doc["question"] + "\n#Answer#: " + doc["answer"] + "\n#Your Judgement#:"
113
- return doc_text
114
-
115
-
116
  def doc_to_target_qa(doc: dict[str, str]) -> str:
117
- # print('XXX doc_to_target_qa')
118
  return doc['hallucination']
119
 
120
 
 
1
+ # Main reference: https://github.com/RUCAIBox/HaluEval/blob/main/evaluation/evaluate.py
 
2
 
3
  QA_INSTURCTIONS = """I want you act as an answer judge. Given a question and an answer, your objective is to determine if the provided answer contains non-factual or hallucinated information. You SHOULD give your judgement based on the following hallucination types and the world knowledge.
4
 
 
82
 
83
 
84
  def doc_to_text_qa(doc: dict[str, str]) -> str:
 
85
  # prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
86
  doc_text = QA_INSTURCTIONS + "\n\n#Question#: " + doc["question"] + "\n#Answer#: " + doc["answer"] + "\n#Your Judgement#:"
87
  return doc_text
88
 
89
 
90
  def doc_to_text_dialogue(doc: dict[str, str]) -> str:
 
91
  # prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
92
  doc_text = DIALOGUE_INSTRUCTIONS + "\n\n#Dialogue History#: " + doc["dialogue_history"] + "\n#Response#: " + doc["response"] + "\n#Your Judgement#:"
93
  return doc_text
94
 
95
 
96
  def doc_to_text_summarization(doc: dict[str, str]) -> str:
 
97
  # prompt1 = instruction + "\n\n#Document#: " + document
98
  # prompt2 = "\n#Summary#: " + summary + "\n#Your Judgement#:"
99
  doc_text_1 = SUMMARIZATION_INSTRUCTIONS + "\n\n#Document#: " + doc["document"]
 
102
  return doc_text
103
 
104
 
 
 
 
 
 
 
 
105
  def doc_to_target_qa(doc: dict[str, str]) -> str:
 
106
  return doc['hallucination']
107
 
108