Spaces:

holistic-ai
/

explainbility_benchmark

Sleeping

explainbility_benchmark / util /evaluator.py

Zekun Wu

add

727b506 5 months ago

15.7 kB

	import json

	from util.assistants import GPTAgent
	import json_repair

	class evaluator:
	    """Score chatbot explanations against five principles using an LLM.

	    The model is prompted to answer with a JSON object mapping each principle
	    to ``{"Justification": str, "Score": number}``. Both ``evaluate_single``
	    and ``evaluate_conversation`` return that mapping once it passes
	    ``validate_scores``. On any failure (undecodable JSON or an invalid
	    payload) every principle instead maps to
	    ``{"Score": -1, "Justification": <reason>}``, so callers always receive
	    the same shape.
	    """

	    # The five principles every evaluation result must contain.
	    _PRINCIPLES = ("Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism")

	    # Rubric and output-format instructions shared by both prompts.
	    # This is a plain (non-f) string, so the JSON braces are written singly.
	    _RUBRIC = """Evaluation Criteria:

	Factually Correct:
	Definition: The explanation must be accurate and relevant to the question and the subject matter.
	Score: (0-10) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.

	Useful:
	Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
	Score: (0-10) How useful is the explanation in helping the user understand the answer and make informed decisions?

	Context Specific:
	Definition: The explanation should be relevant to the specific context or scenario implied by the question.
	Score: (0-10) How well does the explanation address the specific context or scenario of the question?

	User Specific:
	Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
	Score: (0-10) How well does the explanation cater to the needs and knowledge level of the intended user?

	Provides Pluralism:
	Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
	Score: (0-10) How well does the explanation provide or support multiple perspectives?

	After evaluating the provided question and explanation based on the five principles, please format your scores and justifications in a JSON dictionary. Directly provide me with the JSON without any additional text.

	Example JSON format:
	{
	"Factually Correct": {
	"Justification": "xxx",
	"Score": 9
	},
	"Useful": {
	"Justification": "xxx",
	"Score": 8.5
	},
	"Context Specific": {
	"Justification": "xxx",
	"Score": 8
	},
	"User Specific": {
	"Justification": "xxx",
	"Score": 7.5
	},
	"Provides Pluralism": {
	"Justification": "xxx",
	"Score": 7
	}
	}

	Answer:
	"""

	    def __init__(self, model_name='GPT4-turbo'):
	        """Create the evaluator backed by ``GPTAgent(model_name)``."""
	        self.model = GPTAgent(model_name)

	    @classmethod
	    def _failure(cls, reason):
	        """Return the uniform failure result: every principle scored -1 with ``reason``."""
	        return {k: {"Score": -1, "Justification": reason} for k in cls._PRINCIPLES}

	    def validate_scores(self, scores):
	        """Check that ``scores`` is a complete, well-formed evaluation.

	        A valid result is a dict containing every principle, each mapped to a
	        dict whose ``"Score"`` is a real number in [0, 10] and whose
	        ``"Justification"`` is a non-blank string.

	        Returns ``scores`` unchanged when valid; otherwise the failure
	        mapping (all principles scored -1) naming the first problem found.
	        """
	        # json.loads may yield a list, string or number; reject those up
	        # front instead of raising on the membership test below.
	        if not isinstance(scores, dict):
	            return self._failure("Invalid input")

	        for key in self._PRINCIPLES:
	            if key not in scores:
	                return self._failure("Invalid input")

	            score_data = scores[key]
	            if not isinstance(score_data, dict):
	                return self._failure("Invalid input format")

	            value = score_data.get("Score")
	            # bool is a subclass of int, so exclude it explicitly: a JSON
	            # true/false is not a score.
	            if isinstance(value, bool) or not isinstance(value, (int, float)) or not 0 <= value <= 10:
	                return self._failure("Invalid score value")

	            justification = score_data.get("Justification")
	            if not isinstance(justification, str) or not justification.strip():
	                return self._failure("Invalid or missing justification")

	        return scores

	    def _parse_scores(self, response):
	        """Decode the model's reply into a validated score mapping.

	        Tries strict JSON first, then one ``json_repair`` pass. If the reply
	        still cannot be decoded, returns the failure mapping rather than
	        bare integers so downstream code sees a single result shape.
	        """
	        try:
	            scores = json.loads(response)
	        except json.JSONDecodeError:
	            # Attempt to repair the JSON if decoding fails
	            repaired_json = json_repair.repair_json(response, skip_json_loads=True, return_objects=False)
	            try:
	                scores = json.loads(repaired_json)
	            except json.JSONDecodeError:
	                print("Failed to decode JSON response even after repair attempt. Skipping this batch.")
	                return self._failure("Failed to decode JSON response")

	        return self.validate_scores(scores)

	    def evaluate_single(self, question,explanation):
	        """Evaluate one explanation given the user's query.

	        Args:
	            question: The user's query, inserted verbatim into the prompt.
	            explanation: The chatbot explanation to be scored.

	        Returns:
	            The validated score mapping, or the failure mapping (all
	            principles scored -1) if the reply cannot be parsed or validated.
	        """
	        evaluation_prompt = f"""You are provided with a user's query and the corresponding explanation generated by
	an Chatbot. Your task is to evaluate the explanation based on the following five principles. Each principle
	should be scored on a scale from 0 to 10, where 0 indicates that the principle is not met at all,
	and 10 indicates that the principle is fully satisfied. Additionally, provide a brief ten words explanation for each score to justify your rating.

	Query:
	{question}

	Provided Explanation:
	{explanation}

	{self._RUBRIC}"""

	        response = self.model.invoke(evaluation_prompt,temperature=0.8, max_tokens=500).strip()

	        print(response)
	        return self._parse_scores(response)

	    def format_conversation(self, conversation):
	        """Render a list of ``{"role", "content"}`` messages as ``Role: content`` lines."""
	        formatted_conversation = "\n".join(
	            f"{exchange['role'].capitalize()}: {exchange['content']}" for exchange in conversation
	        )
	        return formatted_conversation

	    def evaluate_conversation(self, conversation, context):
	        """Evaluate a whole conversation together with context about the user.

	        Args:
	            conversation: Iterable of message dicts with ``role`` and
	                ``content`` keys (see ``format_conversation``).
	            context: Text describing the user/context, inserted verbatim.

	        Returns:
	            The validated score mapping, or the failure mapping (all
	            principles scored -1) if the reply cannot be parsed or validated.
	        """
	        formatted_conversation = self.format_conversation(conversation)
	        evaluation_prompt = f"""
	You are provided with a conversation between a user and a chatbot and the context about them. Your task is to evaluate the explanation based on the following five principles. Each principle
	should be scored on a scale from 0 to 10, where 0 indicates that the principle is not met at all,
	and 10 indicates that the principle is fully satisfied. Additionally, provide a brief ten words explanation for each score to justify your rating.

	Conversation:
	{formatted_conversation}

	Context:
	{context}

	{self._RUBRIC}"""

	        print(evaluation_prompt)

	        response = self.model.invoke(evaluation_prompt, temperature=0, max_tokens=1000).strip()
	        return self._parse_scores(response)


	def write_evaluation_commentary(scores, scale=10):
	    """Turn per-principle scores into human-readable commentary rows.

	    Args:
	        scores: Mapping of principle name to ``{"Score": number,
	            "Justification": str}``. A bare number is also accepted as the
	            entry (treated as the score with an empty justification).
	        scale: Maximum possible score. Scores are divided by this before the
	            0.8 / 0.5 thresholds are applied; the default of 10 matches the
	            0-10 scale used in the evaluation prompts. Pass ``scale=1`` for
	            scores already normalised to 0-1.

	    Returns:
	        A list with one dict per principle containing ``Principle``,
	        ``Score``, ``Justification`` and ``Commentary``. Entries whose score
	        is -1 or not a number are reported as failed evaluations.

	    Raises:
	        ValueError: If ``scale`` is not positive.
	    """
	    if scale <= 0:
	        raise ValueError("scale must be positive")

	    # (high, medium, low) commentary for each known principle.
	    commentary_by_principle = {
	        "Factually Correct": (
	            "Excellent accuracy! The information is precise and directly relevant to the question.",
	            "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant.",
	            "The explanation contains significant inaccuracies or irrelevant information.",
	        ),
	        "Useful": (
	            "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making.",
	            "Somewhat useful, though it could be more insightful or practical in aiding understanding.",
	            "The explanation does little to help understand or apply the information provided.",
	        ),
	        "Context Specific": (
	            "Perfectly tailored to the context of the question, addressing the specific scenario effectively.",
	            "Generally addresses the context, but may miss specific details or nuances relevant to the question.",
	            "Fails to address the context of the question, lacking relevance or specificity.",
	        ),
	        "User Specific": (
	            "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness.",
	            "Moderately considerate of the user's knowledge level, but could be more tailored.",
	            "Does not consider the user's background or interests, potentially leading to confusion or disinterest.",
	        ),
	        "Provides Pluralism": (
	            "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding.",
	            "Offers some alternative perspectives, but more could be provided to enrich understanding.",
	            "Lacks diversity in viewpoints, limiting the depth of exploration into the topic.",
	        ),
	    }

	    evaluation_details = []

	    for principle, details in scores.items():
	        if isinstance(details, dict):
	            score = details.get('Score', -1)
	            justification = details.get('Justification', '')
	        else:
	            # Tolerate a bare number (e.g. -1) in place of the detail dict.
	            score, justification = details, ''

	        # Anything that is not a real number cannot be compared against the
	        # thresholds, so it is reported as a failed evaluation.
	        if isinstance(score, bool) or not isinstance(score, (int, float)):
	            score = -1

	        if score == -1:
	            evaluation_details.append(
	                {'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.',
	                 'Justification': justification})
	            continue

	        tiers = commentary_by_principle.get(principle)
	        if tiers is None:
	            # Unknown principle: avoid reusing a stale or undefined comment.
	            comment = "No commentary available for this principle."
	        else:
	            ratio = score / scale
	            if ratio >= 0.8:
	                comment = tiers[0]
	            elif ratio >= 0.5:
	                comment = tiers[1]
	            else:
	                comment = tiers[2]

	        evaluation_details.append(
	            {'Principle': principle, 'Score': score, 'Justification': justification,'Commentary': comment})

	    return evaluation_details
	# def write_evaluation_commentary(scores):
	# evaluation_details = []
	# for principle, score in scores.items():
	#
	# if score == -1:
	# evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.'})
	# continue
	#
	# if principle == "Factually Correct":
	# if score >= 0.8:
	# comment = "Excellent accuracy! The information is precise and directly relevant to the question."
	# elif score >= 0.5:
	# comment = "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant."
	# else:
	# comment = "The explanation contains significant inaccuracies or irrelevant information."
	# elif principle == "Useful":
	# if score >= 0.8:
	# comment = "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making."
	# elif score >= 0.5:
	# comment = "Somewhat useful, though it could be more insightful or practical in aiding understanding."
	# else:
	# comment = "The explanation does little to help understand or apply the information provided."
	# elif principle == "Context Specific":
	# if score >= 0.8:
	# comment = "Perfectly tailored to the context of the question, addressing the specific scenario effectively."
	# elif score >= 0.5:
	# comment = "Generally addresses the context, but may miss specific details or nuances relevant to the question."
	# else:
	# comment = "Fails to address the context of the question, lacking relevance or specificity."
	# elif principle == "User Specific":
	# if score >= 0.8:
	# comment = "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness."
	# elif score >= 0.5:
	# comment = "Moderately considerate of the user's knowledge level, but could be more tailored."
	# else:
	# comment = "Does not consider the user's background or interests, potentially leading to confusion or disinterest."
	# elif principle == "Provides Pluralism":
	# if score >= 0.8:
	# comment = "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding."
	# elif score >= 0.5:
	# comment = "Offers some alternative perspectives, but more could be provided to enrich understanding."
	# else:
	# comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
	#
	# evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
	# return evaluation_details

	if __name__ == '__main__':
	    # Manual smoke test: score a short sample conversation and print both
	    # the raw scores and the commentary derived from them.
	    # The instance is named `scorer` so the builtin `eval` is not shadowed.
	    scorer = evaluator()
	    conversation = [
	        {"role": "system", "content": "You are a helpful assistant."},
	        {"role": "user", "content": "Who won the world series in 2020?"},
	        {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
	        {"role": "user", "content": "Where was it played?"}
	    ]
	    context = "general user, user_background is sports enthusiast"
	    results = scorer.evaluate_conversation(conversation, context)
	    print(results)
	    print(write_evaluation_commentary(results))