###########################################################################################################
# Adapted from https://github.com/TheAgentCompany/TheAgentCompany/blob/main/evaluation/summarise_results.py
###########################################################################################################
import glob
import json
import os
import re
import sys


def calculate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
    """
    Calculate the cost of the model call.
    """
    if 'claude-3-5-sonnet' in model.lower():
        # https://www.anthropic.com/pricing#anthropic-api, accessed 12/11/2024
        return 0.000003 * prompt_tokens + 0.000015 * completion_tokens
    elif 'gpt-4o' in model.lower():
        # https://openai.com/api/pricing/, accessed 12/11/2024
        return 0.0000025 * prompt_tokens + 0.00001 * completion_tokens
    elif 'gemini-1.5-pro' in model.lower():
        # https://ai.google.dev/pricing#1_5pro, accessed 12/11/2024
        # assuming prompts up to 128k tokens
        cost = 0.00000125 * prompt_tokens + 0.000005 * completion_tokens
        if prompt_tokens > 128000:
            cost *= 2
        return cost
    elif 'gemini-2.0-flash-exp' in model.lower():
        # price unknown for gemini-2.0-flash-exp, assuming same price as gemini-1.5-flash
        cost = 0.000000075 * prompt_tokens + 0.0000003 * completion_tokens
        if prompt_tokens > 128000:
            cost *= 2
        return cost
    elif 'qwen2-72b' in model.lower():
        # assuming hosted on Together
        # https://www.together.ai/pricing, accessed 12/11/2024
        return 0.0000009 * (prompt_tokens + completion_tokens)
    elif 'qwen2p5-72b' in model.lower():
        # assuming hosted on Together
        # https://www.together.ai/pricing, accessed 12/14/2024
        return 0.0000012 * (prompt_tokens + completion_tokens)
    elif 'llama-v3p1-405b-instruct' in model.lower():
        # assuming hosted on Fireworks AI
        # https://fireworks.ai/pricing, accessed 12/11/2024
        return 0.000003 * (prompt_tokens + completion_tokens)
    elif 'llama-v3p1-70b-instruct' in model.lower():
        # assuming hosted on Fireworks AI
        return 0.0000009 * (prompt_tokens + completion_tokens)
    elif 'llama-v3p3-70b-instruct' in model.lower():
        # assuming hosted on Fireworks AI
        return 0.0000009 * (prompt_tokens + completion_tokens)
    elif 'amazon.nova-pro-v1:0' in model.lower():
        # assuming hosted on Amazon Bedrock
        # https://aws.amazon.com/bedrock/pricing/, accessed 12/11/2024
        return 0.0000008 * prompt_tokens + 0.0000032 * completion_tokens
    else:
        raise ValueError(f'Unknown model: {model}')
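
# Worked example (illustrative token counts, not taken from any real run): for a
# model string containing 'gpt-4o', 1,000 prompt tokens and 500 completion tokens
# would be priced at 0.0000025 * 1000 + 0.00001 * 500 = 0.0025 + 0.005, i.e.
# calculate_cost('gpt-4o', 1000, 500) returns roughly 0.0075 USD.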


def analyze_eval_json_file(filepath: str) -> tuple[int, int]:
    """
    Analyze a single eval JSON file and extract the total and result from final_score.

    Args:
        filepath: Path to the JSON file

    Returns:
        Tuple containing (total, result) from final_score
    """
    try:
        with open(filepath, 'r') as f:
            data = json.load(f)
            final_score = data.get('final_score', {})
            return (final_score.get('total', 0), final_score.get('result', 0))
    except json.JSONDecodeError as e:
        print(f'Error decoding JSON in {filepath}: {e}')
        return (0, 0)
    except Exception as e:
        print(f'Error processing {filepath}: {e}')
        return (0, 0)
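
# Shape of an eval file as read above (only these fields are used; anything else
# in the file is ignored):
#   {"final_score": {"total": <total possible points>, "result": <points achieved>}}
# Missing keys default to 0, and unreadable files are reported as (0, 0).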


def analyze_traj_json_file(filepath: str) -> tuple[int, float]:
    """
    Analyze a single trajectory JSON file and extract the steps and tokens
    for each step. Then estimate the cost based on the tokens and the model type.

    Note: this assumes there is no prompt caching at all.
    """
    steps: int = 0
    cost: float = 0.0
    with open(filepath, 'r') as f:
        data = json.load(f)
        response_id = None
        for action in data:
            if 'tool_call_metadata' in action:
                if action['tool_call_metadata']['model_response']['id'] != response_id:
                    response_id = action['tool_call_metadata']['model_response']['id']
                else:
                    # OpenHands repeats the same model-response metadata when a single
                    # LLM call leads to multiple actions and observations, so count
                    # each response id only once.
                    continue
                steps += 1
                usage = action['tool_call_metadata']['model_response']['usage']
                model: str = action['tool_call_metadata']['model_response']['model']
                prompt_tokens = usage['prompt_tokens']
                completion_tokens = usage['completion_tokens']
                cost += calculate_cost(model, prompt_tokens, completion_tokens)
    return (steps, cost)
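
# Shape of a trajectory file as read above: a JSON list of actions, where the
# relevant actions look roughly like (other keys are ignored)
#   {"tool_call_metadata": {"model_response": {"id": ..., "model": ...,
#       "usage": {"prompt_tokens": ..., "completion_tokens": ...}}}}
# Consecutive actions sharing the same model_response id count as one step and
# are billed once.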


def analyze_folder(
    folder_path: str,
) -> tuple[dict[str, tuple[int, int]], dict[str, tuple[int, float]]]:
    """
    Analyze all eval_*.json & traj_*.json files in the specified folder.

    Args:
        folder_path: Path to the folder containing JSON files

    Returns:
        Two dictionaries:
            - eval_results: task name as key, (total, result) tuple as value
            - traj_results: task name as key, (steps, cost) tuple as value
    """
    eval_results = {}
    traj_results = {}

    eval_pattern = os.path.join(folder_path, 'eval_*.json')
    traj_pattern = os.path.join(folder_path, 'traj_*.json')

    for filepath in glob.glob(eval_pattern):
        filename = os.path.basename(filepath)
        total, result = analyze_eval_json_file(filepath)
        key = re.search(r'eval_(.+)\.json', filename).group(1)
        eval_results[key] = (total, result)

    for filepath in glob.glob(traj_pattern):
        filename = os.path.basename(filepath)
        steps, cost = analyze_traj_json_file(filepath)
        key = re.search(r'traj_(.+)\.json', filename).group(1)
        traj_results[key] = (steps, cost)

    return eval_results, traj_results
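
# Expected folder layout: one eval_<task_name>.json and one traj_<task_name>.json
# per task, e.g. (hypothetical names) eval_sde-example.json and traj_sde-example.json.
# Both dictionaries are keyed by <task_name>, which is how main() joins them.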


def get_task_nature_category(task_name: str) -> str:
    """
    Get the nature category of the task.
    """
    task_nature = task_name.split('-')[0]
    if task_nature.lower() in ['sde', 'pm', 'ds', 'admin', 'hr', 'finance']:
        return task_nature
    else:
        return 'other'
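
# The category is simply the prefix before the first '-': a (hypothetical) task name
# like 'sde-fix-bug' maps to 'sde', and any unrecognised prefix maps to 'other'.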


def calculate_score(total: int, result: int) -> float:
    """
    Calculate the score as a number between 0 and 1.

    Formula: score = (result / total) * 0.5 + (result // total) * 0.5

    Explanation:
    - (result / total) * 0.5: the completion ratio, scaled down to a 0-0.5 range.
    - (result // total) * 0.5: a binary bonus indicating whether the task was fully completed.

    Args:
        total: Total possible points
        result: Actual points achieved

    Returns:
        Score as a number between 0 and 1
    """
    if total == 0:
        # Unreadable or malformed eval files come through as (0, 0); score them as 0
        # rather than dividing by zero.
        return 0.0
    return (result / total * 0.5) + (result // total * 0.5)
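
# Worked examples: total=4, result=3 gives 3/4 * 0.5 + 3//4 * 0.5 = 0.375;
# total=4, result=4 gives 0.5 + 0.5 = 1.0 (the second term is the full-completion
# bonus); total=4, result=0 gives 0.0.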


def is_perfect_completion(total: int, result: int) -> bool:
    """
    Check if the task achieved perfect completion.

    Args:
        total: Total possible points
        result: Actual points achieved

    Returns:
        True if result equals total, False otherwise
    """
    return total > 0 and total == result


def main():
    if len(sys.argv) != 2:
        print('Usage: poetry run python summarise_results.py <folder_path>')
        sys.exit(1)

    folder_path = sys.argv[1]
    if not os.path.isdir(folder_path):
        print(f"Error: '{folder_path}' is not a valid directory")
        sys.exit(1)

    eval_results, traj_results = analyze_folder(folder_path)
    if not eval_results:
        print(f'No eval_*.json files found in {folder_path}')
        return

    # Create list of results with completion ratios for sorting
    detailed_results = [
        (
            task_name,
            total,
            result,
            calculate_score(total, result),
            is_perfect_completion(total, result),
            get_task_nature_category(task_name),
        )
        for task_name, (total, result) in eval_results.items()
    ]

    # Sort by score in descending order
    detailed_results.sort(key=lambda x: (-x[3], x[0]))

    # Calculate perfect completion stats
    perfect_completions = sum(
        1 for _, _, _, _, is_perfect, _ in detailed_results if is_perfect
    )

    # Print header
    print('\n# Evaluation Results Report')
    print('\n## Results per File')
    print('\n*Sorted by score (⭐ indicates perfect completion)*\n')

    # Print table header
    print(
        '| Filename | Total | Result | Score | Steps | Cost (assuming no prompt caching)|'
    )
    print('|----------|--------|---------|-------|-------|------|')

    # Print individual file results
    for task_name, total, result, score, is_perfect, task_nature in detailed_results:
        perfect_marker = ' ⭐' if is_perfect else ''
        print(
            f'| {task_name} | {total:,} | {result:,} | {score:.2f}{perfect_marker} | {traj_results[task_name][0]} | {traj_results[task_name][1]:.2f} |'
        )

    # Print summary section
    print('\n## Summary\n')
    print(f'**Tasks Evaluated:** {len(eval_results)}\n')
    print(
        f'**Perfect Completions:** {perfect_completions}/{len(eval_results)} ({(perfect_completions / len(eval_results) * 100):.2f}%)\n'
    )

    overall_score = (
        sum(score for _, _, _, score, _, _ in detailed_results)
        / len(detailed_results)
        * 100
    )
    avg_steps = sum(steps for steps, _ in traj_results.values()) / len(traj_results)
    avg_cost = sum(cost for _, cost in traj_results.values()) / len(traj_results)
    print(f'**Overall Score:** {overall_score:.2f}%\n')
    print(f'**Average Steps:** {avg_steps:.2f}\n')
    print(f'**Average Cost (USD):** {avg_cost:.2f}\n')
    # Additional statistics
    if detailed_results:
        highest_score = max(score for _, _, _, score, _, _ in detailed_results)
        lowest_score = min(score for _, _, _, score, _, _ in detailed_results)
        median_score = detailed_results[len(detailed_results) // 2][3]
        avg_score = sum(score for _, _, _, score, _, _ in detailed_results) / len(
            detailed_results
        )

        print('\n## Statistics\n')
        print('| Metric | Value |')
        print('|---------|--------|')
        print(f'| Highest Task Score | {highest_score * 100:.2f}% |')
        print(f'| Lowest Task Score | {lowest_score * 100:.2f}% |')
        print(f'| Median Task Score | {median_score * 100:.2f}% |')
        print(f'| Average Task Score | {avg_score * 100:.2f}% |')

        # compute avg score per nature category
        print('\n## Statistics per Nature Category\n')
        print('| Metric | Value |')
        print('|---------|--------|')
        for task_nature in ['sde', 'pm', 'ds', 'admin', 'hr', 'finance', 'other']:
            num_of_tasks = sum(
                1
                for _, _, _, _, _, nature_category in detailed_results
                if nature_category == task_nature
            )
            if num_of_tasks == 0:
                # Skip categories with no tasks; otherwise the averages below would
                # divide by zero.
                continue
            task_nature_score = (
                sum(
                    score
                    for _, _, _, score, _, nature_category in detailed_results
                    if nature_category == task_nature
                )
                / num_of_tasks
            )
            perfect_completions = sum(
                1
                for _, _, _, _, is_perfect, nature_category in detailed_results
                if nature_category == task_nature and is_perfect
            )
            print(
                f'| Perfect Completions for {task_nature} | {perfect_completions}/{num_of_tasks} ({perfect_completions / num_of_tasks * 100:.2f}%) |'
            )
            print(
                f'| Average Score for {task_nature} | {task_nature_score * 100:.2f}% |'
            )


if __name__ == '__main__':
    main()