|
import datetime
import os
import re

import gradio as gr
import pandas as pd
import yaml

EVAL_CARDS_DIR = "eval_cards"
TEMPLATE_PATH = "template.yaml"

os.makedirs(EVAL_CARDS_DIR, exist_ok=True)

# Copy the bundled template into place. Read the source first so a missing
# yaml_template.yaml fails loudly instead of leaving an empty TEMPLATE_PATH behind.
with open("yaml_template.yaml", "r") as template_file:
    template_content = template_file.read()
with open(TEMPLATE_PATH, "w") as f:
    f.write(template_content)
|
|
|
|
|
def load_template():
    """Load the YAML template"""
    with open(TEMPLATE_PATH, "r") as file:
        return file.read()
|
|
|
|
|
def yaml_to_dict(yaml_str):
    """Convert a YAML string to a Python dictionary"""
    try:
        return yaml.safe_load(yaml_str)
    except yaml.YAMLError as e:
        return {"error": str(e)}
|
|
|
|
|
def compute_coverage_score(eval_data):
    """
    Compute a coverage score for the eval card.

    Returns a score from 0-100 and a breakdown of coverage by section.
    """
    # Section weights. Note they sum to 105, so the total is clipped to 100 below.
    sections = {
        "metadata": 5,
        "evaluation_design": 10,
        "estimand": 20,
        "estimator": 20,
        "estimate": 20,
        "results_communication": 10,
        "known_issues_and_limitations": 10,
        "version_and_maintenance": 5,
        "citation_and_usage": 5,
    }

    scores = {}
    total_score = 0

    def count_filled_fields(data, prefix=""):
        # Recursively count (filled, total) leaf fields. Falsy scalars
        # (None, "", 0, False) and placeholder strings count as unfilled.
        if isinstance(data, dict):
            filled = 0
            total = 0
            for key, value in data.items():
                if isinstance(value, (dict, list)):
                    sub_filled, sub_total = count_filled_fields(
                        value, f"{prefix}.{key}" if prefix else key
                    )
                    filled += sub_filled
                    total += sub_total
                else:
                    total += 1
                    if value and not (
                        isinstance(value, str) and value.strip() in ["", "[]", "{}"]
                    ):
                        filled += 1
            return filled, total
        elif isinstance(data, list):
            # An empty list counts as one unfilled field.
            if not data:
                return 0, 1
            filled = 0
            total = 0
            for item in data:
                sub_filled, sub_total = count_filled_fields(item)
                filled += sub_filled
                total += sub_total
            return filled, total
        else:
            return 1 if data else 0, 1

    for section, weight in sections.items():
        if section in eval_data:
            filled, total = count_filled_fields(eval_data[section])
            completion_rate = filled / total if total > 0 else 0
            scores[section] = {
                "score": round(completion_rate * weight, 2),
                "max_score": weight,
                "completion_rate": round(completion_rate * 100, 2),
                "fields_filled": filled,
                "fields_total": total,
            }
            total_score += scores[section]["score"]
        else:
            scores[section] = {
                "score": 0,
                "max_score": weight,
                "completion_rate": 0,
                "fields_filled": 0,
                "fields_total": 0,
            }

    # Weights sum to 105, so clip the aggregate at 100.
    return min(round(total_score, 2), 100), scores
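# A sketch of the scorer on a minimal (hypothetical) card:
#
#     score, details = compute_coverage_score(
#         {"metadata": {"authors": ["A. Author"], "creation_date": ""}}
#     )
#     # details["metadata"] -> fields_filled=1, fields_total=2,
#     # completion_rate=50.0, score=2.5 (out of a max_score of 5);
#     # the eight missing sections contribute 0, so score == 2.5.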
|
|
|
|
|
def get_llm_feedback(yaml_content, api_token=None):
    """
    Get feedback on the eval card from Groq's LLM.

    Uses GROQ_API_KEY from the environment (or a .env file) if no token is provided.
    """
    # Imported lazily so the app can still start if these extras are missing.
    import requests
    from dotenv import load_dotenv

    load_dotenv()

    api_token = api_token or os.environ.get("GROQ_API_KEY")

    if not api_token:
        return (
            "API token is required for LLM feedback. Please set the GROQ_API_KEY "
            "environment variable or provide a token."
        )

    try:
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_token}",
        }

        prompt = f"""
        I'm reviewing an Evaluation Card in YAML format. Please analyze it for completeness,
        consistency, and clarity. Provide specific recommendations for improvement.

        Focus on:
        1. Sections that need more detail
        2. Inconsistencies or contradictions
        3. Clarity of language and explanations
        4. Alignment with best practices for ML evaluation

        Here's the YAML content:

        ```yaml
        {yaml_content}
        ```

        Provide your feedback in a structured format with specific, actionable recommendations.
        """

        payload = {
            "model": "llama-3.3-70b-versatile",
            "messages": [{"role": "user", "content": prompt}],
        }

        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=120,
        )

        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"]
        else:
            return f"Error getting Groq LLM feedback: {response.status_code} - {response.text}"

    except Exception as e:
        return f"Error getting Groq LLM feedback: {str(e)}"
|
|
|
|
|
def save_eval_card(yaml_content, paper_url="", repo_url=""): |
|
"""Save an eval card with additional metadata""" |
|
try: |
|
eval_data = yaml.safe_load(yaml_content) |
|
|
|
|
|
if paper_url: |
|
eval_data["metadata"]["paper_link"] = paper_url |
|
if repo_url: |
|
eval_data["metadata"]["repository_link"] = repo_url |
|
|
|
|
|
yaml_content = yaml.dump(eval_data) |
|
|
|
filename = re.sub(r"[^\w\-_]", "_", eval_data.get("title", "Unnamed")) |
|
filename = ( |
|
f"{filename}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml" |
|
) |
|
file_path = os.path.join(EVAL_CARDS_DIR, filename) |
|
|
|
with open(file_path, "w") as file: |
|
file.write(yaml_content) |
|
|
|
return f"Evaluation card saved successfully as {filename}", file_path |
|
except Exception as e: |
|
return f"Error saving evaluation card: {str(e)}", None |
|
|
|
|
|
def load_all_eval_cards():
    """Load all eval cards from the repository"""
    eval_cards = []

    for filename in os.listdir(EVAL_CARDS_DIR):
        if filename.endswith((".yaml", ".yml")):
            file_path = os.path.join(EVAL_CARDS_DIR, filename)
            try:
                with open(file_path, "r") as file:
                    yaml_content = file.read()
                    eval_data = yaml.safe_load(yaml_content)

                # compute_coverage_score already clips the total at 100.
                score, score_details = compute_coverage_score(eval_data)

                # Guard against cards with an empty metadata block.
                metadata = eval_data.get("metadata") or {}

                eval_cards.append(
                    {
                        "filename": filename,
                        "title": eval_data.get("title", "Unnamed Evaluation"),
                        "summary": eval_data.get("summary", ""),
                        "authors": ", ".join(metadata.get("authors", [])),
                        "creation_date": metadata.get("creation_date", ""),
                        "coverage_score": score,
                        "score_details": score_details,
                        "yaml_content": yaml_content,
                        "data": eval_data,
                    }
                )
            except Exception as e:
                print(f"Error loading {filename}: {str(e)}")

    return eval_cards
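# Each entry in the returned list mirrors this shape (values illustrative):
#     {"filename": "My_Eval__20240102_030405.yaml", "title": "My Eval!",
#      "authors": "A. Author", "coverage_score": 2.5, ...}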
|
|
|
|
|
def format_eval_card_as_html(eval_card):
    """Format an eval card as HTML for display"""
    # Guard against cards without a metadata block.
    metadata = eval_card["data"].get("metadata") or {}
    repo_link = metadata.get("repository_link", "")
    paper_link = metadata.get("paper_link", "")

    html = f"""
    <div style="border: 1px solid #ddd; padding: 15px; margin-bottom: 20px; border-radius: 5px;">
        <h3>{eval_card["title"]}</h3>
        <p>{eval_card["summary"]}</p>
        <p><strong>Authors:</strong> {eval_card["authors"]}</p>
        <p><strong>Created:</strong> {eval_card["creation_date"]}</p>

        <!-- Repository and paper links, shown only when available -->
        {f'<p><strong>Repository:</strong> <a href="{repo_link}" target="_blank">{repo_link}</a></p>' if repo_link else ""}
        {f'<p><strong>Paper:</strong> <a href="{paper_link}" target="_blank">{paper_link}</a></p>' if paper_link else ""}

        <p><strong>Coverage Score:</strong> {eval_card["coverage_score"]}%</p>

        <h4>Coverage by Section:</h4>
        <table style="width: 100%; border-collapse: collapse;">
            <tr>
                <th style="text-align: left; padding: 5px; border-bottom: 1px solid #ddd;">Section</th>
                <th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Score</th>
                <th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Completion</th>
            </tr>
    """

    for section, details in eval_card["score_details"].items():
        html += f"""
            <tr>
                <td style="padding: 5px; border-bottom: 1px solid #eee;">{section}</td>
                <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details["score"]}/{details["max_score"]}</td>
                <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details["completion_rate"]}%</td>
            </tr>
        """

    html += """
        </table>
        <div style="margin-top: 15px;">
            <!-- Additional actions can go here -->
        </div>
    </div>
    """

    return html
|
|
|
|
|
def create_eval_cards_table(eval_cards):
    """Create an HTML table of eval cards"""
    if not eval_cards:
        return "<p>No evaluation cards found.</p>"

    # Show the most complete cards first.
    eval_cards.sort(key=lambda x: x["coverage_score"], reverse=True)

    html = ""
    for eval_card in eval_cards:
        html += format_eval_card_as_html(eval_card)

    return html
|
|
|
|
|
def upload_file(file):
    """Process an uploaded YAML file"""
    if file is None:
        return "No file uploaded", None

    try:
        # Depending on the Gradio version, gr.File yields raw bytes, a tempfile
        # object with a .name attribute, or a plain file path.
        if isinstance(file, bytes):
            yaml_content = file.decode("utf-8")
        else:
            file_path = file.name if hasattr(file, "name") else file
            with open(file_path, "r") as f:
                yaml_content = f.read()

        eval_data = yaml.safe_load(yaml_content)
        return yaml_content, eval_data
    except Exception as e:
        return f"Error processing file: {str(e)}", None
|
|
|
|
|
def get_feedback(yaml_content):
    """Get LLM feedback on the eval card"""
    if not yaml_content:
        return "Please upload or paste a YAML file first."

    # Token resolution (including reading a .env file via load_dotenv) happens
    # inside get_llm_feedback, so don't pre-check os.environ here: doing so
    # would reject tokens that only live in a .env file.
    return get_llm_feedback(yaml_content)
|
|
|
|
|
def submit_eval_card(yaml_content, paper_url="", repo_url=""): |
|
"""Submit an eval card to the repository""" |
|
if not yaml_content: |
|
return "Please upload or paste a YAML file first.", None, None |
|
|
|
try: |
|
|
|
eval_data = yaml.safe_load(yaml_content) |
|
|
|
|
|
score, score_details = compute_coverage_score(eval_data) |
|
|
|
|
|
result, file_path = save_eval_card(yaml_content, paper_url, repo_url) |
|
|
|
if file_path: |
|
return ( |
|
f"Evaluation card saved successfully! Coverage score: {score}%", |
|
score, |
|
score_details, |
|
) |
|
else: |
|
return f"Error saving evaluation card: {result}", None, None |
|
|
|
except Exception as e: |
|
return f"Error processing evaluation card: {str(e)}", None, None |
|
|
|
|
|
def refresh_gallery():
    """Refresh the gallery of eval cards"""
    eval_cards = load_all_eval_cards()
    html = create_eval_cards_table(eval_cards)

    # Count how often each author appears across all cards, then keep only
    # the 20 most frequent authors in the table view.
    author_counts = {}
    for card in eval_cards:
        for author in card["authors"].split(", "):
            author_counts[author] = author_counts.get(author, 0) + 1

    top_authors = sorted(author_counts.items(), key=lambda x: x[1], reverse=True)[:20]
    top_authors = [author for author, count in top_authors]

    table_data = []
    for card in eval_cards:
        authors = card["authors"].split(", ")
        filtered_authors = [author for author in authors if author in top_authors]
        table_data.append(
            {
                "Title": card["title"],
                "Authors": ", ".join(filtered_authors),
                "Creation Date": card["creation_date"],
                "Coverage Score": f"{card['coverage_score']}%",
            }
        )

    df = pd.DataFrame(table_data)

    return html, df if not df.empty else None
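# e.g. cards with authors "A, B" and "A, C" yield author_counts
# {"A": 2, "B": 1, "C": 1}; with fewer than 20 distinct authors, nobody is
# filtered out of the table view.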
|
|
|
|
|
def handle_upload_tab(file_obj, yaml_text):
    """Handle upload tab actions: use the uploaded file if present, else the pasted text"""
    if file_obj is not None:
        yaml_content, _eval_data = upload_file(file_obj)
        return yaml_content
    else:
        return yaml_text
|
|
|
|
|
|
|
with gr.Blocks(title="Evaluation Cards Gallery") as app: |
|
with gr.Row(): |
|
with gr.Column(scale=2): |
|
gr.Markdown("# Evaluation Cards for Machine Learning in Materials Science. ") |
|
gr.Markdown(""" |
|
Upload your evaluation card in YAML format, get feedback from an LLM, and submit it to the gallery. |
|
checkout the [GitHub repository](https://github.com/lamalab-org/eval-cards) for more information. |
|
""") |
|
|
|
with gr.Tabs(): |
|
with gr.TabItem("Upload & Review"): |
|
with gr.Row(): |
|
with gr.Column(): |
|
file_upload = gr.File( |
|
label="Upload YAML File", file_types=[".yaml", ".yml"] |
|
) |
|
|
|
with gr.Accordion("Or paste YAML content", open=False): |
|
yaml_input = gr.TextArea( |
|
label="YAML Content", |
|
placeholder="Paste your YAML content here...", |
|
lines=10, |
|
) |
|
paper_url_input = gr.Textbox( |
|
label="Paper URL (Optional)", |
|
placeholder="https://arxiv.org/abs/...", |
|
) |
|
|
|
repo_url_input = gr.Textbox( |
|
label="Repository URL (Optional)", |
|
placeholder="https://github.com/...", |
|
) |
|
|
|
load_template_btn = gr.Button("Load Template") |
|
|
|
|
|
|
|
with gr.Row(): |
|
get_feedback_btn = gr.Button("Get LLM Feedback") |
|
submit_btn = gr.Button( |
|
"Submit Evaluation Card", variant="primary" |
|
) |
|
|
|
with gr.Column(): |
|
yaml_display = gr.TextArea(label="Current YAML", lines=20) |
|
|
|
with gr.Accordion("LLM Feedback", open=True): |
|
feedback_display = gr.Markdown() |
|
|
|
with gr.Accordion("Submission Result", open=True): |
|
result_display = gr.Markdown() |
|
coverage_score = gr.Number( |
|
label="Coverage Score", visible=False |
|
) |
|
coverage_details = gr.JSON( |
|
label="Coverage Details", visible=False |
|
) |
|
|
|
with gr.TabItem("Gallery"): |
|
refresh_btn = gr.Button("Refresh Gallery") |
|
|
|
with gr.Tabs(): |
|
with gr.TabItem("Card View"): |
|
gallery_html = gr.HTML() |
|
|
|
with gr.TabItem("Table View"): |
|
gallery_table = gr.DataFrame() |
|
|
|
|
|
load_template_btn.click(fn=load_template, outputs=[yaml_display]) |
|
|
|
file_upload.change( |
|
fn=handle_upload_tab, inputs=[file_upload, yaml_input], outputs=[yaml_display] |
|
) |
|
|
|
yaml_input.change(fn=lambda x: x, inputs=[yaml_input], outputs=[yaml_display]) |
|
|
|
get_feedback_btn.click( |
|
fn=get_feedback, inputs=[yaml_display], outputs=[feedback_display] |
|
) |
|
|
|
submit_btn.click( |
|
fn=submit_eval_card, |
|
inputs=[yaml_display, paper_url_input, repo_url_input], |
|
outputs=[result_display, coverage_score, coverage_details], |
|
) |
|
|
|
refresh_btn.click(fn=refresh_gallery, outputs=[gallery_html, gallery_table]) |
|
|
|
|
|
app.load(fn=refresh_gallery, outputs=[gallery_html, gallery_table]) |
|
|
|
|
|
if __name__ == "__main__": |
|
app.launch() |
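# Gradio's launch() also accepts networking options when needed, e.g.
# app.launch(server_name="0.0.0.0", server_port=7860) to serve on all
# interfaces (values shown are illustrative).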
|
|