import datetime
import os
import re

import gradio as gr
import pandas as pd
import yaml

# Constants
EVAL_CARDS_DIR = "eval_cards"
TEMPLATE_PATH = "template.yaml"

# Ensure the eval cards directory exists
os.makedirs(EVAL_CARDS_DIR, exist_ok=True)

# Copy the bundled template to the expected location
with open("yaml_template.yaml", "r") as template_file:
    template_content = template_file.read()
with open(TEMPLATE_PATH, "w") as f:
    f.write(template_content)


def load_template():
    """Load the YAML template"""
    with open(TEMPLATE_PATH, "r") as file:
        return file.read()


def yaml_to_dict(yaml_str):
    """Convert YAML string to Python dictionary"""
    try:
        return yaml.safe_load(yaml_str)
    except yaml.YAMLError as e:
        return {"error": str(e)}


def compute_coverage_score(eval_data):
    """
    Compute a coverage score for the eval card.
    Returns a score from 0-100 and a breakdown of coverage by section.
    """
    sections = {
        "metadata": 5,
        "evaluation_design": 10,
        "estimand": 20,
        "estimator": 20,
        "estimate": 20,
        "results_communication": 10,
        "known_issues_and_limitations": 10,
        "version_and_maintenance": 5,
        "citation_and_usage": 5,
    }
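    # Note: these weights sum to 105, so the total is capped at 100 below.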
    scores = {}
    total_score = 0

    def count_filled_fields(data, prefix=""):
        if isinstance(data, dict):
            filled = 0
            total = 0
            for key, value in data.items():
                if isinstance(value, (dict, list)):
                    sub_filled, sub_total = count_filled_fields(
                        value, f"{prefix}.{key}" if prefix else key
                    )
                    filled += sub_filled
                    total += sub_total
                else:
                    total += 1
                    if value and not (
                        isinstance(value, str) and value.strip() in ["", "[]", "{}"]
                    ):
                        filled += 1
            return filled, total
        elif isinstance(data, list):
            if not data:
                return 0, 1
            filled = 0
            total = 0
            for item in data:
                sub_filled, sub_total = count_filled_fields(item)
                filled += sub_filled
                total += sub_total
            return filled, total
        else:
            return 1 if data else 0, 1

    # Compute scores for each section
    for section, weight in sections.items():
        if section in eval_data:
            filled, total = count_filled_fields(eval_data[section])
            completion_rate = filled / total if total > 0 else 0
            scores[section] = {
                "score": round(completion_rate * weight, 2),
                "max_score": weight,
                "completion_rate": round(completion_rate * 100, 2),
                "fields_filled": filled,
                "fields_total": total,
            }
            total_score += scores[section]["score"]
        else:
            scores[section] = {
                "score": 0,
                "max_score": weight,
                "completion_rate": 0,
                "fields_filled": 0,
                "fields_total": 0,
            }
    return min(round(total_score, 2), 100), scores
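

# Illustrative (hypothetical) input: a card containing only
#   {"metadata": {"authors": ["A. Author"], "creation_date": "2024-01-01"}}
# would earn the full 5-point metadata weight and 0 for the eight missing
# sections, for a total coverage score of 5.0.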


def get_llm_feedback(yaml_content, api_token=None):
    """
    Get feedback on the eval card from Groq's LLM.
    Uses GROQ_API_KEY from environment variables if no token is provided.
    """
    import requests
    from dotenv import load_dotenv

    # Load environment variables from .env file if it exists
    load_dotenv()

    # Use the provided token or fall back to the environment
    api_token = api_token or os.environ.get("GROQ_API_KEY")
    if not api_token:
        return "API token is required for LLM feedback. Please set the GROQ_API_KEY environment variable or provide a token."

    try:
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_token}",
        }
        prompt = f"""
        I'm reviewing an Evaluation Card in YAML format. Please analyze it for completeness,
        consistency, and clarity. Provide specific recommendations for improvement.

        Focus on:
        1. Sections that need more detail
        2. Inconsistencies or contradictions
        3. Clarity of language and explanations
        4. Alignment with best practices for ML evaluation

        Here's the YAML content:
        ```yaml
        {yaml_content}
        ```

        Provide your feedback in a structured format with specific, actionable recommendations.
        """
        payload = {
            "model": "llama-3.3-70b-versatile",  # or another Groq-supported model
            "messages": [{"role": "user", "content": prompt}],
        }
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=60,  # avoid hanging the UI on a stalled request
        )
        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"]
        else:
            return f"Error getting Groq LLM feedback: {response.status_code} - {response.text}"
    except Exception as e:
        return f"Error getting Groq LLM feedback: {str(e)}"


def save_eval_card(yaml_content, paper_url="", repo_url=""):
    """Save an eval card with additional metadata"""
    try:
        eval_data = yaml.safe_load(yaml_content)

        # Add paper and repository links, creating the metadata
        # section if the card doesn't have one yet
        metadata = eval_data.setdefault("metadata", {})
        if paper_url:
            metadata["paper_link"] = paper_url
        if repo_url:
            metadata["repository_link"] = repo_url

        # Update the YAML content with the new metadata
        # (sort_keys=False keeps the card's section order)
        yaml_content = yaml.dump(eval_data, sort_keys=False)

        filename = re.sub(r"[^\w\-_]", "_", eval_data.get("title", "Unnamed"))
        filename = (
            f"{filename}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"
        )
        file_path = os.path.join(EVAL_CARDS_DIR, filename)
        with open(file_path, "w") as file:
            file.write(yaml_content)
        return f"Evaluation card saved successfully as {filename}", file_path
    except Exception as e:
        return f"Error saving evaluation card: {str(e)}", None
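
# For example, a card titled "Bandgap Prediction Eval" (hypothetical) saved at
# 2024-01-01 12:00:00 would be written to
# eval_cards/Bandgap_Prediction_Eval_20240101_120000.yaml.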


def load_all_eval_cards():
    """Load all eval cards from the repository"""
    eval_cards = []
    for filename in os.listdir(EVAL_CARDS_DIR):
        if filename.endswith(".yaml"):
            file_path = os.path.join(EVAL_CARDS_DIR, filename)
            try:
                with open(file_path, "r") as file:
                    yaml_content = file.read()
                eval_data = yaml.safe_load(yaml_content)
                if not isinstance(eval_data, dict):
                    # Skip empty or malformed files
                    continue

                # Compute coverage score (already capped at 100)
                score, score_details = compute_coverage_score(eval_data)

                # Extract key metadata
                eval_cards.append(
                    {
                        "filename": filename,
                        "title": eval_data.get("title", "Unnamed Evaluation"),
                        "summary": eval_data.get("summary", ""),
                        "authors": ", ".join(
                            eval_data.get("metadata", {}).get("authors", [])
                        ),
                        "creation_date": eval_data.get("metadata", {}).get(
                            "creation_date", ""
                        ),
                        "coverage_score": score,
                        "score_details": score_details,
                        "yaml_content": yaml_content,
                        "data": eval_data,
                    }
                )
            except Exception as e:
                print(f"Error loading {filename}: {str(e)}")
    return eval_cards


def format_eval_card_as_html(eval_card):
    """Format an eval card as HTML for display"""
    # Guard against cards without a metadata section
    metadata = eval_card["data"].get("metadata", {})
    repo_link = metadata.get("repository_link", "")
    paper_link = metadata.get("paper_link", "")

    html = f"""
    <div style="border: 1px solid #ddd; padding: 15px; margin-bottom: 20px; border-radius: 5px;">
        <h3>{eval_card["title"]}</h3>
        <p>{eval_card["summary"]}</p>
        <p><strong>Authors:</strong> {eval_card["authors"]}</p>
        <p><strong>Created:</strong> {eval_card["creation_date"]}</p>
        <!-- Add repository and paper links if available -->
        {f'<p><strong>Repository:</strong> <a href="{repo_link}" target="_blank">{repo_link}</a></p>' if repo_link else ""}
        {f'<p><strong>Paper:</strong> <a href="{paper_link}" target="_blank">{paper_link}</a></p>' if paper_link else ""}
        <p><strong>Coverage Score:</strong> {eval_card["coverage_score"]}%</p>
        <h4>Coverage by Section:</h4>
        <table style="width: 100%; border-collapse: collapse;">
            <tr>
                <th style="text-align: left; padding: 5px; border-bottom: 1px solid #ddd;">Section</th>
                <th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Score</th>
                <th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Completion</th>
            </tr>
    """
    for section, details in eval_card["score_details"].items():
        html += f"""
            <tr>
                <td style="padding: 5px; border-bottom: 1px solid #eee;">{section}</td>
                <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details["score"]}/{details["max_score"]}</td>
                <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details["completion_rate"]}%</td>
            </tr>
        """
    html += """
        </table>
        <div style="margin-top: 15px;">
            <!-- Additional actions can go here -->
        </div>
    </div>
    """
    return html


def create_eval_cards_table(eval_cards):
    """Create an HTML table of eval cards"""
    if not eval_cards:
        return "<p>No evaluation cards found.</p>"

    # Sort by coverage score (highest first)
    eval_cards.sort(key=lambda x: x["coverage_score"], reverse=True)

    html = ""
    for eval_card in eval_cards:
        html += format_eval_card_as_html(eval_card)
    return html


def upload_file(file):
    """Process an uploaded YAML file"""
    if file is None:
        return "No file uploaded", None
    try:
        # gr.File hands back a file path by default; handle raw bytes too
        # in case the component is configured with type="binary"
        if isinstance(file, bytes):
            yaml_content = file.decode("utf-8")
        else:
            with open(file, "r", encoding="utf-8") as f:
                yaml_content = f.read()

        # Validate YAML
        eval_data = yaml.safe_load(yaml_content)
        return yaml_content, eval_data
    except Exception as e:
        return f"Error processing file: {str(e)}", None


def get_feedback(yaml_content):
    """Get LLM feedback on the eval card"""
    if not yaml_content:
        return "Please upload or paste a YAML file first."

    # The token field is disabled in the UI, so the key must come from the environment
    api_token = os.environ.get("GROQ_API_KEY")
    if not api_token:
        return "Please set the GROQ_API_KEY environment variable."

    return get_llm_feedback(yaml_content, api_token)


def submit_eval_card(yaml_content, paper_url="", repo_url=""):
    """Submit an eval card to the repository"""
    if not yaml_content:
        return "Please upload or paste a YAML file first.", None, None
    try:
        # Validate YAML
        eval_data = yaml.safe_load(yaml_content)

        # Compute coverage score
        score, score_details = compute_coverage_score(eval_data)

        # Save eval card with URLs
        result, file_path = save_eval_card(yaml_content, paper_url, repo_url)
        if file_path:
            return (
                f"Evaluation card saved successfully! Coverage score: {score}%",
                score,
                score_details,
            )
        else:
            # result already contains the error message from save_eval_card
            return result, None, None
    except Exception as e:
        return f"Error processing evaluation card: {str(e)}", None, None


def refresh_gallery():
    """Refresh the gallery of eval cards"""
    eval_cards = load_all_eval_cards()
    html = create_eval_cards_table(eval_cards)

    # Convert data to a pandas DataFrame for the table view
    table_data = []

    # First, count authors across all cards
    author_counts = {}
    for card in eval_cards:
        for author in card["authors"].split(", "):
            if author.strip():  # Skip empty authors
                author_counts[author] = author_counts.get(author, 0) + 1

    # Keep only the 20 most frequent authors
    top_authors = sorted(author_counts.items(), key=lambda x: x[1], reverse=True)[:20]
    top_authors = [author for author, count in top_authors]

    # Create table data with one entry per card
    for card in eval_cards:
        authors = card["authors"].split(", ")
        filtered_authors = [author for author in authors if author in top_authors]
        table_data.append(
            {
                "Title": card["title"],
                "Authors": ", ".join(filtered_authors),
                "Creation Date": card["creation_date"],
                "Coverage Score": f"{card['coverage_score']}%",
            }
        )

    df = pd.DataFrame(table_data)
    return html, (df if not df.empty else None)


def handle_upload_tab(file_obj, yaml_text):
    """Handle upload tab actions - either use the uploaded file or the pasted text"""
    if file_obj is not None:
        yaml_content, _ = upload_file(file_obj)
        return yaml_content
    return yaml_text


# Create the Gradio interface
with gr.Blocks(title="Evaluation Cards Gallery") as app:
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("# Evaluation Cards for Machine Learning in Materials Science")
            gr.Markdown("""
            Upload your evaluation card in YAML format, get feedback from an LLM, and submit it to the gallery.
            Check out the [GitHub repository](https://github.com/lamalab-org/eval-cards) for more information.
            """)

    with gr.Tabs():
        with gr.TabItem("Upload & Review"):
            with gr.Row():
                with gr.Column():
                    file_upload = gr.File(
                        label="Upload YAML File", file_types=[".yaml", ".yml"]
                    )
                    with gr.Accordion("Or paste YAML content", open=False):
                        yaml_input = gr.TextArea(
                            label="YAML Content",
                            placeholder="Paste your YAML content here...",
                            lines=10,
                        )
                    paper_url_input = gr.Textbox(
                        label="Paper URL (Optional)",
                        placeholder="https://arxiv.org/abs/...",
                    )
                    repo_url_input = gr.Textbox(
                        label="Repository URL (Optional)",
                        placeholder="https://github.com/...",
                    )
                    load_template_btn = gr.Button("Load Template")
                    # api_token = gr.Textbox(label="API Token (for LLM feedback)", type="password")
                    with gr.Row():
                        get_feedback_btn = gr.Button("Get LLM Feedback")
                        submit_btn = gr.Button(
                            "Submit Evaluation Card", variant="primary"
                        )
                with gr.Column():
                    yaml_display = gr.TextArea(label="Current YAML", lines=20)
                    with gr.Accordion("LLM Feedback", open=True):
                        feedback_display = gr.Markdown()
                    with gr.Accordion("Submission Result", open=True):
                        result_display = gr.Markdown()
                        coverage_score = gr.Number(
                            label="Coverage Score", visible=False
                        )
                        coverage_details = gr.JSON(
                            label="Coverage Details", visible=False
                        )
        with gr.TabItem("Gallery"):
            refresh_btn = gr.Button("Refresh Gallery")
            with gr.Tabs():
                with gr.TabItem("Card View"):
                    gallery_html = gr.HTML()
                with gr.TabItem("Table View"):
                    gallery_table = gr.DataFrame()

    # Set up event handlers
    load_template_btn.click(fn=load_template, outputs=[yaml_display])
    file_upload.change(
        fn=handle_upload_tab, inputs=[file_upload, yaml_input], outputs=[yaml_display]
    )
    yaml_input.change(fn=lambda x: x, inputs=[yaml_input], outputs=[yaml_display])
    get_feedback_btn.click(
        fn=get_feedback, inputs=[yaml_display], outputs=[feedback_display]
    )
    submit_btn.click(
        fn=submit_eval_card,
        inputs=[yaml_display, paper_url_input, repo_url_input],
        outputs=[result_display, coverage_score, coverage_details],
    )
    refresh_btn.click(fn=refresh_gallery, outputs=[gallery_html, gallery_table])

    # Initialize the gallery on app start
    app.load(fn=refresh_gallery, outputs=[gallery_html, gallery_table])

# Launch the app
if __name__ == "__main__":
    app.launch()
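
# Note: app.launch() uses Gradio's defaults; for local testing, standard Gradio
# options such as share=True or server_name="0.0.0.0" can be passed here
# (neither is required when running as a Hugging Face Space).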