|
import datetime
import os
import re

import gradio as gr
import pandas as pd
import yaml

EVAL_CARDS_DIR = "eval_cards"
TEMPLATE_PATH = "template.yaml"

os.makedirs(EVAL_CARDS_DIR, exist_ok=True)

# Copy the bundled template into place. Read the source first so a missing
# yaml_template.yaml fails loudly instead of leaving an empty TEMPLATE_PATH behind.
with open("yaml_template.yaml", "r") as template_file:
    template_content = template_file.read()
with open(TEMPLATE_PATH, "w") as f:
    f.write(template_content)
|
|
|
|
|
def load_template():
    """Load the YAML template"""
    with open(TEMPLATE_PATH, "r") as file:
        return file.read()
|
|
|
|
|
def yaml_to_dict(yaml_str):
    """Convert a YAML string to a Python dictionary"""
    try:
        return yaml.safe_load(yaml_str)
    except yaml.YAMLError as e:
        return {"error": str(e)}
|
|
|
|
|
def compute_coverage_score(eval_data):
    """
    Compute a coverage score for the eval card.

    Returns a score from 0-100 and a breakdown of coverage by section.
    """
    # Section weights. Note they sum to 105, so the total is clipped to 100 below.
    sections = {
        "metadata": 5,
        "evaluation_design": 10,
        "estimand": 20,
        "estimator": 20,
        "estimate": 20,
        "results_communication": 10,
        "known_issues_and_limitations": 10,
        "version_and_maintenance": 5,
        "citation_and_usage": 5,
    }

    scores = {}
    total_score = 0

    def count_filled_fields(data, prefix=""):
        # Recursively count (filled, total) leaf fields. Falsy scalars
        # (None, "", 0, False) and placeholder strings count as unfilled.
        if isinstance(data, dict):
            filled = 0
            total = 0
            for key, value in data.items():
                if isinstance(value, (dict, list)):
                    sub_filled, sub_total = count_filled_fields(
                        value, f"{prefix}.{key}" if prefix else key
                    )
                    filled += sub_filled
                    total += sub_total
                else:
                    total += 1
                    if value and not (
                        isinstance(value, str) and value.strip() in ["", "[]", "{}"]
                    ):
                        filled += 1
            return filled, total
        elif isinstance(data, list):
            # An empty list counts as one unfilled field.
            if not data:
                return 0, 1
            filled = 0
            total = 0
            for item in data:
                sub_filled, sub_total = count_filled_fields(item)
                filled += sub_filled
                total += sub_total
            return filled, total
        else:
            return 1 if data else 0, 1

    for section, weight in sections.items():
        if section in eval_data:
            filled, total = count_filled_fields(eval_data[section])
            completion_rate = filled / total if total > 0 else 0
            scores[section] = {
                "score": round(completion_rate * weight, 2),
                "max_score": weight,
                "completion_rate": round(completion_rate * 100, 2),
                "fields_filled": filled,
                "fields_total": total,
            }
            total_score += scores[section]["score"]
        else:
            scores[section] = {
                "score": 0,
                "max_score": weight,
                "completion_rate": 0,
                "fields_filled": 0,
                "fields_total": 0,
            }

    # Weights sum to 105, so clip the aggregate at 100.
    return min(round(total_score, 2), 100), scores
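# A sketch of the scorer on a minimal (hypothetical) card:
#
#     score, details = compute_coverage_score(
#         {"metadata": {"authors": ["A. Author"], "creation_date": ""}}
#     )
#     # details["metadata"] -> fields_filled=1, fields_total=2,
#     # completion_rate=50.0, score=2.5 (out of a max_score of 5);
#     # the eight missing sections contribute 0, so score == 2.5.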
|
|
|
|
|
def get_llm_feedback(yaml_content, api_token=None):
    """
    Get feedback on the eval card from Groq's LLM.

    Uses GROQ_API_KEY from the environment (or a .env file) if no token is provided.
    """
    # Imported lazily so the app can still start if these extras are missing.
    import requests
    from dotenv import load_dotenv

    load_dotenv()

    api_token = api_token or os.environ.get("GROQ_API_KEY")

    if not api_token:
        return (
            "API token is required for LLM feedback. Please set the GROQ_API_KEY "
            "environment variable or provide a token."
        )

    try:
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_token}",
        }

        prompt = f"""
        I'm reviewing an Evaluation Card in YAML format. Please analyze it for completeness,
        consistency, and clarity. Provide specific recommendations for improvement.

        Focus on:
        1. Sections that need more detail
        2. Inconsistencies or contradictions
        3. Clarity of language and explanations
        4. Alignment with best practices for ML evaluation

        Here's the YAML content:

        ```yaml
        {yaml_content}
        ```

        Provide your feedback in a structured format with specific, actionable recommendations.
        """

        payload = {
            "model": "llama-3.3-70b-versatile",
            "messages": [{"role": "user", "content": prompt}],
        }

        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=120,
        )

        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"]
        else:
            return f"Error getting Groq LLM feedback: {response.status_code} - {response.text}"

    except Exception as e:
        return f"Error getting Groq LLM feedback: {str(e)}"
|
|
|
|
|
def save_eval_card(yaml_content, paper_url="", repo_url=""): |
|
"""Save an eval card with additional metadata""" |
|
try: |
|
eval_data = yaml.safe_load(yaml_content) |
|
|
|
|
|
if paper_url: |
|
eval_data["metadata"]["paper_link"] = paper_url |
|
if repo_url: |
|
eval_data["metadata"]["repository_link"] = repo_url |
|
|
|
|
|
yaml_content = yaml.dump(eval_data) |
|
|
|
filename = re.sub(r"[^\w\-_]", "_", eval_data.get("title", "Unnamed")) |
|
filename = ( |
|
f"{filename}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml" |
|
) |
|
file_path = os.path.join(EVAL_CARDS_DIR, filename) |
|
|
|
with open(file_path, "w") as file: |
|
file.write(yaml_content) |
|
|
|
return f"Evaluation card saved successfully as {filename}", file_path |
|
except Exception as e: |
|
return f"Error saving evaluation card: {str(e)}", None |
|
|
|
|
|
def load_all_eval_cards():
    """Load all eval cards from the repository"""
    eval_cards = []

    for filename in os.listdir(EVAL_CARDS_DIR):
        if filename.endswith((".yaml", ".yml")):
            file_path = os.path.join(EVAL_CARDS_DIR, filename)
            try:
                with open(file_path, "r") as file:
                    yaml_content = file.read()
                    eval_data = yaml.safe_load(yaml_content)

                # compute_coverage_score already clips the total at 100.
                score, score_details = compute_coverage_score(eval_data)

                # Guard against cards with an empty metadata block.
                metadata = eval_data.get("metadata") or {}

                eval_cards.append(
                    {
                        "filename": filename,
                        "title": eval_data.get("title", "Unnamed Evaluation"),
                        "summary": eval_data.get("summary", ""),
                        "authors": ", ".join(metadata.get("authors", [])),
                        "creation_date": metadata.get("creation_date", ""),
                        "coverage_score": score,
                        "score_details": score_details,
                        "yaml_content": yaml_content,
                        "data": eval_data,
                    }
                )
            except Exception as e:
                print(f"Error loading {filename}: {str(e)}")

    return eval_cards
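# Each entry in the returned list mirrors this shape (values illustrative):
#     {"filename": "My_Eval__20240102_030405.yaml", "title": "My Eval!",
#      "authors": "A. Author", "coverage_score": 2.5, ...}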
|
|
|
|
|
def format_eval_card_as_html(eval_card):
    """Format an eval card as HTML for display"""
    # Guard against cards without a metadata block.
    metadata = eval_card["data"].get("metadata") or {}
    repo_link = metadata.get("repository_link", "")
    paper_link = metadata.get("paper_link", "")

    html = f"""
    <div style="border: 1px solid #ddd; padding: 15px; margin-bottom: 20px; border-radius: 5px;">
        <h3>{eval_card["title"]}</h3>
        <p>{eval_card["summary"]}</p>
        <p><strong>Authors:</strong> {eval_card["authors"]}</p>
        <p><strong>Created:</strong> {eval_card["creation_date"]}</p>

        <!-- Repository and paper links, shown only when available -->
        {f'<p><strong>Repository:</strong> <a href="{repo_link}" target="_blank">{repo_link}</a></p>' if repo_link else ""}
        {f'<p><strong>Paper:</strong> <a href="{paper_link}" target="_blank">{paper_link}</a></p>' if paper_link else ""}

        <p><strong>Coverage Score:</strong> {eval_card["coverage_score"]}%</p>

        <h4>Coverage by Section:</h4>
        <table style="width: 100%; border-collapse: collapse;">
            <tr>
                <th style="text-align: left; padding: 5px; border-bottom: 1px solid #ddd;">Section</th>
                <th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Score</th>
                <th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Completion</th>
            </tr>
    """

    for section, details in eval_card["score_details"].items():
        html += f"""
            <tr>
                <td style="padding: 5px; border-bottom: 1px solid #eee;">{section}</td>
                <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details["score"]}/{details["max_score"]}</td>
                <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details["completion_rate"]}%</td>
            </tr>
        """

    html += """
        </table>
        <div style="margin-top: 15px;">
            <!-- Additional actions can go here -->
        </div>
    </div>
    """

    return html
|
|
|
|
|
def create_eval_cards_table(eval_cards):
    """Create an HTML table of eval cards"""
    if not eval_cards:
        return "<p>No evaluation cards found.</p>"

    # Show the most complete cards first.
    eval_cards.sort(key=lambda x: x["coverage_score"], reverse=True)

    html = ""
    for eval_card in eval_cards:
        html += format_eval_card_as_html(eval_card)

    return html
|
|
|
|
|
def upload_file(file):
    """Process an uploaded YAML file"""
    if file is None:
        return "No file uploaded", None

    try:
        # Depending on the Gradio version, gr.File yields raw bytes, a tempfile
        # object with a .name attribute, or a plain file path.
        if isinstance(file, bytes):
            yaml_content = file.decode("utf-8")
        else:
            file_path = file.name if hasattr(file, "name") else file
            with open(file_path, "r") as f:
                yaml_content = f.read()

        eval_data = yaml.safe_load(yaml_content)
        return yaml_content, eval_data
    except Exception as e:
        return f"Error processing file: {str(e)}", None
|
|
|
|
|
def get_feedback(yaml_content):
    """Get LLM feedback on the eval card"""
    if not yaml_content:
        return "Please upload or paste a YAML file first."

    # Token resolution (including reading a .env file via load_dotenv) happens
    # inside get_llm_feedback, so don't pre-check os.environ here: doing so
    # would reject tokens that only live in a .env file.
    return get_llm_feedback(yaml_content)
|
|
|
|
|
def submit_eval_card(yaml_content, paper_url="", repo_url=""): |
|
"""Submit an eval card to the repository""" |
|
if not yaml_content: |
|
return "Please upload or paste a YAML file first.", None, None |
|
|
|
try: |
|
|
|
eval_data = yaml.safe_load(yaml_content) |
|
|
|
|
|
score, score_details = compute_coverage_score(eval_data) |
|
|
|
|
|
result, file_path = save_eval_card(yaml_content, paper_url, repo_url) |
|
|
|
if file_path: |
|
return ( |
|
f"Evaluation card saved successfully! Coverage score: {score}%", |
|
score, |
|
score_details, |
|
) |
|
else: |
|
return f"Error saving evaluation card: {result}", None, None |
|
|
|
except Exception as e: |
|
return f"Error processing evaluation card: {str(e)}", None, None |
|
|
|
|
|
def refresh_gallery():
    """Refresh the gallery of eval cards"""
    eval_cards = load_all_eval_cards()
    html = create_eval_cards_table(eval_cards)

    # Count how often each author appears across all cards, then keep only
    # the 20 most frequent authors in the table view.
    author_counts = {}
    for card in eval_cards:
        for author in card["authors"].split(", "):
            author_counts[author] = author_counts.get(author, 0) + 1

    top_authors = sorted(author_counts.items(), key=lambda x: x[1], reverse=True)[:20]
    top_authors = [author for author, count in top_authors]

    table_data = []
    for card in eval_cards:
        authors = card["authors"].split(", ")
        filtered_authors = [author for author in authors if author in top_authors]
        table_data.append(
            {
                "Title": card["title"],
                "Authors": ", ".join(filtered_authors),
                "Creation Date": card["creation_date"],
                "Coverage Score": f"{card['coverage_score']}%",
            }
        )

    df = pd.DataFrame(table_data)

    return html, df if not df.empty else None
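# e.g. cards with authors "A, B" and "A, C" yield author_counts
# {"A": 2, "B": 1, "C": 1}; with fewer than 20 distinct authors, nobody is
# filtered out of the table view.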
|
|
|
|
|
def handle_upload_tab(file_obj, yaml_text):
    """Handle upload tab actions: use the uploaded file if present, else the pasted text"""
    if file_obj is not None:
        yaml_content, _eval_data = upload_file(file_obj)
        return yaml_content
    else:
        return yaml_text
|
|
|
|
|
|
|
with gr.Blocks(title="Evaluation Cards Gallery") as app: |
|
with gr.Row(): |
|
with gr.Column(scale=2): |
|
gr.Markdown("# Evaluation Cards for Machine Learning in Materials Science. ") |
|
gr.Markdown(""" |
|
Upload your evaluation card in YAML format, get feedback from an LLM, and submit it to the gallery. |
|
checkout the [GitHub repository](https://github.com/lamalab-org/eval-cards) for more information. |
|
""") |
|
|
|
with gr.Tabs(): |
|
with gr.TabItem("Upload & Review"): |
|
with gr.Row(): |
|
with gr.Column(): |
|
file_upload = gr.File( |
|
label="Upload YAML File", file_types=[".yaml", ".yml"] |
|
) |
|
|
|
with gr.Accordion("Or paste YAML content", open=False): |
|
yaml_input = gr.TextArea( |
|
label="YAML Content", |
|
placeholder="Paste your YAML content here...", |
|
lines=10, |
|
) |
|
paper_url_input = gr.Textbox( |
|
label="Paper URL (Optional)", |
|
placeholder="https://arxiv.org/abs/...", |
|
) |
|
|
|
repo_url_input = gr.Textbox( |
|
label="Repository URL (Optional)", |
|
placeholder="https://github.com/...", |
|
) |
|
|
|
load_template_btn = gr.Button("Load Template") |
|
|
|
|
|
|
|
with gr.Row(): |
|
get_feedback_btn = gr.Button("Get LLM Feedback") |
|
submit_btn = gr.Button( |
|
"Submit Evaluation Card", variant="primary" |
|
) |
|
|
|
with gr.Column(): |
|
yaml_display = gr.TextArea(label="Current YAML", lines=20) |
|
|
|
with gr.Accordion("LLM Feedback", open=True): |
|
feedback_display = gr.Markdown() |
|
|
|
with gr.Accordion("Submission Result", open=True): |
|
result_display = gr.Markdown() |
|
coverage_score = gr.Number( |
|
label="Coverage Score", visible=False |
|
) |
|
coverage_details = gr.JSON( |
|
label="Coverage Details", visible=False |
|
) |
|
|
|
with gr.TabItem("Gallery"): |
|
refresh_btn = gr.Button("Refresh Gallery") |
|
|
|
with gr.Tabs(): |
|
with gr.TabItem("Card View"): |
|
gallery_html = gr.HTML() |
|
|
|
with gr.TabItem("Table View"): |
|
gallery_table = gr.DataFrame() |
|
|
|
|
|
load_template_btn.click(fn=load_template, outputs=[yaml_display]) |
|
|
|
file_upload.change( |
|
fn=handle_upload_tab, inputs=[file_upload, yaml_input], outputs=[yaml_display] |
|
) |
|
|
|
yaml_input.change(fn=lambda x: x, inputs=[yaml_input], outputs=[yaml_display]) |
|
|
|
get_feedback_btn.click( |
|
fn=get_feedback, inputs=[yaml_display], outputs=[feedback_display] |
|
) |
|
|
|
submit_btn.click( |
|
fn=submit_eval_card, |
|
inputs=[yaml_display, paper_url_input, repo_url_input], |
|
outputs=[result_display, coverage_score, coverage_details], |
|
) |
|
|
|
refresh_btn.click(fn=refresh_gallery, outputs=[gallery_html, gallery_table]) |
|
|
|
|
|
app.load(fn=refresh_gallery, outputs=[gallery_html, gallery_table]) |
|
|
|
|
|
if __name__ == "__main__": |
|
app.launch() |
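# Gradio's launch() also accepts networking options when needed, e.g.
# app.launch(server_name="0.0.0.0", server_port=7860) to serve on all
# interfaces (values shown are illustrative).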
|
|